github.com/matrixorigin/matrixone@v1.2.0/pkg/logservice/client.go (about)

     1  // Copyright 2021 - 2022 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package logservice
    16  
    17  import (
    18  	"context"
    19  	"math/rand"
    20  	"sync"
    21  	"time"
    22  
    23  	"go.uber.org/zap"
    24  
    25  	"github.com/cockroachdb/errors"
    26  	"github.com/lni/dragonboat/v4"
    27  	"github.com/matrixorigin/matrixone/pkg/common/moerr"
    28  	"github.com/matrixorigin/matrixone/pkg/common/morpc"
    29  	"github.com/matrixorigin/matrixone/pkg/common/mpool"
    30  	"github.com/matrixorigin/matrixone/pkg/logutil"
    31  	pb "github.com/matrixorigin/matrixone/pkg/pb/logservice"
    32  	"github.com/matrixorigin/matrixone/pkg/util/trace"
    33  )
    34  
const (
	// defaultWriteSocketSize is the buffer size, in bytes, handed to
	// morpc.WithCodecPayloadCopyBufferSize when the RPC codec is built in
	// getRPCClient.
	defaultWriteSocketSize = 64 * 1024
)
    38  
    39  // IsTempError returns a boolean value indicating whether the specified error
    40  // is a temp error that worth to be retried, e.g. timeouts, temp network
    41  // issues. Non-temp error caused by program logics rather than some external
    42  // factors.
    43  func IsTempError(err error) bool {
    44  	return isTempError(err)
    45  }
    46  
// ClientFactory is a function that creates a new Client, or returns an error
// when the client cannot be created.
type ClientFactory func() (Client, error)
    48  
// Client is the Log Service Client interface exposed to the DN.
type Client interface {
	// Close closes the client.
	Close() error
	// Config returns the configuration that was specified when the client was
	// created.
	Config() ClientConfig
	// GetLogRecord returns a new LogRecord instance whose Data field is large
	// enough to hold payloadLength bytes of payload. The layout of the Data
	// field is 4 bytes of record type (pb.UserEntryUpdate) + 8 bytes of TN
	// replica ID + payloadLength bytes of actual payload.
	GetLogRecord(payloadLength int) pb.LogRecord
	// Append appends the specified LogRecord to the Log Service. On success, the
	// assigned Lsn is returned. Only the Data field of the specified LogRecord
	// is used; all other fields are ignored by Append(). Once Append() has
	// returned, the pb.LogRecord can be reused.
	Append(ctx context.Context, rec pb.LogRecord) (Lsn, error)
	// Read reads the Log Service from the specified Lsn position until the
	// returned LogRecord set reaches the specified maxSize in bytes. The returned
	// Lsn indicates the next Lsn to use to resume the read; when it is equal to
	// the specified Lsn, everything available has been read.
	// The returned pb.LogRecord records have their Lsn and Type fields set:
	// the Lsn field is the Lsn assigned to the record, while the Type field tells
	// whether the record is an internal record generated by the Log Service
	// itself or one appended by the user.
	Read(ctx context.Context, firstLsn Lsn, maxSize uint64) ([]pb.LogRecord, Lsn, error)
	// Truncate truncates the Log Service log at the specified Lsn, with the Lsn
	// itself included. This allows the Log Service to free up storage capacity
	// for future appends. All future reads must start after the specified Lsn
	// position.
	Truncate(ctx context.Context, lsn Lsn) error
	// GetTruncatedLsn returns the largest Lsn value that has been specified for
	// truncation.
	GetTruncatedLsn(ctx context.Context) (Lsn, error)
	// GetTSOTimestamp requests a total of count unique timestamps from the TSO
	// and returns the first such assigned timestamp, that is, TSO timestamps
	// [returned value, returned value + count] will be owned by the caller.
	// NOTE(review): a closed interval of that form holds count+1 values; confirm
	// whether the upper bound is meant to be exclusive.
	GetTSOTimestamp(ctx context.Context, count uint64) (uint64, error)
}
    87  
// managedClient is the Client implementation returned by NewClient. It wraps
// a *client that is closed and dropped after a failed request (see
// resetClient) and re-created from cfg on the next call (see prepareClient).
// The struct holds no lock around the client field.
type managedClient struct {
	// cfg is the configuration used to create, and later re-create, client.
	cfg ClientConfig
	// client is the current connection; nil after resetClient until
	// prepareClient succeeds again.
	client *client
}

// Compile-time check that *managedClient satisfies Client.
var _ Client = (*managedClient)(nil)
    94  
    95  // NewClient creates a Log Service client. Each returned client can be used
    96  // to synchronously issue requests to the Log Service. To send multiple requests
    97  // to the Log Service in parallel, multiple clients should be created and used
    98  // to do so.
    99  func NewClient(ctx context.Context, cfg ClientConfig) (Client, error) {
   100  	if err := cfg.Validate(); err != nil {
   101  		return nil, err
   102  	}
   103  	client, err := newClient(ctx, cfg)
   104  	if err != nil {
   105  		return nil, err
   106  	}
   107  	return &managedClient{cfg: cfg, client: client}, nil
   108  }
   109  
   110  func (c *managedClient) Close() error {
   111  	if c.client != nil {
   112  		return c.client.close()
   113  	}
   114  	return nil
   115  }
   116  
// Config returns the ClientConfig the managed client was created with.
func (c *managedClient) Config() ClientConfig {
	return c.cfg
}
   120  
   121  func (c *managedClient) GetLogRecord(payloadLength int) pb.LogRecord {
   122  	data := make([]byte, headerSize+8+payloadLength)
   123  	binaryEnc.PutUint32(data, uint32(pb.UserEntryUpdate))
   124  	binaryEnc.PutUint64(data[headerSize:], c.cfg.TNReplicaID)
   125  	return pb.LogRecord{Data: data}
   126  }
   127  
   128  func (c *managedClient) Append(ctx context.Context, rec pb.LogRecord) (Lsn, error) {
   129  	for {
   130  		if err := c.prepareClient(ctx); err != nil {
   131  			return 0, err
   132  		}
   133  		v, err := c.client.append(ctx, rec)
   134  		if err != nil {
   135  			c.resetClient()
   136  		}
   137  		if c.isRetryableError(err) {
   138  			continue
   139  		}
   140  		return v, err
   141  	}
   142  }
   143  
   144  func (c *managedClient) Read(ctx context.Context,
   145  	firstLsn Lsn, maxSize uint64) ([]pb.LogRecord, Lsn, error) {
   146  	for {
   147  		if err := c.prepareClient(ctx); err != nil {
   148  			return nil, 0, err
   149  		}
   150  		recs, v, err := c.client.read(ctx, firstLsn, maxSize)
   151  		if err != nil {
   152  			c.resetClient()
   153  		}
   154  		if c.isRetryableError(err) {
   155  			continue
   156  		}
   157  		return recs, v, err
   158  	}
   159  }
   160  
   161  func (c *managedClient) Truncate(ctx context.Context, lsn Lsn) error {
   162  	for {
   163  		if err := c.prepareClient(ctx); err != nil {
   164  			return err
   165  		}
   166  		err := c.client.truncate(ctx, lsn)
   167  		if err != nil {
   168  			c.resetClient()
   169  		}
   170  		if c.isRetryableError(err) {
   171  			continue
   172  		}
   173  		return err
   174  	}
   175  }
   176  
   177  func (c *managedClient) GetTruncatedLsn(ctx context.Context) (Lsn, error) {
   178  	for {
   179  		if err := c.prepareClient(ctx); err != nil {
   180  			return 0, err
   181  		}
   182  		v, err := c.client.getTruncatedLsn(ctx)
   183  		if err != nil {
   184  			c.resetClient()
   185  		}
   186  		if c.isRetryableError(err) {
   187  			continue
   188  		}
   189  		return v, err
   190  	}
   191  }
   192  
   193  func (c *managedClient) GetTSOTimestamp(ctx context.Context, count uint64) (uint64, error) {
   194  	for {
   195  		if err := c.prepareClient(ctx); err != nil {
   196  			return 0, err
   197  		}
   198  		v, err := c.client.getTSOTimestamp(ctx, count)
   199  		if err != nil {
   200  			c.resetClient()
   201  		}
   202  		if c.isRetryableError(err) {
   203  			continue
   204  		}
   205  		return v, err
   206  	}
   207  }
   208  
   209  func (c *managedClient) isRetryableError(err error) bool {
   210  	/*
   211  		old code, obviously strange
   212  		if errors.Is(err, dragonboat.ErrTimeout) {
   213  			return false
   214  		}
   215  		return errors.Is(err, dragonboat.ErrShardNotFound)
   216  	*/
   217  
   218  	// Dragonboat error leaked here
   219  	if errors.Is(err, dragonboat.ErrShardNotFound) {
   220  		return true
   221  	}
   222  	return moerr.IsMoErrCode(err, moerr.ErrDragonboatShardNotFound)
   223  }
   224  
   225  func (c *managedClient) resetClient() {
   226  	if c.client != nil {
   227  		cc := c.client
   228  		c.client = nil
   229  		if err := cc.close(); err != nil {
   230  			logutil.Error("failed to close client", zap.Error(err))
   231  		}
   232  	}
   233  }
   234  
   235  func (c *managedClient) prepareClient(ctx context.Context) error {
   236  	if c.client != nil {
   237  		return nil
   238  	}
   239  	cc, err := newClient(ctx, c.cfg)
   240  	if err != nil {
   241  		return err
   242  	}
   243  	c.client = cc
   244  	return nil
   245  }
   246  
// client is a single connection to one Log Service address. It is created by
// connectToLogService and wrapped by managedClient.
type client struct {
	// cfg is the configuration the client was created with.
	cfg ClientConfig
	// client is the underlying morpc RPC client used to send requests.
	client morpc.RPCClient
	// addr is the Log Service address requests are sent to.
	addr string
	// pool supplies *RPCRequest instances.
	pool *sync.Pool
	// respPool supplies *RPCResponse instances; it is handed to getRPCClient
	// for decoding responses.
	respPool *sync.Pool
}
   254  
   255  func newClient(ctx context.Context, cfg ClientConfig) (*client, error) {
   256  	var c *client
   257  	var err error
   258  	// If the discovery address is configured, we used it first.
   259  	if len(cfg.DiscoveryAddress) > 0 {
   260  		c, err = connectToLogServiceByReverseProxy(ctx, cfg.DiscoveryAddress, cfg)
   261  		if c != nil && err == nil {
   262  			return c, nil
   263  		}
   264  	} else if len(cfg.ServiceAddresses) > 0 {
   265  		c, err = connectToLogService(ctx, cfg.ServiceAddresses, cfg)
   266  		if c != nil && err == nil {
   267  			return c, nil
   268  		}
   269  	}
   270  	if err != nil {
   271  		return nil, err
   272  	}
   273  	return nil, moerr.NewLogServiceNotReady(ctx)
   274  }
   275  
   276  func connectToLogServiceByReverseProxy(ctx context.Context,
   277  	discoveryAddress string, cfg ClientConfig) (*client, error) {
   278  	si, ok, err := GetShardInfo(discoveryAddress, cfg.LogShardID)
   279  	if err != nil {
   280  		return nil, err
   281  	}
   282  	if !ok {
   283  		return nil, moerr.NewLogServiceNotReady(ctx)
   284  	}
   285  	addresses := make([]string, 0)
   286  	leaderAddress, ok := si.Replicas[si.ReplicaID]
   287  	if ok {
   288  		addresses = append(addresses, leaderAddress)
   289  	}
   290  	for replicaID, address := range si.Replicas {
   291  		if replicaID != si.ReplicaID {
   292  			addresses = append(addresses, address)
   293  		}
   294  	}
   295  	return connectToLogService(ctx, addresses, cfg)
   296  }
   297  
   298  func connectToLogService(ctx context.Context,
   299  	targets []string, cfg ClientConfig) (*client, error) {
   300  	if len(targets) == 0 {
   301  		return nil, nil
   302  	}
   303  
   304  	pool := &sync.Pool{}
   305  	pool.New = func() interface{} {
   306  		return &RPCRequest{pool: pool}
   307  	}
   308  	respPool := &sync.Pool{}
   309  	respPool.New = func() interface{} {
   310  		return &RPCResponse{pool: respPool}
   311  	}
   312  	c := &client{
   313  		cfg:      cfg,
   314  		pool:     pool,
   315  		respPool: respPool,
   316  	}
   317  	var e error
   318  	addresses := append([]string{}, targets...)
   319  	rand.Shuffle(len(cfg.ServiceAddresses), func(i, j int) {
   320  		addresses[i], addresses[j] = addresses[j], addresses[i]
   321  	})
   322  	for _, addr := range addresses {
   323  		cc, err := getRPCClient(
   324  			ctx,
   325  			addr,
   326  			c.respPool,
   327  			c.cfg.MaxMessageSize,
   328  			cfg.EnableCompress,
   329  			0,
   330  			cfg.Tag,
   331  		)
   332  		if err != nil {
   333  			e = err
   334  			continue
   335  		}
   336  		c.addr = addr
   337  		c.client = cc
   338  		if cfg.ReadOnly {
   339  			if err := c.connectReadOnly(ctx); err == nil {
   340  				return c, nil
   341  			} else {
   342  				if err := c.close(); err != nil {
   343  					logutil.Error("failed to close the client", zap.Error(err))
   344  				}
   345  				e = err
   346  			}
   347  		} else {
   348  			// TODO: add a test to check whether it works when there is no truncated
   349  			// LSN known to the logservice.
   350  			if err := c.connectReadWrite(ctx); err == nil {
   351  				return c, nil
   352  			} else {
   353  				if err := c.close(); err != nil {
   354  					logutil.Error("failed to close the client", zap.Error(err))
   355  				}
   356  				e = err
   357  			}
   358  		}
   359  	}
   360  	return nil, e
   361  }
   362  
// close closes the underlying morpc RPC client and returns its error.
func (c *client) close() error {
	return c.client.Close()
}
   366  
   367  func (c *client) append(ctx context.Context, rec pb.LogRecord) (Lsn, error) {
   368  	if c.readOnly() {
   369  		return 0, moerr.NewInvalidInput(ctx, "incompatible client")
   370  	}
   371  	// TODO: check piggybacked hint on whether we are connected to the leader node
   372  	return c.doAppend(ctx, rec)
   373  }
   374  
// read forwards to doRead. Unlike append and truncate, it performs no
// read-only check.
func (c *client) read(ctx context.Context,
	firstLsn Lsn, maxSize uint64) ([]pb.LogRecord, Lsn, error) {
	return c.doRead(ctx, firstLsn, maxSize)
}
   379  
   380  func (c *client) truncate(ctx context.Context, lsn Lsn) error {
   381  	if c.readOnly() {
   382  		return moerr.NewInvalidInput(ctx, "incompatible client")
   383  	}
   384  	return c.doTruncate(ctx, lsn)
   385  }
   386  
// getTruncatedLsn forwards to doGetTruncatedLsn.
func (c *client) getTruncatedLsn(ctx context.Context) (Lsn, error) {
	return c.doGetTruncatedLsn(ctx)
}
   390  
// getTSOTimestamp forwards to tsoRequest with the requested count.
func (c *client) getTSOTimestamp(ctx context.Context, count uint64) (uint64, error) {
	return c.tsoRequest(ctx, count)
}
   394  
// readOnly reports whether the client was configured as read-only
// (cfg.ReadOnly).
func (c *client) readOnly() bool {
	return c.cfg.ReadOnly
}
   398  
   399  func (c *client) connectReadWrite(ctx context.Context) error {
   400  	if c.readOnly() {
   401  		panic(moerr.NewInvalidInput(ctx, "incompatible client"))
   402  	}
   403  	return c.connect(ctx, pb.CONNECT)
   404  }
   405  
// connectReadOnly sends a pb.CONNECT_RO request to the Log Service.
func (c *client) connectReadOnly(ctx context.Context) error {
	return c.connect(ctx, pb.CONNECT_RO)
}
   409  
   410  func (c *client) request(ctx context.Context,
   411  	mt pb.MethodType, payload []byte, lsn Lsn,
   412  	maxSize uint64) (pb.Response, []pb.LogRecord, error) {
   413  	ctx, span := trace.Debug(ctx, "client.request")
   414  	defer span.End()
   415  	req := pb.Request{
   416  		Method: mt,
   417  		LogRequest: pb.LogRequest{
   418  			ShardID: c.cfg.LogShardID,
   419  			TNID:    c.cfg.TNReplicaID,
   420  			Lsn:     lsn,
   421  			MaxSize: maxSize,
   422  		},
   423  	}
   424  	r := c.pool.Get().(*RPCRequest)
   425  	defer r.Release()
   426  	r.Request = req
   427  	r.payload = payload
   428  	future, err := c.client.Send(ctx, c.addr, r)
   429  	if err != nil {
   430  		return pb.Response{}, nil, err
   431  	}
   432  	defer future.Close()
   433  	msg, err := future.Get()
   434  	if err != nil {
   435  		return pb.Response{}, nil, err
   436  	}
   437  	response, ok := msg.(*RPCResponse)
   438  	if !ok {
   439  		panic("unexpected response type")
   440  	}
   441  	resp := response.Response
   442  	defer response.Release()
   443  	var recs pb.LogRecordResponse
   444  	if len(response.payload) > 0 {
   445  		MustUnmarshal(&recs, response.payload)
   446  	}
   447  	err = toError(ctx, response.Response)
   448  	if err != nil {
   449  		return pb.Response{}, nil, err
   450  	}
   451  	return resp, recs.Records, nil
   452  }
   453  
   454  func (c *client) tsoRequest(ctx context.Context, count uint64) (uint64, error) {
   455  	ctx, span := trace.Debug(ctx, "client.tsoRequest")
   456  	defer span.End()
   457  	req := pb.Request{
   458  		Method: pb.TSO_UPDATE,
   459  		TsoRequest: &pb.TsoRequest{
   460  			Count: count,
   461  		},
   462  	}
   463  	r := c.pool.Get().(*RPCRequest)
   464  	r.Request = req
   465  	future, err := c.client.Send(ctx, c.addr, r)
   466  	if err != nil {
   467  		return 0, err
   468  	}
   469  	defer future.Close()
   470  	msg, err := future.Get()
   471  	if err != nil {
   472  		return 0, err
   473  	}
   474  	response, ok := msg.(*RPCResponse)
   475  	if !ok {
   476  		panic("unexpected response type")
   477  	}
   478  	resp := response.Response
   479  	defer response.Release()
   480  	err = toError(ctx, response.Response)
   481  	if err != nil {
   482  		return 0, err
   483  	}
   484  	return resp.TsoResponse.Value, nil
   485  }
   486  
// connect issues a request of method type mt with no payload and with zero
// Lsn and MaxSize, and returns only the resulting error.
func (c *client) connect(ctx context.Context, mt pb.MethodType) error {
	_, _, err := c.request(ctx, mt, nil, 0, 0)
	return err
}
   491  
   492  func (c *client) doAppend(ctx context.Context, rec pb.LogRecord) (Lsn, error) {
   493  	resp, _, err := c.request(ctx, pb.APPEND, rec.Data, 0, 0)
   494  	if err != nil {
   495  		return 0, err
   496  	}
   497  	return resp.LogResponse.Lsn, nil
   498  }
   499  
   500  func (c *client) doRead(ctx context.Context,
   501  	firstLsn Lsn, maxSize uint64) ([]pb.LogRecord, Lsn, error) {
   502  	resp, recs, err := c.request(ctx, pb.READ, nil, firstLsn, maxSize)
   503  	if err != nil {
   504  		return nil, 0, err
   505  	}
   506  	return recs, resp.LogResponse.LastLsn, nil
   507  }
   508  
// doTruncate issues a pb.TRUNCATE request carrying lsn and returns only the
// resulting error.
func (c *client) doTruncate(ctx context.Context, lsn Lsn) error {
	_, _, err := c.request(ctx, pb.TRUNCATE, nil, lsn, 0)
	return err
}
   513  
   514  func (c *client) doGetTruncatedLsn(ctx context.Context) (Lsn, error) {
   515  	resp, _, err := c.request(ctx, pb.GET_TRUNCATE, nil, 0, 0)
   516  	if err != nil {
   517  		return 0, err
   518  	}
   519  	return resp.LogResponse.Lsn, nil
   520  }
   521  
   522  func getRPCClient(
   523  	ctx context.Context,
   524  	target string,
   525  	pool *sync.Pool,
   526  	maxMessageSize int,
   527  	enableCompress bool,
   528  	readTimeout time.Duration,
   529  	tag ...string) (morpc.RPCClient, error) {
   530  	mf := func() morpc.Message {
   531  		return pool.Get().(*RPCResponse)
   532  	}
   533  
   534  	// construct morpc.BackendOption
   535  	backendOpts := []morpc.BackendOption{
   536  		morpc.WithBackendConnectTimeout(time.Second),
   537  		morpc.WithBackendHasPayloadResponse(),
   538  		morpc.WithBackendLogger(logutil.GetGlobalLogger().Named("hakeeper-client-backend")),
   539  		morpc.WithBackendReadTimeout(readTimeout),
   540  	}
   541  	backendOpts = append(backendOpts, GetBackendOptions(ctx)...)
   542  
   543  	// construct morpc.ClientOption
   544  	clientOpts := []morpc.ClientOption{
   545  		morpc.WithClientInitBackends([]string{target}, []int{1}),
   546  		morpc.WithClientMaxBackendPerHost(1),
   547  		morpc.WithClientLogger(logutil.GetGlobalLogger()),
   548  	}
   549  	clientOpts = append(clientOpts, GetClientOptions(ctx)...)
   550  
   551  	var codecOpts []morpc.CodecOption
   552  	codecOpts = append(codecOpts,
   553  		morpc.WithCodecPayloadCopyBufferSize(defaultWriteSocketSize),
   554  		morpc.WithCodecEnableChecksum(),
   555  		morpc.WithCodecMaxBodySize(maxMessageSize))
   556  	if enableCompress {
   557  		mp, err := mpool.NewMPool("log_rpc_client", 0, mpool.NoFixed)
   558  		if err != nil {
   559  			return nil, err
   560  		}
   561  		codecOpts = append(codecOpts, morpc.WithCodecEnableCompress(mp))
   562  	}
   563  
   564  	// we set connection timeout to a constant value so if ctx's deadline is much
   565  	// larger, then we can ensure that all specified potential nodes have a chance
   566  	// to be attempted
   567  	codec := morpc.NewMessageCodec(mf, codecOpts...)
   568  	bf := morpc.NewGoettyBasedBackendFactory(codec, backendOpts...)
   569  	return morpc.NewClient("logservice-client", bf, clientOpts...)
   570  }