github.com/swiftstack/ProxyFS@v0.0.0-20210203235616-4017c267d62f/retryrpc/api.go (about) 1 // Copyright (c) 2015-2021, NVIDIA CORPORATION. 2 // SPDX-License-Identifier: Apache-2.0 3 4 package retryrpc 5 6 // Package retryrpc provides a client and server RPC model which survives 7 // lost connections on either the client or the server. 8 // 9 // NOTE: This package does handle cases where the server process dies. There 10 // are still gaps where a server may complete an RPC and die before returning 11 // a response. 12 13 import ( 14 "container/list" 15 "context" 16 "crypto/tls" 17 "crypto/x509" 18 "fmt" 19 "net" 20 "reflect" 21 "sync" 22 "time" 23 24 "github.com/google/btree" 25 "github.com/swiftstack/ProxyFS/bucketstats" 26 "github.com/swiftstack/ProxyFS/logger" 27 ) 28 29 // ServerCreds tracks the root CA and the 30 // server CA 31 type ServerCreds struct { 32 RootCAx509CertificatePEM []byte 33 serverTLSCertificate tls.Certificate 34 } 35 36 // Server tracks the state of the server 37 type Server struct { 38 sync.Mutex 39 completedLongTTL time.Duration // How long a completed request stays on queue 40 completedAckTrim time.Duration // How frequently trim requests acked by client 41 svrMap map[string]*methodArgs // Key: Method name 42 ipaddr string // IP address server listens too 43 port int // Port of server 44 netListener net.Listener 45 tlsListener net.Listener 46 47 halting bool 48 goroutineWG sync.WaitGroup // Used to track outstanding goroutines 49 connLock sync.Mutex 50 connections *list.List 51 connWG sync.WaitGroup 52 Creds *ServerCreds 53 listenersWG sync.WaitGroup 54 receiver reflect.Value // Package receiver being served 55 perClientInfo map[string]*clientInfo // Key: "clientID". Tracks clients 56 completedTickerDone chan bool 57 completedLongTicker *time.Ticker // Longer ~10 minute timer to trim 58 completedShortTicker *time.Ticker // Shorter ~100ms timer to trim known completed 59 deadlineIO time.Duration 60 keepAlivePeriod time.Duration 61 completedDoneWG sync.WaitGroup 62 dontStartTrimmers bool // Used for testing 63 } 64 65 // ServerConfig is used to configure a retryrpc Server 66 type ServerConfig struct { 67 LongTrim time.Duration // How long the results of an RPC are stored on a Server before removed 68 ShortTrim time.Duration // How frequently completed and ACKed RPCs results are removed from Server 69 IPAddr string // IP Address that Server uses to listen 70 Port int // Port that Server uses to listen 71 DeadlineIO time.Duration // How long I/Os on sockets wait even if idle 72 KeepAlivePeriod time.Duration // How frequently a KEEPALIVE is sent 73 dontStartTrimmers bool // Used for testing 74 } 75 76 // NewServer creates the Server object 77 func NewServer(config *ServerConfig) *Server { 78 var ( 79 err error 80 ) 81 server := &Server{ipaddr: config.IPAddr, port: config.Port, completedLongTTL: config.LongTrim, 82 completedAckTrim: config.ShortTrim, deadlineIO: config.DeadlineIO, 83 keepAlivePeriod: config.KeepAlivePeriod, dontStartTrimmers: config.dontStartTrimmers} 84 server.svrMap = make(map[string]*methodArgs) 85 server.perClientInfo = make(map[string]*clientInfo) 86 server.completedTickerDone = make(chan bool) 87 server.connections = list.New() 88 89 server.Creds, err = constructServerCreds(server.ipaddr) 90 if err != nil { 91 logger.Errorf("Construction of server credentials failed with err: %v", err) 92 panic(err) 93 } 94 95 return server 96 } 97 98 // Register creates the map of server methods 99 func (server *Server) Register(retrySvr interface{}) (err error) { 100 101 // Find all the methods associated with retrySvr and put into serviceMap 102 server.receiver = reflect.ValueOf(retrySvr) 103 return server.register(retrySvr) 104 } 105 106 // Start listener 107 func (server *Server) Start() (err error) { 108 portStr := fmt.Sprintf("%d", server.port) 109 hostPortStr := net.JoinHostPort(server.ipaddr, portStr) 110 111 tlsConfig := &tls.Config{ 112 Certificates: []tls.Certificate{server.Creds.serverTLSCertificate}, 113 } 114 115 listenConfig := &net.ListenConfig{KeepAlive: server.keepAlivePeriod} 116 server.netListener, err = listenConfig.Listen(context.Background(), "tcp", hostPortStr) 117 if nil != err { 118 err = fmt.Errorf("tls.Listen() failed: %v", err) 119 return 120 } 121 122 server.tlsListener = tls.NewListener(server.netListener, tlsConfig) 123 124 server.listenersWG.Add(1) 125 126 // Some of the unit tests disable starting trimmers 127 if !server.dontStartTrimmers { 128 // Start ticker which removes older completedRequests 129 server.completedLongTicker = time.NewTicker(server.completedLongTTL) 130 // Start ticker which removes requests already ACKed by client 131 server.completedShortTicker = time.NewTicker(server.completedAckTrim) 132 } 133 server.completedDoneWG.Add(1) 134 if !server.dontStartTrimmers { 135 go func() { 136 for { 137 select { 138 case <-server.completedTickerDone: 139 server.completedDoneWG.Done() 140 return 141 case tl := <-server.completedLongTicker.C: 142 server.trimCompleted(tl, true) 143 case ts := <-server.completedShortTicker.C: 144 server.trimCompleted(ts, false) 145 } 146 } 147 }() 148 } else { 149 go func() { 150 for { 151 select { 152 case <-server.completedTickerDone: 153 server.completedDoneWG.Done() 154 return 155 } 156 } 157 }() 158 } 159 160 return err 161 } 162 163 // Run server loop, accept connections, read request, run RPC method and 164 // return the results. 165 func (server *Server) Run() { 166 server.goroutineWG.Add(1) 167 go server.run() 168 } 169 170 // SendCallback sends a message to clientID so that clientID contacts 171 // the RPC server. 172 // 173 // The assumption is that this callback only gets called when the server has 174 // an async message for the client 175 // 176 // The message is "best effort" - if we fail to write on socket then the 177 // message is silently dropped on floor. 178 func (server *Server) SendCallback(clientID string, msg []byte) { 179 180 // TODO - what if client no longer in list of current clients? 181 var ( 182 localIOR ioReply 183 ) 184 server.Lock() 185 lci, ok := server.perClientInfo[clientID] 186 if !ok { 187 fmt.Printf("SERVER: SendCallback() - unable to find client UniqueID: %v\n", clientID) 188 server.Unlock() 189 return 190 } 191 server.Unlock() 192 193 lci.Lock() 194 currentCtx := lci.cCtx 195 lci.Unlock() 196 197 localIOR.JResult = msg 198 setupHdrReply(&localIOR, Upcall) 199 200 server.returnResults(&localIOR, currentCtx) 201 } 202 203 // Close stops the server 204 func (server *Server) Close() { 205 server.Lock() 206 server.halting = true 207 server.Unlock() 208 209 err := server.tlsListener.Close() 210 if err != nil { 211 logger.Errorf("server.tlsListener.Close() returned err: %v", err) 212 } 213 214 server.listenersWG.Wait() 215 216 server.goroutineWG.Wait() 217 218 // Now close the client sockets to wakeup them up 219 server.closeClientConn() 220 221 if !server.dontStartTrimmers { 222 server.completedLongTicker.Stop() 223 server.completedShortTicker.Stop() 224 } 225 server.completedTickerDone <- true 226 server.completedDoneWG.Wait() 227 228 // Cleanup bucketstats so that unit tests can run 229 for _, ci := range server.perClientInfo { 230 ci.Lock() 231 bucketstats.UnRegister("proxyfs.retryrpc", ci.myUniqueID) 232 ci.Unlock() 233 234 } 235 } 236 237 // CloseClientConn - This is debug code to cause some connections to be closed 238 // It is called from a stress test case to cause retransmits 239 func (server *Server) CloseClientConn() { 240 if server == nil { 241 return 242 } 243 server.connLock.Lock() 244 for c := server.connections.Front(); c != nil; c = c.Next() { 245 conn := c.Value.(net.Conn) 246 /* DEBUG code 247 fmt.Printf("SERVER - closing localaddr conn: %v remoteaddr: %v\n", conn.LocalAddr().String(), conn.RemoteAddr().String()) 248 */ 249 conn.Close() 250 } 251 server.connLock.Unlock() 252 } 253 254 // CompletedCnt returns count of pendingRequests 255 // 256 // This is only useful for testing. 257 func (server *Server) CompletedCnt() (totalCnt int) { 258 for _, v := range server.perClientInfo { 259 totalCnt += v.completedCnt() 260 } 261 return 262 } 263 264 // Client methods 265 type clientState int 266 267 const ( 268 // INITIAL means the Client struct has just been created 269 INITIAL clientState = iota + 1 270 // DISCONNECTED means the Client has lost the connection to the server 271 DISCONNECTED 272 // CONNECTED means the Client is connected to the server 273 CONNECTED 274 // RETRANSMITTING means a goroutine is in the middle of recovering 275 // from a loss of a connection with the server 276 RETRANSMITTING 277 ) 278 279 type connectionTracker struct { 280 state clientState 281 genNum uint64 // Generation number of tlsConn - avoid racing recoveries 282 tlsConfig *tls.Config 283 tlsConn *tls.Conn // Our connection to the server 284 x509CertPool *x509.CertPool 285 rootCAx509CertificatePEM []byte 286 hostPortStr string 287 } 288 289 // Client tracking structure 290 type Client struct { 291 sync.Mutex 292 halting bool 293 currentRequestID requestID // Last request ID - start from clock 294 // tick at mount and increment from there? 295 // Handle reset of time? 296 connection connectionTracker 297 myUniqueID string // Unique ID across all clients 298 cb interface{} // Callbacks to client 299 deadlineIO time.Duration 300 keepAlivePeriod time.Duration 301 outstandingRequest map[requestID]*reqCtx // Map of outstanding requests sent 302 // or to be sent to server. Key is assigned from currentRequestID 303 highestConsecutive requestID // Highest requestID that can be 304 // trimmed 305 bt *btree.BTree // btree of requestID's acked 306 goroutineWG sync.WaitGroup // Used to track outstanding goroutines 307 stats clientSideStatsInfo 308 } 309 310 // ClientCallbacks contains the methods required when supporting 311 // callbacks from the Server. 312 type ClientCallbacks interface { 313 Interrupt(payload []byte) 314 } 315 316 // ClientConfig is used to configure a retryrpc Client 317 type ClientConfig struct { 318 MyUniqueID string 319 IPAddr string // IP Address of Server 320 Port int // Port of Server 321 RootCAx509CertificatePEM []byte // Root certificate 322 Callbacks interface{} // Structure implementing ClientCallbacks 323 DeadlineIO time.Duration // How long I/Os on sockets wait even if idle 324 KeepAlivePeriod time.Duration // How frequently a KEEPALIVE is sent 325 } 326 327 // TODO - pass loggers to Cient and Server objects 328 329 // NewClient returns a Client structure 330 // 331 // If the server wants to send an async message to the client 332 // it uses the Interrupt method defined in cb 333 // 334 // NOTE: It is assumed that if a client calls NewClient(), it will 335 // always use a unique myUniqueID. Otherwise, the server may have 336 // old entries. 337 // 338 // TODO - purge cache of old entries on server and/or use different 339 // starting point for requestID. 340 func NewClient(config *ClientConfig) (client *Client, err error) { 341 342 client = &Client{myUniqueID: config.MyUniqueID, cb: config.Callbacks, 343 keepAlivePeriod: config.KeepAlivePeriod, deadlineIO: config.DeadlineIO} 344 portStr := fmt.Sprintf("%d", config.Port) 345 client.connection.state = INITIAL 346 client.connection.hostPortStr = net.JoinHostPort(config.IPAddr, portStr) 347 client.outstandingRequest = make(map[requestID]*reqCtx) 348 client.connection.x509CertPool = x509.NewCertPool() 349 client.bt = btree.New(2) 350 351 // Add cert for root CA to our pool 352 ok := client.connection.x509CertPool.AppendCertsFromPEM(config.RootCAx509CertificatePEM) 353 if !ok { 354 err = fmt.Errorf("x509CertPool.AppendCertsFromPEM() returned !ok") 355 return nil, err 356 } 357 358 bucketstats.Register("proxyfs.retryrpc", client.GetStatsGroupName(), &client.stats) 359 360 return client, err 361 } 362 363 // Send the request and block until it has completed 364 func (client *Client) Send(method string, request interface{}, reply interface{}) (err error) { 365 366 return client.send(method, request, reply) 367 } 368 369 // GetStatsGroupName returns the bucketstats GroupName for this client 370 func (client *Client) GetStatsGroupName() (s string) { 371 372 return clientSideGroupPrefix + client.myUniqueID 373 } 374 375 // Close gracefully shuts down the client 376 func (client *Client) Close() { 377 // Set halting flag and then close our socket to server. 378 // This will cause the blocked getIO() in readReplies() to return. 379 client.Lock() 380 client.halting = true 381 if client.connection.state == CONNECTED { 382 client.connection.state = INITIAL 383 client.connection.tlsConn.Close() 384 } 385 client.Unlock() 386 387 // Wait for the goroutines to return 388 client.goroutineWG.Wait() 389 bucketstats.UnRegister("proxyfs.retryrpc", client.GetStatsGroupName()) 390 391 }