github.com/square/finch@v0.0.0-20240412205204-6530c03e2b96/compute/client.go (about)

     1  // Copyright 2023 Block, Inc.
     2  
     3  package compute
     4  
     5  import (
     6  	"context"
     7  	"encoding/json"
     8  	"fmt"
     9  	"log"
    10  	"net/http"
    11  	"os"
    12  	"path/filepath"
    13  	"strings"
    14  	"time"
    15  
    16  	"github.com/square/finch"
    17  	"github.com/square/finch/config"
    18  	"github.com/square/finch/data"
    19  	"github.com/square/finch/proto"
    20  	"github.com/square/finch/stage"
    21  	"github.com/square/finch/stats"
    22  )
    23  
// Client is a remote Instance that sends everything to the --server specified
// on the command line. The client handles client-server communication, and it
// runs each stage locally through the stage package once the server has
// supplied the stage config and trx files.
type Client struct {
	name string // client name; given to proto.NewClient and stats.NewCollector, and reported to the server
	addr string // server address, always prefixed with "http://" and without a trailing "/"
	// --
	gds    *data.Scope   // data scope handed to every stage; Reset is called at the start of Run
	client *proto.Client // client used for all requests to the server
}
    34  
    35  func NewClient(name, addr string) *Client {
    36  	if !strings.HasPrefix(addr, "http://") {
    37  		addr = "http://" + addr
    38  	}
    39  
    40  	return &Client{
    41  		name: name,
    42  		addr: strings.TrimSuffix(addr, "/"),
    43  		// --
    44  		gds:    data.NewScope(),
    45  		client: proto.NewClient(name, addr),
    46  	}
    47  }
    48  
    49  func (c *Client) Run(ctxFinch context.Context) error {
    50  	//for {
    51  	c.gds.Reset() // keep data from globally-scoped generators; delete the rest
    52  	if err := c.run(ctxFinch); err != nil {
    53  		if ctxFinch.Err() != nil {
    54  			return nil
    55  		}
    56  		log.Println(err)
    57  		time.Sleep(2 * time.Second) // prevent uncontrolled error loop
    58  	}
    59  	//}
    60  	return nil
    61  }
    62  
    63  func (c *Client) run(ctxFinch context.Context) error {
    64  	// ------------------------------------------------------------------
    65  	// Fetch stage fails (wait for GET /boot to return)
    66  	var cfg config.Stage
    67  	log.Printf("Waiting to boot from %s...", c.addr)
    68  	c.client.PrintErrors = false
    69  	_, body, err := c.client.Get(ctxFinch, "/boot", nil, proto.R{2 * time.Second, 1 * time.Second, -1})
    70  	if err != nil {
    71  		return err
    72  	}
    73  	c.client.PrintErrors = true
    74  	if err := json.Unmarshal(body, &cfg); err != nil {
    75  		return fmt.Errorf("cannot decode stage config file from server: %s", err)
    76  	}
    77  	stageName := cfg.Name
    78  	c.client.StageId = cfg.Id
    79  	defer func() { c.client.StageId = "" }()
    80  	fmt.Printf("#\n# %s (%s)\n#\n", stageName, cfg.Id)
    81  
    82  	// ----------------------------------------------------------------------
    83  	// Fetch all stage and trx files from server, put in local temp dir
    84  	tmpdir, err := os.MkdirTemp("", "finch")
    85  	if err != nil {
    86  		return fmt.Errorf("cannot make temp dir: %s", err)
    87  	}
    88  	if !finch.Debugging {
    89  		defer os.RemoveAll(tmpdir)
    90  	}
    91  	finch.Debug("tmp dir: %s", tmpdir)
    92  	if err := c.getTrxFiles(ctxFinch, cfg, tmpdir); err != nil {
    93  		return err
    94  	}
    95  
    96  	// ------------------------------------------------------------------
    97  	// Local boot and ack
    98  	for k := range cfg.Stats.Report {
    99  		if k == "stdout" {
   100  			continue
   101  		}
   102  		delete(cfg.Stats.Report, k)
   103  	}
   104  	cfg.Stats.Report["server"] = map[string]string{
   105  		"server":   c.addr,
   106  		"client":   c.name,
   107  		"stage-id": c.client.StageId,
   108  	}
   109  	stats, err := stats.NewCollector(cfg.Stats, c.name, 1)
   110  	if err != nil {
   111  		return err
   112  	}
   113  
   114  	log.Printf("[%s] Booting", stageName)
   115  	local := stage.New(cfg, c.gds, stats)
   116  	if err := local.Prepare(ctxFinch); err != nil {
   117  		log.Printf("[%s] Boot error, notifying server: %s", stageName, err)
   118  		c.client.Send(ctxFinch, "/boot", err.Error(), proto.R{500 * time.Millisecond, 100 * time.Millisecond, 3}) // don't care if this fails
   119  		return err                                                                                                // return original error not Send error
   120  	}
   121  
   122  	// Boot ack; don't continue on error because we're no longer in sync with server
   123  	log.Printf("[%s] Boot successful, notifying server", stageName)
   124  	if err := c.client.Send(ctxFinch, "/boot", nil, proto.R{1 * time.Second, 100 * time.Millisecond, 10}); err != nil {
   125  		log.Printf("[%s] Sending book ack to server failed: %s", stageName, err)
   126  		return err
   127  	}
   128  
   129  	// ----------------------------------------------------------------------
   130  	// Wait for run signal. This might be a little while if server is for
   131  	// other remote instances.
   132  	log.Printf("[%s] Waiting for run signal", stageName)
   133  	resp, _, err := c.client.Get(ctxFinch, "/run", nil, proto.R{60 * time.Second, 100 * time.Millisecond, 3})
   134  	if err != nil {
   135  		log.Printf("[%s] Timeout waiting for run signal after successful boot, giving up (is the server offline?)", stageName)
   136  		return err
   137  	}
   138  	if resp.StatusCode == http.StatusResetContent {
   139  		log.Printf("[%s] Boot test successful", stageName)
   140  		return nil
   141  	}
   142  
   143  	// ----------------------------------------------------------------------
   144  	// Local run and ack
   145  	ctxRun, cancelRun := context.WithCancel(ctxFinch)
   146  	doneChan := make(chan struct{})
   147  	defer close(doneChan)
   148  	lostServer := false
   149  	stageDone := false
   150  	go func() {
   151  		defer cancelRun()
   152  		for {
   153  			time.Sleep(1 * time.Second)
   154  			select {
   155  			case <-doneChan:
   156  				finch.Debug("stop check goroutine stopped")
   157  				return
   158  			default:
   159  			}
   160  			resp, _, err := c.client.Get(ctxFinch, "/ping", nil, proto.R{500 * time.Millisecond, 100 * time.Millisecond, 5})
   161  			if err != nil {
   162  				log.Printf("[%s] Lost contact with server while running, aborting", stageName)
   163  				lostServer = true
   164  				return
   165  			}
   166  			if resp.StatusCode == http.StatusResetContent {
   167  				stageDone = true
   168  				log.Printf("[%s] Server stopped stage", stageName)
   169  				return
   170  			}
   171  		}
   172  	}()
   173  
   174  	local.Run(ctxRun)
   175  	log.Printf("[%s] Run stopped: %v (lost server:%v stage stopped:%v); sending done signal to server (5s timeout)", stageName, err, lostServer, stageDone)
   176  
   177  	// Run ack; ok if this fails because we're done, nothing left to sync with server
   178  	ctxDone, ctxCancel := context.WithTimeout(context.Background(), 5*time.Second)
   179  	defer ctxCancel()
   180  	if err := c.client.Send(ctxDone, "/run", err, proto.R{500 * time.Millisecond, 100 * time.Millisecond, 3}); err != nil {
   181  		log.Printf("[%s] Sending done signal to server failed, ignoring: %s", stageName, err)
   182  	}
   183  
   184  	return nil
   185  }
   186  
   187  func (c *Client) getTrxFiles(ctxFinch context.Context, cfg config.Stage, tmpdir string) error {
   188  	trx := cfg.Trx
   189  	for i := range trx {
   190  		if config.FileExists(trx[i].File) {
   191  			log.Printf("Have local stage %s file %s; not fetching from server", cfg.Name, trx[i].File)
   192  			continue
   193  		}
   194  		log.Printf("Fetching stage %s file %s...", cfg.Name, trx[i].File)
   195  		ref := [][]string{
   196  			{"stage", cfg.Name},
   197  			{"i", fmt.Sprintf("%d", i)},
   198  		}
   199  		resp, body, err := c.client.Get(ctxFinch, "/file", ref, proto.R{5 * time.Second, 100 * time.Millisecond, 3})
   200  		if err != nil {
   201  			return err // Get retries so error is final
   202  		}
   203  		finch.Debug("%+v", resp)
   204  
   205  		filename := filepath.Join(tmpdir, filepath.Base(trx[i].File))
   206  		f, err := os.OpenFile(filename, os.O_RDWR|os.O_CREATE, 0440)
   207  		if err != nil {
   208  			return err
   209  		}
   210  		if _, err := f.Write(body); err != nil {
   211  			return err
   212  		}
   213  		if err := f.Close(); err != nil {
   214  			return err
   215  		}
   216  		finch.Debug("wrote %s", filename)
   217  		trx[i].File = filename
   218  	}
   219  	return nil
   220  }