github.com/whamcloud/lemur@v0.0.0-20190827193804-4655df8a52af/cmd/lhsmd/agent/agent.go (about)

     1  // Copyright (c) 2018 DDN. All rights reserved.
     2  // Use of this source code is governed by a MIT-style
     3  // license that can be found in the LICENSE file.
     4  
     5  /*
     6  Package agent implements a Parallel Data Mover to copy or migrate data between
     7  various storage systems. It supports multiple types of sources and
     8  destinations, including POSIX, S3, HPSS, etc.
     9  
    10  Use cases include:
    11    * Data movement for Lustre HSM.
    12    * Offsite replication for DR
    13    * Lustre file-level replication
    14    * Storage rebalancing within a single tier
    15    * Migration between filesystems (e.g. GPFS -> Lustre)
    16  
    17  Initially the main focus is for HSM.
    18  */
    19  package agent
    20  
    21  import (
    22  	"fmt"
    23  	"sync"
    24  	"time"
    25  
    26  	"github.com/pkg/errors"
    27  
    28  	"golang.org/x/net/context"
    29  	"golang.org/x/sys/unix"
    30  
    31  	"github.com/intel-hpdd/lemur/pkg/fsroot"
    32  	"github.com/intel-hpdd/logging/alert"
    33  	"github.com/intel-hpdd/logging/debug"
    34  	"github.com/intel-hpdd/go-lustre/fs"
    35  	"github.com/intel-hpdd/go-lustre/hsm"
    36  	"github.com/intel-hpdd/go-lustre/llapi"
    37  )
    38  
    39  type (
    40  	// HsmAgent for a single filesytem and a collection of backends.
    41  	HsmAgent struct {
    42  		config        *Config
    43  		client        fsroot.Client
    44  		stats         *ActionStats
    45  		wg            sync.WaitGroup
    46  		Endpoints     *Endpoints
    47  		mu            sync.Mutex // Protect the agent
    48  		actionSource  hsm.ActionSource
    49  		monitor       *PluginMonitor
    50  		cancelFunc    context.CancelFunc
    51  		rpcsInFlight  chan struct{} // Buffered channel to throttle rpcs in flight
    52  		startComplete chan struct{} // Closed when agent startup is completed
    53  		stopComplete  chan struct{} // Closed when agent shutdown is completed
    54  	}
    55  
    56  	// Transport for backend plugins
    57  	Transport interface {
    58  		Init(*Config, *HsmAgent) error
    59  		Shutdown()
    60  	}
    61  )
    62  
    63  // New accepts a config and returns a *HsmAgent
    64  func New(cfg *Config, client fsroot.Client, as hsm.ActionSource) (*HsmAgent, error) {
    65  	ct := &HsmAgent{
    66  		config:        cfg,
    67  		client:        client,
    68  		rpcsInFlight:  make(chan struct{}, cfg.Processes*10),
    69  		stats:         NewActionStats(),
    70  		monitor:       NewMonitor(),
    71  		actionSource:  as,
    72  		Endpoints:     NewEndpoints(),
    73  		startComplete: make(chan struct{}),
    74  		stopComplete:  make(chan struct{}),
    75  	}
    76  
    77  	return ct, nil
    78  }
    79  
    80  // Start backgrounds the agent and starts backend data movers
    81  func (ct *HsmAgent) Start(ctx context.Context) error {
    82  	ct.mu.Lock()
    83  	ctx, ct.cancelFunc = context.WithCancel(ctx)
    84  	ct.mu.Unlock()
    85  	ct.stats.Start(ctx)
    86  
    87  	if t, ok := transports[ct.config.Transport.Type]; ok {
    88  		if err := t.Init(ct.config, ct); err != nil {
    89  			return errors.Wrapf(err, "transport %q initialize failed", ct.config.Transport.Type)
    90  		}
    91  	} else {
    92  		return errors.Errorf("unknown transport type in configuration: %s", ct.config.Transport.Type)
    93  	}
    94  
    95  	if err := ct.actionSource.Start(ctx); err != nil {
    96  		return errors.Wrap(err, "initializing HSM agent connection")
    97  	}
    98  
    99  	for i := 0; i < ct.config.Processes; i++ {
   100  		ct.addHandler(fmt.Sprintf("handler-%d", i))
   101  	}
   102  
   103  	ct.monitor.Start(ctx)
   104  	for _, pluginConf := range ct.config.Plugins() {
   105  		err := ct.monitor.StartPlugin(pluginConf)
   106  		if err != nil {
   107  			return errors.Wrapf(err, "creating plugin %q", pluginConf.Name)
   108  		}
   109  	}
   110  	close(ct.startComplete)
   111  	ct.wg.Wait()
   112  	close(ct.stopComplete)
   113  	return nil
   114  }
   115  
   116  // Stop shuts down all backend data movers and kills the agent
   117  func (ct *HsmAgent) Stop() {
   118  	ct.mu.Lock()
   119  	ct.cancelFunc()
   120  	ct.mu.Unlock()
   121  	transports[ct.config.Transport.Type].Shutdown()
   122  	<-ct.stopComplete
   123  }
   124  
   125  // StartWaitFor will wait for Agent to startup with time out of n.
   126  func (ct *HsmAgent) StartWaitFor(n time.Duration) error {
   127  	select {
   128  	case <-ct.startComplete:
   129  		return nil
   130  	case <-time.After(n):
   131  		return errors.Errorf("Agent startup timed out after %v", n)
   132  	}
   133  
   134  }
   135  
   136  // Root returns a fs.RootDir representing the Lustre filesystem root
   137  func (ct *HsmAgent) Root() fs.RootDir {
   138  	return ct.client.Root()
   139  }
   140  
   141  func (ct *HsmAgent) newAction(aih hsm.ActionHandle) *Action {
   142  	return &Action{
   143  		id:    NextActionID(),
   144  		aih:   aih,
   145  		start: time.Now(),
   146  		agent: ct,
   147  	}
   148  }
   149  
   150  func (ct *HsmAgent) handleActions(tag string) {
   151  	for ai := range ct.actionSource.Actions() {
   152  		debug.Printf("%s: incoming: %s", tag, ai)
   153  		// AFAICT, this is how the copytool is expected to handle cancels.
   154  		if ai.Action() == llapi.HsmActionCancel {
   155  			ai.FailImmediately(int(unix.ENOSYS))
   156  			// TODO: send out of band cancel message to the mover
   157  			continue
   158  		}
   159  		aih, err := ai.Begin(0, false)
   160  		if err != nil {
   161  			alert.Warnf("%s: begin failed: %v: %s", tag, err, ai)
   162  			ai.FailImmediately(int(unix.EIO))
   163  			continue
   164  		}
   165  		action := ct.newAction(aih)
   166  		ct.rpcsInFlight <- struct{}{}
   167  		ct.stats.StartAction(action)
   168  		action.Prepare()
   169  		if e, ok := ct.Endpoints.Get(uint32(aih.ArchiveID())); ok {
   170  			debug.Printf("%s: id:%d new %s %x %v", tag, action.id,
   171  				action.aih.Action(),
   172  				action.aih.Cookie(),
   173  				action.aih.Fid())
   174  			e.Send(action)
   175  		} else {
   176  			alert.Warnf("no handler for archive %d", aih.ArchiveID())
   177  			action.Fail(-1)
   178  			ct.stats.CompleteAction(action, -1)
   179  		}
   180  	}
   181  }
   182  
   183  func (ct *HsmAgent) addHandler(tag string) {
   184  	ct.wg.Add(1)
   185  	go func() {
   186  		ct.handleActions(tag)
   187  		ct.wg.Done()
   188  	}()
   189  }
   190  
   191  var transports = map[string]Transport{}
   192  
   193  // RegisterTransport registers the transport in the list of known transports
   194  func RegisterTransport(name string, t Transport) {
   195  	transports[name] = t
   196  }