github.com/whamcloud/lemur@v0.0.0-20190827193804-4655df8a52af/cmd/lhsmd/agent/agent.go (about) 1 // Copyright (c) 2018 DDN. All rights reserved. 2 // Use of this source code is governed by a MIT-style 3 // license that can be found in the LICENSE file. 4 5 /* 6 Package agent implements a Parallel Data Mover to copy or migrate data between 7 various storage systems. It supports multliple types of sources and 8 destinations, including POSIX, S3, HPSS, etc. 9 10 Use cases include: 11 * Data movement for Lustre HSM. 12 * Offsite replication for DR 13 * Lustre file-level replication 14 * Storage rebalancing within a single tier 15 * Migration between filesytems (e.g GPFS - > Lustre) 16 17 Initially the main focus is for HSM. 18 */ 19 package agent 20 21 import ( 22 "fmt" 23 "sync" 24 "time" 25 26 "github.com/pkg/errors" 27 28 "golang.org/x/net/context" 29 "golang.org/x/sys/unix" 30 31 "github.com/intel-hpdd/lemur/pkg/fsroot" 32 "github.com/intel-hpdd/logging/alert" 33 "github.com/intel-hpdd/logging/debug" 34 "github.com/intel-hpdd/go-lustre/fs" 35 "github.com/intel-hpdd/go-lustre/hsm" 36 "github.com/intel-hpdd/go-lustre/llapi" 37 ) 38 39 type ( 40 // HsmAgent for a single filesytem and a collection of backends. 41 HsmAgent struct { 42 config *Config 43 client fsroot.Client 44 stats *ActionStats 45 wg sync.WaitGroup 46 Endpoints *Endpoints 47 mu sync.Mutex // Protect the agent 48 actionSource hsm.ActionSource 49 monitor *PluginMonitor 50 cancelFunc context.CancelFunc 51 rpcsInFlight chan struct{} // Buffered channel to throttle rpcs in flight 52 startComplete chan struct{} // Closed when agent startup is completed 53 stopComplete chan struct{} // Closed when agent shutdown is completed 54 } 55 56 // Transport for backend plugins 57 Transport interface { 58 Init(*Config, *HsmAgent) error 59 Shutdown() 60 } 61 ) 62 63 // New accepts a config and returns a *HsmAgent 64 func New(cfg *Config, client fsroot.Client, as hsm.ActionSource) (*HsmAgent, error) { 65 ct := &HsmAgent{ 66 config: cfg, 67 client: client, 68 rpcsInFlight: make(chan struct{}, cfg.Processes*10), 69 stats: NewActionStats(), 70 monitor: NewMonitor(), 71 actionSource: as, 72 Endpoints: NewEndpoints(), 73 startComplete: make(chan struct{}), 74 stopComplete: make(chan struct{}), 75 } 76 77 return ct, nil 78 } 79 80 // Start backgrounds the agent and starts backend data movers 81 func (ct *HsmAgent) Start(ctx context.Context) error { 82 ct.mu.Lock() 83 ctx, ct.cancelFunc = context.WithCancel(ctx) 84 ct.mu.Unlock() 85 ct.stats.Start(ctx) 86 87 if t, ok := transports[ct.config.Transport.Type]; ok { 88 if err := t.Init(ct.config, ct); err != nil { 89 return errors.Wrapf(err, "transport %q initialize failed", ct.config.Transport.Type) 90 } 91 } else { 92 return errors.Errorf("unknown transport type in configuration: %s", ct.config.Transport.Type) 93 } 94 95 if err := ct.actionSource.Start(ctx); err != nil { 96 return errors.Wrap(err, "initializing HSM agent connection") 97 } 98 99 for i := 0; i < ct.config.Processes; i++ { 100 ct.addHandler(fmt.Sprintf("handler-%d", i)) 101 } 102 103 ct.monitor.Start(ctx) 104 for _, pluginConf := range ct.config.Plugins() { 105 err := ct.monitor.StartPlugin(pluginConf) 106 if err != nil { 107 return errors.Wrapf(err, "creating plugin %q", pluginConf.Name) 108 } 109 } 110 close(ct.startComplete) 111 ct.wg.Wait() 112 close(ct.stopComplete) 113 return nil 114 } 115 116 // Stop shuts down all backend data movers and kills the agent 117 func (ct *HsmAgent) Stop() { 118 ct.mu.Lock() 119 ct.cancelFunc() 120 ct.mu.Unlock() 121 transports[ct.config.Transport.Type].Shutdown() 122 <-ct.stopComplete 123 } 124 125 // StartWaitFor will wait for Agent to startup with time out of n. 126 func (ct *HsmAgent) StartWaitFor(n time.Duration) error { 127 select { 128 case <-ct.startComplete: 129 return nil 130 case <-time.After(n): 131 return errors.Errorf("Agent startup timed out after %v", n) 132 } 133 134 } 135 136 // Root returns a fs.RootDir representing the Lustre filesystem root 137 func (ct *HsmAgent) Root() fs.RootDir { 138 return ct.client.Root() 139 } 140 141 func (ct *HsmAgent) newAction(aih hsm.ActionHandle) *Action { 142 return &Action{ 143 id: NextActionID(), 144 aih: aih, 145 start: time.Now(), 146 agent: ct, 147 } 148 } 149 150 func (ct *HsmAgent) handleActions(tag string) { 151 for ai := range ct.actionSource.Actions() { 152 debug.Printf("%s: incoming: %s", tag, ai) 153 // AFAICT, this is how the copytool is expected to handle cancels. 154 if ai.Action() == llapi.HsmActionCancel { 155 ai.FailImmediately(int(unix.ENOSYS)) 156 // TODO: send out of band cancel message to the mover 157 continue 158 } 159 aih, err := ai.Begin(0, false) 160 if err != nil { 161 alert.Warnf("%s: begin failed: %v: %s", tag, err, ai) 162 ai.FailImmediately(int(unix.EIO)) 163 continue 164 } 165 action := ct.newAction(aih) 166 ct.rpcsInFlight <- struct{}{} 167 ct.stats.StartAction(action) 168 action.Prepare() 169 if e, ok := ct.Endpoints.Get(uint32(aih.ArchiveID())); ok { 170 debug.Printf("%s: id:%d new %s %x %v", tag, action.id, 171 action.aih.Action(), 172 action.aih.Cookie(), 173 action.aih.Fid()) 174 e.Send(action) 175 } else { 176 alert.Warnf("no handler for archive %d", aih.ArchiveID()) 177 action.Fail(-1) 178 ct.stats.CompleteAction(action, -1) 179 } 180 } 181 } 182 183 func (ct *HsmAgent) addHandler(tag string) { 184 ct.wg.Add(1) 185 go func() { 186 ct.handleActions(tag) 187 ct.wg.Done() 188 }() 189 } 190 191 var transports = map[string]Transport{} 192 193 // RegisterTransport registers the transport in the list of known transports 194 func RegisterTransport(name string, t Transport) { 195 transports[name] = t 196 }