github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/client/devicemanager/manager.go (about) 1 // Package devicemanager is used to manage device plugins 2 package devicemanager 3 4 import ( 5 "context" 6 "fmt" 7 "sync" 8 "time" 9 10 log "github.com/hashicorp/go-hclog" 11 multierror "github.com/hashicorp/go-multierror" 12 plugin "github.com/hashicorp/go-plugin" 13 "github.com/hashicorp/nomad/client/devicemanager/state" 14 "github.com/hashicorp/nomad/client/pluginmanager" 15 "github.com/hashicorp/nomad/helper/pluginutils/loader" 16 "github.com/hashicorp/nomad/nomad/structs" 17 "github.com/hashicorp/nomad/plugins/base" 18 "github.com/hashicorp/nomad/plugins/device" 19 pstructs "github.com/hashicorp/nomad/plugins/shared/structs" 20 ) 21 22 // Manager is the interface used to manage device plugins 23 type Manager interface { 24 pluginmanager.PluginManager 25 26 // Reserve is used to reserve a set of devices 27 Reserve(d *structs.AllocatedDeviceResource) (*device.ContainerReservation, error) 28 29 // AllStats is used to retrieve all the latest statistics for all devices. 30 AllStats() []*device.DeviceGroupStats 31 32 // DeviceStats returns the device statistics for the given device. 33 DeviceStats(d *structs.AllocatedDeviceResource) (*device.DeviceGroupStats, error) 34 } 35 36 // StateStorage is used to persist the device managers state across 37 // agent restarts. 38 type StateStorage interface { 39 // GetDevicePluginState is used to retrieve the device manager's plugin 40 // state. 41 GetDevicePluginState() (*state.PluginState, error) 42 43 // PutDevicePluginState is used to store the device manager's plugin 44 // state. 45 PutDevicePluginState(state *state.PluginState) error 46 } 47 48 // UpdateNodeDevices is a callback for updating the set of devices on a node. 49 type UpdateNodeDevicesFn func(devices []*structs.NodeDeviceResource) 50 51 // StorePluginReattachFn is used to store plugin reattachment configurations. 52 type StorePluginReattachFn func(*plugin.ReattachConfig) error 53 54 // Config is used to configure a device manager 55 type Config struct { 56 // Logger is the logger used by the device manager 57 Logger log.Logger 58 59 // Loader is the plugin loader 60 Loader loader.PluginCatalog 61 62 // PluginConfig is the config passed to the launched plugins 63 PluginConfig *base.AgentConfig 64 65 // Updater is used to update the node when device information changes 66 Updater UpdateNodeDevicesFn 67 68 // StatsInterval is the interval at which to collect statistics 69 StatsInterval time.Duration 70 71 // State is used to manage the device managers state 72 State StateStorage 73 } 74 75 // manager is used to manage a set of device plugins 76 type manager struct { 77 // logger is the logger used by the device manager 78 logger log.Logger 79 80 // state is used to manage the device managers state 81 state StateStorage 82 83 // ctx is used to shutdown the device manager 84 ctx context.Context 85 cancel context.CancelFunc 86 87 // loader is the plugin loader 88 loader loader.PluginCatalog 89 90 // pluginConfig is the config passed to the launched plugins 91 pluginConfig *base.AgentConfig 92 93 // updater is used to update the node when device information changes 94 updater UpdateNodeDevicesFn 95 96 // statsInterval is the duration at which to collect statistics 97 statsInterval time.Duration 98 99 // fingerprintResCh is used to be triggered that there are new devices 100 fingerprintResCh chan struct{} 101 102 // instances is the list of managed devices 103 instances map[loader.PluginID]*instanceManager 104 105 // reattachConfigs stores the plugin reattach configs 106 reattachConfigs map[loader.PluginID]*pstructs.ReattachConfig 107 reattachConfigLock sync.Mutex 108 } 109 110 // New returns a new device manager 111 func New(c *Config) *manager { 112 ctx, cancel := context.WithCancel(context.Background()) 113 return &manager{ 114 logger: c.Logger.Named("device_mgr"), 115 state: c.State, 116 ctx: ctx, 117 cancel: cancel, 118 loader: c.Loader, 119 pluginConfig: c.PluginConfig, 120 updater: c.Updater, 121 statsInterval: c.StatsInterval, 122 instances: make(map[loader.PluginID]*instanceManager), 123 reattachConfigs: make(map[loader.PluginID]*pstructs.ReattachConfig), 124 fingerprintResCh: make(chan struct{}, 1), 125 } 126 } 127 128 // PluginType identifies this manager to the plugin manager and satisfies the PluginManager interface. 129 func (*manager) PluginType() string { return base.PluginTypeDevice } 130 131 // Run starts the device manager. The manager will shutdown any previously 132 // launched plugin and then begin fingerprinting and stats collection on all new 133 // device plugins. 134 func (m *manager) Run() { 135 // Check if there are any plugins that didn't get cleanly shutdown before 136 // and if there are shut them down. 137 m.cleanupStalePlugins() 138 139 // Get device plugins 140 devices := m.loader.Catalog()[base.PluginTypeDevice] 141 if len(devices) == 0 { 142 m.logger.Debug("exiting since there are no device plugins") 143 m.cancel() 144 return 145 } 146 147 for _, d := range devices { 148 id := loader.PluginInfoID(d) 149 storeFn := func(c *plugin.ReattachConfig) error { 150 id := id 151 return m.storePluginReattachConfig(id, c) 152 } 153 m.instances[id] = newInstanceManager(&instanceManagerConfig{ 154 Logger: m.logger, 155 Ctx: m.ctx, 156 Loader: m.loader, 157 StoreReattach: storeFn, 158 PluginConfig: m.pluginConfig, 159 Id: &id, 160 FingerprintOutCh: m.fingerprintResCh, 161 StatsInterval: m.statsInterval, 162 }) 163 } 164 165 // Now start the fingerprint handler 166 go m.fingerprint() 167 } 168 169 // fingerprint is the main fingerprint loop 170 func (m *manager) fingerprint() { 171 for { 172 select { 173 case <-m.ctx.Done(): 174 return 175 case <-m.fingerprintResCh: 176 } 177 178 // Collect the data 179 var fingerprinted []*device.DeviceGroup 180 for _, i := range m.instances { 181 fingerprinted = append(fingerprinted, i.Devices()...) 182 } 183 184 // Convert and update 185 out := make([]*structs.NodeDeviceResource, len(fingerprinted)) 186 for i, f := range fingerprinted { 187 out[i] = convertDeviceGroup(f) 188 } 189 190 // Call the updater 191 m.updater(out) 192 } 193 } 194 195 // Shutdown cleans up all the plugins 196 func (m *manager) Shutdown() { 197 // Cancel the context to stop any requests 198 m.cancel() 199 200 // Go through and shut everything down 201 for _, i := range m.instances { 202 i.cleanup() 203 } 204 } 205 206 func (m *manager) WaitForFirstFingerprint(ctx context.Context) <-chan struct{} { 207 ctx, cancel := context.WithCancel(ctx) 208 go func() { 209 var wg sync.WaitGroup 210 for i := range m.instances { 211 wg.Add(1) 212 go func(instance *instanceManager) { 213 instance.WaitForFirstFingerprint(ctx) 214 wg.Done() 215 }(m.instances[i]) 216 } 217 wg.Wait() 218 cancel() 219 }() 220 return ctx.Done() 221 } 222 223 // Reserve reserves the given allocated device. If the device is unknown, an 224 // UnknownDeviceErr is returned. 225 func (m *manager) Reserve(d *structs.AllocatedDeviceResource) (*device.ContainerReservation, error) { 226 // Go through each plugin and see if it can reserve the resources 227 for _, i := range m.instances { 228 if !i.HasDevices(d) { 229 continue 230 } 231 232 // We found a match so reserve 233 return i.Reserve(d) 234 } 235 236 return nil, UnknownDeviceErrFromAllocated("failed to reserve devices", d) 237 } 238 239 // AllStats returns statistics for all the devices 240 func (m *manager) AllStats() []*device.DeviceGroupStats { 241 // Go through each plugin and collect stats 242 var stats []*device.DeviceGroupStats 243 for _, i := range m.instances { 244 stats = append(stats, i.AllStats()...) 245 } 246 247 return stats 248 } 249 250 // DeviceStats returns the statistics for the passed devices. If the device is unknown, an 251 // UnknownDeviceErr is returned. 252 func (m *manager) DeviceStats(d *structs.AllocatedDeviceResource) (*device.DeviceGroupStats, error) { 253 // Go through each plugin and see if it has the requested devices 254 for _, i := range m.instances { 255 if !i.HasDevices(d) { 256 continue 257 } 258 259 // We found a match so reserve 260 return i.DeviceStats(d), nil 261 } 262 263 return nil, UnknownDeviceErrFromAllocated("failed to collect statistics", d) 264 } 265 266 // cleanupStalePlugins reads the device managers state and shuts down any 267 // previously launched plugin. 268 func (m *manager) cleanupStalePlugins() error { 269 270 // Read the old plugin state 271 s, err := m.state.GetDevicePluginState() 272 if err != nil { 273 return fmt.Errorf("failed to read plugin state: %v", err) 274 } 275 276 // No state was stored so there is nothing to do. 277 if s == nil { 278 return nil 279 } 280 281 // For each plugin go through and try to shut it down 282 var mErr multierror.Error 283 for name, c := range s.ReattachConfigs { 284 rc, err := pstructs.ReattachConfigToGoPlugin(c) 285 if err != nil { 286 multierror.Append(&mErr, fmt.Errorf("failed to convert reattach config: %v", err)) 287 continue 288 } 289 290 instance, err := m.loader.Reattach(name, base.PluginTypeDevice, rc) 291 if err != nil { 292 multierror.Append(&mErr, fmt.Errorf("failed to reattach to plugin %q: %v", name, err)) 293 continue 294 } 295 296 // Kill the instance 297 instance.Kill() 298 } 299 300 return mErr.ErrorOrNil() 301 } 302 303 // storePluginReattachConfig is used as a callback to the instance managers and 304 // persists thhe plugin reattach configurations. 305 func (m *manager) storePluginReattachConfig(id loader.PluginID, c *plugin.ReattachConfig) error { 306 m.reattachConfigLock.Lock() 307 defer m.reattachConfigLock.Unlock() 308 309 // Store the new reattach config 310 m.reattachConfigs[id] = pstructs.ReattachConfigFromGoPlugin(c) 311 312 // Persist the state 313 s := &state.PluginState{ 314 ReattachConfigs: make(map[string]*pstructs.ReattachConfig, len(m.reattachConfigs)), 315 } 316 317 for id, c := range m.reattachConfigs { 318 s.ReattachConfigs[id.Name] = c 319 } 320 321 return m.state.PutDevicePluginState(s) 322 }