github.com/sentienttechnologies/studio-go-runner@v0.0.0-20201118202441-6d21f2ced8ee/internal/runner/objectstore.go (about)

     1  // Copyright 2018-2020 (c) Cognizant Digital Business, Evolutionary AI. All rights reserved. Issued under the Apache 2.0 License.
     2  
     3  package runner
     4  
     5  // This file contains the implementation of storage that can use an internal cache along with the MD5
     6  // hash of the files contents to avoid downloads that are not needed.
     7  
     8  import (
     9  	"bufio"
    10  	"context"
    11  	"fmt"
    12  	"io/ioutil"
    13  	"os"
    14  	"path/filepath"
    15  	"sync"
    16  	"time"
    17  
    18  	"github.com/go-stack/stack"
    19  	"github.com/jjeffery/kv" // MIT License
    20  
    21  	"github.com/lthibault/jitterbug"
    22  
    23  	"github.com/karlmutch/ccache"
    24  
    25  	"github.com/karlmutch/go-shortid"
    26  
    27  	"github.com/prometheus/client_golang/prometheus"
    28  )
    29  
var (
	// cacheHits counts the artifact fetches that were satisfied from the local
	// cache directory.  Each sample is labelled with the host name and the hash
	// of the artifact.
	cacheHits = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "runner_cache_hits",
			Help: "Number of artifact cache hits.",
		},
		[]string{"host", "hash"},
	)
	// cacheMisses counts the fetch attempts that could not be satisfied from the
	// local cache directory, including fetches made when no cache directory has
	// been configured.  It carries the same labels as cacheHits.
	cacheMisses = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "runner_cache_misses",
			Help: "Number of artifact cache misses.",
		},
		[]string{"host", "hash"},
	)

	// host is the value used for the "host" label of the cache metrics, it is
	// assigned by this file's init function
	host = ""
)
    48  
    49  func init() {
    50  	host, _ = os.Hostname()
    51  }
    52  
// objStore wraps a concrete Storage implementation, its Fetch method
// consults the local artifact cache before using the wrapped storage
type objStore struct {
	store  Storage       // The storage implementation that requests are delegated to
	ErrorC chan kv.Error // Receives non fatal caching errors, Fetch only uses non-blocking sends on it
}
    57  
    58  // NewObjStore is used to instantiate an object store for the running that includes a cache
    59  //
    60  func NewObjStore(ctx context.Context, spec *StoreOpts, errorC chan kv.Error) (oStore *objStore, err kv.Error) {
    61  	store, err := NewStorage(ctx, spec)
    62  	if err != nil {
    63  		return nil, err
    64  	}
    65  
    66  	return &objStore{
    67  		store:  store,
    68  		ErrorC: errorC,
    69  	}, nil
    70  }
    71  
var (
	// backingDir is the directory holding the cached artifacts, it remains empty
	// until InitObjStore has been called and while empty Fetch bypasses the cache
	backingDir = ""

	cacheMax      int64         // The size that was passed to InitObjStore, reported by ObjStoreFootPrint
	cacheInit     sync.Once     // Limits the starting of the directory groomer to a single occurrence
	cacheInitSync sync.Mutex    // Held by InitObjStore while the cache is being set up
	cache         *ccache.Cache // In memory record of the cached files, nil until InitObjStore creates it
)
    80  
    81  func groom(backingDir string, removedC chan os.FileInfo, errorC chan kv.Error) {
    82  	if cache == nil {
    83  		return
    84  	}
    85  	cachedFiles, err := ioutil.ReadDir(backingDir)
    86  	if err != nil {
    87  
    88  		go func() {
    89  			defer func() {
    90  				recover()
    91  			}()
    92  			select {
    93  			case errorC <- kv.Wrap(err, fmt.Sprintf("cache dir %s refresh failure", backingDir)).With("stack", stack.Trace().TrimRuntime()):
    94  			case <-time.After(time.Second):
    95  				fmt.Printf("%s\n", kv.Wrap(err, fmt.Sprintf("cache dir %s refresh failed", backingDir)).With("stack", stack.Trace().TrimRuntime()))
    96  			}
    97  		}()
    98  		return
    99  	}
   100  
   101  	for _, file := range cachedFiles {
   102  		// Is an expired or missing file in cache data structure, if it is not a directory delete it
   103  		item := cache.Sample(file.Name())
   104  		if item == nil || item.Expired() {
   105  			info, err := os.Stat(filepath.Join(backingDir, file.Name()))
   106  			if err == nil {
   107  				if info.IsDir() {
   108  					continue
   109  				}
   110  				select {
   111  				case removedC <- info:
   112  				case <-time.After(time.Second):
   113  				}
   114  				if err = os.Remove(filepath.Join(backingDir, file.Name())); err != nil {
   115  					select {
   116  					case errorC <- kv.Wrap(err, fmt.Sprintf("cache dir %s remove failed", backingDir)).With("stack", stack.Trace().TrimRuntime()):
   117  					case <-time.After(time.Second):
   118  						fmt.Printf("%s\n", kv.Wrap(err, fmt.Sprintf("cache dir %s remove failed", backingDir)).With("stack", stack.Trace().TrimRuntime()))
   119  					}
   120  				}
   121  			}
   122  		}
   123  	}
   124  }
   125  
   126  // groomDir will scan the in memory cache and if there are files that are on disk
   127  // but not in the cache they will be reaped
   128  //
   129  func groomDir(ctx context.Context, backingDir string, removedC chan os.FileInfo, errorC chan kv.Error) (triggerC chan struct{}) {
   130  	triggerC = make(chan struct{})
   131  
   132  	go func() {
   133  		check := NewTrigger(triggerC, time.Second*30, &jitterbug.Norm{Stdev: time.Second * 3})
   134  		defer check.Stop()
   135  
   136  		for {
   137  			select {
   138  			case <-check.C:
   139  				groom(backingDir, removedC, errorC)
   140  
   141  			case <-ctx.Done():
   142  				return
   143  			}
   144  		}
   145  	}()
   146  
   147  	return triggerC
   148  }
   149  
   150  // ClearObjStore can be used by clients to erase the contents of the object store cache
   151  //
   152  func ClearObjStore() (err kv.Error) {
   153  	// The ccache works by having the in memory tracking cache as the record to truth.  if we
   154  	// delete the files on disk then when they are fetched they will be invalidated.  If they expire
   155  	// then nothing will be done by the groomer
   156  	//
   157  	cachedFiles, errGo := ioutil.ReadDir(backingDir)
   158  	if errGo != nil {
   159  		return kv.Wrap(errGo).With("backingDir", backingDir).With("stack", stack.Trace().TrimRuntime())
   160  	}
   161  	for _, file := range cachedFiles {
   162  		if file.Name()[0] == '.' {
   163  			continue
   164  		}
   165  		info, err := os.Stat(filepath.Join(backingDir, file.Name()))
   166  		if err == nil {
   167  			if info.IsDir() {
   168  				continue
   169  			}
   170  			if err = os.Remove(filepath.Join(backingDir, file.Name())); err != nil {
   171  				return kv.Wrap(err, fmt.Sprintf("cache dir %s remove failed", backingDir)).With("stack", stack.Trace().TrimRuntime())
   172  			}
   173  		}
   174  	}
   175  	return nil
   176  }
   177  
   178  // ObjStoreFootPrint can be used to determine what the cxurrent footprint of the
   179  // artifact cache is
   180  //
   181  func ObjStoreFootPrint() (max int64) {
   182  	return cacheMax
   183  }
   184  
   185  // InitObjStore sets up the backing store for our object store cache.  The size specified
   186  // can be any byte amount.
   187  //
   188  // The triggerC channel is functional when the err value is nil, this channel can be used to manually
   189  // trigger the disk caching sub system
   190  //
   191  func InitObjStore(ctx context.Context, backing string, size int64, removedC chan os.FileInfo, errorC chan kv.Error) (triggerC chan<- struct{}, err kv.Error) {
   192  	if len(backing) == 0 {
   193  		// If we dont have a backing store dont start the cache
   194  		return nil, kv.NewError("empty cache directory name").With("stack", stack.Trace().TrimRuntime())
   195  	}
   196  
   197  	// Also make sure that the specified directory actually exists
   198  	if stat, errGo := os.Stat(backing); errGo != nil || !stat.IsDir() {
   199  		if errGo != nil {
   200  			return nil, kv.Wrap(errGo, "cache directory does not exist").With("backing", backing).With("stack", stack.Trace().TrimRuntime())
   201  		}
   202  		return nil, kv.NewError("cache name specified is not a directory").With("backing", backing).With("stack", stack.Trace().TrimRuntime())
   203  	}
   204  
   205  	// Now load a list of the files in the cache directory which further checks
   206  	// our ability to use the storage
   207  	//
   208  	cachedFiles, errGo := ioutil.ReadDir(backing)
   209  	if errGo != nil {
   210  		return nil, kv.Wrap(errGo, "cache directory not readable").With("backing", backing).With("stack", stack.Trace().TrimRuntime())
   211  	}
   212  
   213  	// Finally try to create and delete a working file
   214  	id, errGo := shortid.Generate()
   215  	if errGo != nil {
   216  		return nil, kv.Wrap(errGo, "cache directory not writable").With("backing", backing).With("stack", stack.Trace().TrimRuntime())
   217  	}
   218  	tmpFile := filepath.Join(backing, id)
   219  
   220  	errGo = ioutil.WriteFile(tmpFile, []byte{0}, 0600)
   221  	if errGo != nil {
   222  		return nil, kv.Wrap(errGo, "cache directory not writable").With("backing", backing).With("stack", stack.Trace().TrimRuntime())
   223  	}
   224  	os.Remove(tmpFile)
   225  
   226  	// When the cache init is called we only want one caller at a time through and they
   227  	// should only call the initializer function once, successfully, retries are permitted.
   228  	//
   229  	cacheInitSync.Lock()
   230  	defer cacheInitSync.Unlock()
   231  
   232  	if cache != nil {
   233  		return nil, kv.Wrap(err, "cache is already initialized").With("stack", stack.Trace().TrimRuntime())
   234  	}
   235  
   236  	// Registry the monitoring items for measurement purposes by external parties,
   237  	// these are only activated if the caching is being used
   238  	if errGo = prometheus.Register(cacheHits); errGo != nil {
   239  		select {
   240  		case errorC <- kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()):
   241  		default:
   242  		}
   243  	}
   244  	if errGo = prometheus.Register(cacheMisses); errGo != nil {
   245  		select {
   246  		case errorC <- kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()):
   247  		default:
   248  		}
   249  	}
   250  
   251  	select {
   252  	case errorC <- kv.NewError("cache enabled").With("stack", stack.Trace().TrimRuntime()):
   253  	default:
   254  	}
   255  
   256  	// Store the backing store directory for the cache
   257  	backingDir = backing
   258  	cacheMax = size
   259  
   260  	// The backing store might have partial downloads inside it.  We should clear those, ignoring kv.
   261  	// and then re-create the partial download directory
   262  	partialDir := filepath.Join(backingDir, ".partial")
   263  	os.RemoveAll(partialDir)
   264  
   265  	if errGo = os.MkdirAll(partialDir, 0700); err != nil {
   266  		return nil, kv.Wrap(errGo, "unable to create the partial downloads dir ", partialDir).With("stack", stack.Trace().TrimRuntime())
   267  	}
   268  
   269  	// Size the cache appropriately, and track items that are in use through to their being released,
   270  	// which prevents items being read from being groomed and then new copies of the same
   271  	// data appearing
   272  	cache = ccache.New(ccache.Configure().MaxSize(size).GetsPerPromote(1).ItemsToPrune(1))
   273  
   274  	// Now populate the lookaside cache with the files found in the cache directory and their sizes
   275  	for i, file := range cachedFiles {
   276  		if file.IsDir() {
   277  			continue
   278  		}
   279  		if file.Name()[0] != '.' {
   280  			cache.Fetch(file.Name(), time.Hour*48,
   281  				func() (interface{}, error) {
   282  					return cachedFiles[i], nil
   283  				})
   284  		}
   285  	}
   286  
   287  	// Now start the directory groomer
   288  	cacheInit.Do(func() {
   289  		triggerC = groomDir(ctx, backingDir, removedC, errorC)
   290  	})
   291  
   292  	return triggerC, nil
   293  }
   294  
   295  // CacheProbe can be used to test the validity of the cache for a previously cached item.
   296  //
   297  func CacheProbe(key string) bool {
   298  	return cache.Get(key) != nil && !cache.Get(key).Expired()
   299  }
   300  
   301  // Hash will return the hash of a stored file or other blob.  This method can be used
   302  // by a caching layer or by a client to obtain the unique content based identity of the
   303  // resource being stored.
   304  //
   305  func (s *objStore) Hash(ctx context.Context, name string) (hash string, err kv.Error) {
   306  	return s.store.Hash(ctx, name)
   307  }
   308  
   309  // Gather is used to retrieve files prefixed with a specific key.  It is used to retrieve the individual files
   310  // associated with a previous Hoard operation
   311  //
   312  func (s *objStore) Gather(ctx context.Context, keyPrefix string, outputDir string) (warnings []kv.Error, err kv.Error) {
   313  	// Retrieve individual files, without using the cache, tap is set to nil
   314  	return s.store.Gather(ctx, keyPrefix, outputDir, nil)
   315  }
   316  
   317  // Fetch is used by client to retrieve resources from a concrete storage system.  This function will
   318  // invoke storage system logic that may retrieve resources from a cache.
   319  //
   320  func (s *objStore) Fetch(ctx context.Context, name string, unpack bool, output string) (warns []kv.Error, err kv.Error) {
   321  	// Check for meta data, MD5, from the upstream and then examine our cache for a match
   322  	hash, err := s.store.Hash(ctx, name)
   323  	if err != nil {
   324  		return warns, err
   325  	}
   326  
   327  	// If there is no cache simply download the file, and so we supply a nil for the tap
   328  	// for our tap
   329  	if len(backingDir) == 0 {
   330  		cacheMisses.With(prometheus.Labels{"host": host, "hash": hash}).Inc()
   331  		return s.store.Fetch(ctx, name, unpack, output, nil)
   332  	}
   333  
   334  	// triggers LRU to elevate the item being retrieved
   335  	if len(hash) != 0 {
   336  		if item := cache.Get(hash); item != nil {
   337  			if !item.Expired() {
   338  				item.Extend(48 * time.Hour)
   339  			}
   340  		}
   341  	}
   342  
   343  	startTime := time.Now()
   344  
   345  	// Define a time period on which we repeat checking for the presence of a partial
   346  	// download that is for the artifact we are waiting for and before we recheck for
   347  	// the continued presence of the artifact
   348  	waitOnPartial := time.Duration(33 * time.Second)
   349  
   350  	// If there is caching we should loop until we have a good file in the cache, and
   351  	// if appropriate based on the contents of the partial download directory be doing
   352  	// or waiting for the download to happen, respecting the notion that only one of
   353  	// the waiters should be downloading actively
   354  	//
   355  	downloader := false
   356  
   357  	// Loop termination conditions include a timeout and successful completion
   358  	// of the download
   359  	for {
   360  		// Examine the local file cache and use the file from there if present
   361  		localName := filepath.Join(backingDir, hash)
   362  		if _, errGo := os.Stat(localName); errGo == nil {
   363  			spec := StoreOpts{
   364  				Art: &Artifact{
   365  					Qualified: fmt.Sprintf("file:///%s", localName),
   366  				},
   367  				Validate: true,
   368  			}
   369  			localFS, err := NewStorage(ctx, &spec)
   370  			if err != nil {
   371  				return warns, err
   372  			}
   373  			// Because the file is already in the cache we dont supply a tap here
   374  			w, err := localFS.Fetch(ctx, localName, unpack, output, nil)
   375  			if err == nil {
   376  				cacheHits.With(prometheus.Labels{"host": host, "hash": hash}).Inc()
   377  				return warns, nil
   378  			}
   379  
   380  			// Drops through to allow for a fresh download, after saving the errors
   381  			// as warnings for the caller so that caching failures can be observed
   382  			// and diagnosed
   383  			for _, warn := range w {
   384  				warns = append(warns, warn)
   385  			}
   386  			warns = append(warns, err)
   387  		}
   388  		cacheMisses.With(prometheus.Labels{"host": host, "hash": hash}).Inc()
   389  
   390  		if ctx.Err() != nil {
   391  			if downloader {
   392  				return warns, kv.NewError("downloading artifact terminated").With("stack", stack.Trace().TrimRuntime()).With("file", name)
   393  			}
   394  			return warns, kv.NewError("waiting for artifact terminated").With("stack", stack.Trace().TrimRuntime()).With("file", name)
   395  		}
   396  		downloader = false
   397  
   398  		// Look for partial downloads, if a downloader is found then wait for the file to appear
   399  		// inside the main directory
   400  		//
   401  		partial := filepath.Join(backingDir, ".partial", hash)
   402  		if _, errGo := os.Stat(partial); errGo == nil {
   403  			select {
   404  			case <-ctx.Done():
   405  				return warns, err
   406  			case <-time.After(waitOnPartial):
   407  				warn := kv.NewError("pending").With("since", time.Now().Sub(startTime).String(), "partial", partial, "file", name, "stack", stack.Trace().TrimRuntime())
   408  				warns = append(warns, warn)
   409  			}
   410  			continue
   411  		}
   412  
   413  		// If there is no partial file yet try to create a partial file with
   414  		// the exclusive and create flags set which avoids two threads
   415  		// creating the file on top of each other
   416  		//
   417  		file, errGo := os.OpenFile(partial, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0600)
   418  		if errGo != nil {
   419  			select {
   420  			case s.ErrorC <- kv.Wrap(errGo, "file open failure").With("stack", stack.Trace().TrimRuntime()).With("file", partial):
   421  			case <-ctx.Done():
   422  				return warns, err
   423  			default:
   424  			}
   425  			select {
   426  			case <-ctx.Done():
   427  				return warns, err
   428  			case <-time.After(waitOnPartial):
   429  				warn := kv.Wrap(errGo).With("since", time.Now().Sub(startTime).String(), "partial", partial, "file", name, "stack", stack.Trace().TrimRuntime())
   430  				warns = append(warns, warn)
   431  			}
   432  			continue
   433  		}
   434  		downloader = true
   435  
   436  		tapWriter := bufio.NewWriter(file)
   437  
   438  		// Having gained the file to download into call the fetch method and supply the io.WriteClose
   439  		// to the concrete downloader
   440  		//
   441  		w, err := s.store.Fetch(ctx, name, unpack, output, tapWriter)
   442  
   443  		tapWriter.Flush()
   444  		file.Close()
   445  
   446  		// Save warnings from intermediate components, even if there are no
   447  		// unrecoverable errors
   448  		for _, warn := range w {
   449  			warns = append(warns, warn)
   450  		}
   451  
   452  		if err == nil {
   453  			info, errGo := os.Stat(partial)
   454  			if errGo == nil {
   455  				cache.Fetch(info.Name(), time.Hour*48,
   456  					func() (interface{}, error) {
   457  						return info, nil
   458  					})
   459  			} else {
   460  				select {
   461  				case <-ctx.Done():
   462  					return warns, err
   463  				case s.ErrorC <- kv.Wrap(errGo, "file cache failure").With("stack", stack.Trace().TrimRuntime()).With("file", partial).With("file", localName):
   464  				default:
   465  				}
   466  			}
   467  			// Move the downloaded file from .partial into our base cache directory,
   468  			// and need to handle the file from the applications perspective is done
   469  			// by the Fetch, if the rename files there is nothing we can do about it
   470  			// so simply continue as the application will have the data anyway
   471  			if errGo = os.Rename(partial, localName); errGo != nil {
   472  				select {
   473  				case s.ErrorC <- kv.Wrap(errGo, "file rename failure").With("stack", stack.Trace().TrimRuntime()).With("file", partial).With("file", localName):
   474  				default:
   475  				}
   476  			}
   477  
   478  			return warns, nil
   479  		}
   480  		select {
   481  		case s.ErrorC <- err:
   482  		default:
   483  		}
   484  		// If we had a working file get rid of it, this is because leaving it in place will
   485  		// block further download attempts
   486  		if errGo = os.Remove(partial); errGo != nil {
   487  			warn := kv.Wrap(errGo).With("since", time.Now().Sub(startTime).String(), "partial", partial, "file", name, "stack", stack.Trace().TrimRuntime())
   488  			warns = append(warns, warn)
   489  		}
   490  
   491  		select {
   492  		case <-ctx.Done():
   493  			return warns, err
   494  		case <-time.After(waitOnPartial):
   495  			warn := kv.NewError("reattempting").With("since", time.Now().Sub(startTime).String(), "partial", partial, "file", name, "stack", stack.Trace().TrimRuntime())
   496  			warns = append(warns, warn)
   497  		}
   498  	} // End of for {}
   499  	// unreachable
   500  }
   501  
   502  // Hoard is used to place a directory with individual files into the storage resource within the storage implemented
   503  // by a specific implementation.
   504  //
   505  func (s *objStore) Hoard(ctx context.Context, srcDir string, destPrefix string) (warns []kv.Error, err kv.Error) {
   506  	// Place an item into the cache
   507  	return s.store.Hoard(ctx, srcDir, destPrefix)
   508  }
   509  
   510  // Deposit is used to place a file or other storage resource within the storage implemented
   511  // by a specific implementation.
   512  //
   513  func (s *objStore) Deposit(ctx context.Context, src string, dest string) (warns []kv.Error, err kv.Error) {
   514  	// Place an item into the cache
   515  	return s.store.Deposit(ctx, src, dest)
   516  }
   517  
   518  // Close is used to clean up any resources allocated to the storage by calling the implementation Close
   519  // method.
   520  //
   521  func (s *objStore) Close() {
   522  	s.store.Close()
   523  }