github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/memsys/mmsa.go (about)

     1  // Package memsys provides memory management and slab/SGL allocation with io.Reader and io.Writer interfaces
     2  // on top of scatter-gather lists of reusable buffers.
     3  /*
     4   * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
     5   */
     6  package memsys
     7  
     8  import (
     9  	"fmt"
    10  	"os"
    11  	"strconv"
    12  	"time"
    13  
    14  	"github.com/NVIDIA/aistore/cmn"
    15  	"github.com/NVIDIA/aistore/cmn/atomic"
    16  	"github.com/NVIDIA/aistore/cmn/cos"
    17  	"github.com/NVIDIA/aistore/cmn/debug"
    18  	"github.com/NVIDIA/aistore/sys"
    19  )
    20  
    21  // ====================== How to run unit tests ===========================
    22  //
    23  // 1. Run all tests with default parameters
    24  // $ go test -v
    25  // 2. ... and debug enabled
    26  // $ go test -v -tags=debug
    27  // 3. ... and deadbeef (build tag) enabled, to "DEADBEEF" every freed buffer
    28  // $ go test -v -tags=debug,deadbeef
    29  // 4. Run a given named test with the specified build tags for 100s
    30  // $ go test -v -tags=debug,deadbeef -run=Test_Sleep -duration=100s
    31  
    32  // ============== Memory Manager Slab Allocator (MMSA) ===========================
    33  //
    34  // MMSA is, simultaneously, a Slab and SGL allocator, and a memory manager
    35  // responsible for optimizing memory usage between different (more vs less) utilized
    36  // Slabs.
    37  //
    38  // Multiple MMSA instances may coexist in the system, each having its own
    39  // constraints and managing its own Slabs and SGLs.
    40  //
    41  // There will be use cases, however, when actually running a MMSA instance
    42  // won't be necessary: e.g., when an app utilizes a single (or a few distinct)
    43  // Slab size(s) for the duration of its relatively short lifecycle,
    44  // while at the same time preferring minimal interference with other running apps.
    45  //
    46  // In that sense, a typical initialization sequence includes 2 steps, e.g.:
    47  // 1) construct:
    48  // 	mm := &memsys.MMSA{TimeIval: ..., MinPctFree: ..., Name: ...}
    49  // 2) initialize:
    50  // 	err := mm.Init()
    51  // 	if err != nil {
    52  //		...
    53  // 	}
    54  //
    55  // To free up all memory allocated by a given MMSA instance, use its Terminate() method.
    56  //
    57  // In addition, there are several environment variables that can be used
    58  // (to circumvent the need to change the code, for instance):
    59  // 	"AIS_MINMEM_FREE"
    60  // 	"AIS_MINMEM_PCT_TOTAL"
    61  // 	"AIS_MINMEM_PCT_FREE"
    62  // These names should be self-explanatory.
    63  //
    64  // Once constructed and initialized, memory-manager-and-slab-allocator
    65  // (MMSA) can be exercised via its public API that includes
    66  // GetSlab() and Alloc*() methods.
    67  //
    68  // Once selected, each Slab instance can be used via its own public API that
    69  // includes Alloc() and Free() methods. In addition, each allocated SGL internally
    70  // utilizes one of the existing enumerated slabs to "grow" (that is, allocate more
    71  // buffers from the slab) on demand. For details, look for "grow" in iosgl.go.
    72  
// readme is the location of the memsys README.md, composed as a path under cmn.GitHubHome.
const readme = cmn.GitHubHome + "/blob/main/memsys/README.md"
    74  
    75  // =================================== tunables ==========================================
    76  // The minimum memory (that must remain available) gets computed as follows:
    77  // 1) environment AIS_MINMEM_FREE takes precedence over everything else;
    78  // 2) if AIS_MINMEM_FREE is not defined, environment variables AIS_MINMEM_PCT_TOTAL and/or
    79  //    AIS_MINMEM_PCT_FREE define percentages to compute the minimum based on total
    80  //    or the currently available memory, respectively;
    81  // 3) with no environment, the minimum is computed based on the following MMSA member variables:
    82  //	* MinFree     uint64        // memory that must be available at all times
    83  //	* MinPctTotal int           // same, via percentage of total
    84  //	* MinPctFree  int           // ditto, as % of free at init time
    85  //     (example:
    86  //         mm := &memsys.MMSA{MinPctTotal: 4, MinFree: cos.GiB * 2}
    87  //     )
    88  //  4) finally, if none of the above is specified, the constant `minMemFree` below is used
    89  //  Other important defaults are also commented below.
    90  // =================================== MMSA config defaults ==========================================
    91  
// PageSize and the default buffer sizes:
// DefaultBufSize and DefaultBuf2Size are multiples (x8 and x16) of PageSize,
// while DefaultSmallBufSize is a single cos.KiB.
const (
	PageSize            = cos.KiB * 4
	DefaultBufSize      = PageSize * 8
	DefaultBuf2Size     = PageSize * 16
	DefaultSmallBufSize = cos.KiB
)
    98  
// page slabs: pagesize increments up to MaxPageSlabSize
// (i.e., slab buffer sizes are PageSlabIncStep, 2*PageSlabIncStep, ..., MaxPageSlabSize)
const (
	MaxPageSlabSize = 128 * cos.KiB
	PageSlabIncStep = PageSize
	NumPageSlabs    = MaxPageSlabSize / PageSlabIncStep // = 32
)
   105  
// small slabs: 128 byte increments up to MaxSmallSlabSize
// (the latter being equal to PageSize, which is also the smallest page-slab increment)
const (
	MaxSmallSlabSize = PageSize
	SmallSlabIncStep = 128
	NumSmallSlabs    = MaxSmallSlabSize / SmallSlabIncStep // = 32
)
   112  
// NumStats is the length of the fixed-size per-slab arrays in Stats and slabStats.
const NumStats = NumPageSlabs // NOTE: must be >= NumSmallSlabs
   114  
// ring depth settings (counted in free buffers) and the "idle" load-average threshold
const (
	optDepth = 128  // ring "depth", i.e., num free bufs we trend to (see grow())
	minDepth = 4    // depth when idle or under OOM
	maxDepth = 4096 // exceeding warrants reallocation

	loadAvg = 10 // "idle" load average to deallocate Slabs when below
)
   122  
// countThreshold is used by _large2slab as the divisor when computing the minimum
// slab buffer size for a given (large) immediate size.
const countThreshold = 16 // exceeding this scatter-gather count warrants selecting a larger-(buffer)-size Slab

// swappingMax is the upper bound for the MMSA swap.crit counter.
const swappingMax = 4 // make sure that `swapping` condition, once noted, lingers for a while
   126  
type (
	// Stats is a fixed-size set of per-slab values (NumStats entries each);
	// the type of the pre-allocated MMSA.statsSnapshot.
	Stats struct {
		Hits [NumStats]uint64
		Idle [NumStats]time.Duration
	}
	// MMSA is the memory manager and slab allocator: it owns a set of Slabs (rings)
	// with buffer sizes that are multiples of slabIncStep, up to maxSlabSize.
	// Public fields are tunables to be set by the caller at construction time
	// (some can be overridden via environment, see env()); the rest is private state.
	MMSA struct {
		// public
		MinFree     uint64        // memory that must be available at all times
		TimeIval    time.Duration // interval of time to watch for low memory and make steps
		MinPctTotal int           // same, via percentage of total
		MinPctFree  int           // ditto, as % of free at init time
		Name        string
		// private
		info          string // cached "min-free, low-wm" portion of Str() output
		sibling       *MMSA  // the other MMSA that Free() and SelectMemAndSlab() redirect out-of-range sizes to
		lowWM         uint64
		rings         []*Slab // indexed by (buffer-size / slabIncStep) - 1, see GetSlab()
		sorted        []*Slab
		slabStats     *slabStats // private counters and idle timestamp
		statsSnapshot *Stats     // pre-allocated limited "snapshot" of slabStats
		slabIncStep   int64      // slab buffer size increment
		maxSlabSize   int64      // the largest slab buffer size
		defBufSize    int64      // default buffer size, see Alloc() and NewSGL()
		mem           sys.MemStat
		numSlabs      int
		// atomic state
		toGC     atomic.Int64 // accumulates over time and triggers GC upon reaching spec-ed limit
		optDepth atomic.Int64 // ring "depth", i.e., num free bufs we trend to (see grow())
		swap     struct {
			size atomic.Uint64 // actual swap size
			crit atomic.Int32  // tracks increasing swap size up to swappingMax const
		}
	}
	// FreeSpec specifies which slabs to reduce or free, and what to do with the freed memory.
	FreeSpec struct {
		IdleDuration time.Duration // reduce only the slabs that are idling for at least as much time
		MinSize      int64         // minimum freed size that'd warrant calling GC (default = sizetoGC)
		Totally      bool          // true: free all slabs regardless of their idle-ness and size
		ToOS         bool          // GC and then return the memory to the operating system
	}
	//
	// private
	//

	// slabStats holds private per-slab counters and idle timestamps (NumStats entries each);
	// `hits` and `idleTs` are atomics.
	slabStats struct {
		hits   [NumStats]atomic.Uint64
		prev   [NumStats]uint64
		hinc   [NumStats]uint64
		idleTs [NumStats]atomic.Int64
	}
)
   176  
   177  //////////
   178  // MMSA //
   179  //////////
   180  
   181  func (r *MMSA) String() string {
   182  	var (
   183  		mem sys.MemStat
   184  		err error
   185  	)
   186  	err = mem.Get()
   187  	debug.AssertNoErr(err)
   188  	return r.Str(&mem)
   189  }
   190  
   191  func (r *MMSA) Str(mem *sys.MemStat) string {
   192  	sp := r.pressure2S(r.Pressure(mem))
   193  	if r.info == "" {
   194  		r.info = "(min-free " + cos.ToSizeIEC(int64(r.MinFree), 0) + ", low-wm " + cos.ToSizeIEC(int64(r.lowWM), 0)
   195  	}
   196  	return r.Name + "[(" + mem.String() + "), " + sp + ", " + r.info + "]"
   197  }
   198  
   199  // allocate SGL
   200  //   - immediateSize: known size, OR minimum expected size, OR size to preallocate
   201  //     immediateSize == 0 translates as DefaultBufSize - for page MMSA,
   202  //     and DefaultSmallBufSize - for small-size MMSA
   203  //   - sbufSize: slab buffer size (optional)
   204  func (r *MMSA) NewSGL(immediateSize int64, sbufSize ...int64) *SGL {
   205  	var (
   206  		slab *Slab
   207  		n    int64
   208  		err  error
   209  	)
   210  	// 1. slab
   211  	if len(sbufSize) > 0 {
   212  		slab, err = r.GetSlab(sbufSize[0])
   213  	} else if immediateSize <= r.maxSlabSize {
   214  		// NOTE allocate imm. size in one shot when below max
   215  		if immediateSize == 0 {
   216  			immediateSize = r.defBufSize
   217  		}
   218  		i := cos.DivCeil(immediateSize, r.slabIncStep)
   219  		slab = r.rings[i-1]
   220  	} else {
   221  		slab = r._large2slab(immediateSize)
   222  	}
   223  	debug.AssertNoErr(err)
   224  
   225  	// 2. sgl
   226  	z := _allocSGL(r.isPage())
   227  	z.slab = slab
   228  	n = cos.DivCeil(immediateSize, slab.Size())
   229  	if cap(z.sgl) < int(n) {
   230  		z.sgl = make([][]byte, n)
   231  	} else {
   232  		z.sgl = z.sgl[:n]
   233  	}
   234  	slab.muget.Lock()
   235  	for i := range int(n) {
   236  		z.sgl[i] = slab._alloc()
   237  	}
   238  	slab.muget.Unlock()
   239  	return z
   240  }
   241  
   242  // gets Slab for a given fixed buffer size that must be within expected range of sizes
   243  // - the range supported by _this_ MMSA (compare w/ SelectMemAndSlab())
   244  func (r *MMSA) GetSlab(bufSize int64) (s *Slab, err error) {
   245  	a, b := bufSize/r.slabIncStep, bufSize%r.slabIncStep
   246  	if b != 0 {
   247  		err = fmt.Errorf("memsys: size %d must be a multiple of %d", bufSize, r.slabIncStep)
   248  		return
   249  	}
   250  	if a < 1 || a > int64(r.numSlabs) {
   251  		err = fmt.Errorf("memsys: size %d outside valid range", bufSize)
   252  		return
   253  	}
   254  	s = r.rings[a-1]
   255  	return
   256  }
   257  
   258  // uses SelectMemAndSlab to select both MMSA (page or small) and its Slab
   259  func (r *MMSA) AllocSize(size int64) (buf []byte, slab *Slab) {
   260  	_, slab = r.SelectMemAndSlab(size)
   261  	buf = slab.Alloc()
   262  	return
   263  }
   264  
   265  func (r *MMSA) Alloc() (buf []byte, slab *Slab) {
   266  	size := r.defBufSize
   267  	_, slab = r.SelectMemAndSlab(size)
   268  	buf = slab.Alloc()
   269  	return
   270  }
   271  
   272  func (r *MMSA) Free(buf []byte) {
   273  	size := int64(cap(buf))
   274  	if size > r.maxSlabSize && !r.isPage() {
   275  		r.sibling.Free(buf)
   276  	} else if size < r.slabIncStep && r.isPage() {
   277  		r.sibling.Free(buf)
   278  	} else {
   279  		debug.Assert(size%r.slabIncStep == 0)
   280  		debug.Assert(size/r.slabIncStep <= int64(r.numSlabs))
   281  
   282  		slab := r._selectSlab(size)
   283  		slab.Free(buf)
   284  	}
   285  }
   286  
   287  // Given a known, expected or minimum size to allocate, selects MMSA (page or small, if initialized)
   288  // and its Slab
   289  func (r *MMSA) SelectMemAndSlab(size int64) (mmsa *MMSA, slab *Slab) {
   290  	if size > r.maxSlabSize && !r.isPage() {
   291  		return r.sibling, r.sibling._selectSlab(size)
   292  	}
   293  	if size < r.slabIncStep && r.isPage() {
   294  		return r.sibling, r.sibling._selectSlab(size)
   295  	}
   296  	mmsa, slab = r, r._selectSlab(size)
   297  	return
   298  }
   299  
   300  func (r *MMSA) _selectSlab(size int64) (slab *Slab) {
   301  	if size >= r.maxSlabSize {
   302  		slab = r.rings[len(r.rings)-1]
   303  	} else if size <= r.slabIncStep {
   304  		slab = r.rings[0]
   305  	} else {
   306  		i := (size + r.slabIncStep - 1) / r.slabIncStep
   307  		slab = r.rings[i-1]
   308  	}
   309  	return
   310  }
   311  
   312  func (r *MMSA) Append(buf []byte, bytes string) (nbuf []byte) {
   313  	var (
   314  		ll, l, c = len(buf), len(bytes), cap(buf)
   315  		a        = ll + l - c
   316  	)
   317  	if a > 0 {
   318  		nbuf, _ = r.AllocSize(int64(c + a))
   319  		copy(nbuf, buf)
   320  		r.Free(buf)
   321  		nbuf = nbuf[:ll+l]
   322  	} else {
   323  		nbuf = buf[:ll+l]
   324  	}
   325  	copy(nbuf[ll:], bytes)
   326  	return
   327  }
   328  
   329  // private
   330  
   331  // select slab for SGL given a large immediate size to allocate
   332  func (r *MMSA) _large2slab(immediateSize int64) *Slab {
   333  	size := cos.DivCeil(immediateSize, countThreshold)
   334  	for _, slab := range r.rings {
   335  		if slab.Size() >= size {
   336  			return slab
   337  		}
   338  	}
   339  	return r.rings[len(r.rings)-1]
   340  }
   341  
   342  func (r *MMSA) env() (err error) {
   343  	var minfree int64
   344  	if a := os.Getenv("AIS_MINMEM_FREE"); a != "" {
   345  		if minfree, err = cos.ParseSize(a, cos.UnitsIEC); err != nil {
   346  			return fmt.Errorf("memsys: cannot parse AIS_MINMEM_FREE %q", a)
   347  		}
   348  		r.MinFree = uint64(minfree)
   349  	}
   350  	if a := os.Getenv("AIS_MINMEM_PCT_TOTAL"); a != "" {
   351  		if r.MinPctTotal, err = strconv.Atoi(a); err != nil {
   352  			return fmt.Errorf("memsys: cannot parse AIS_MINMEM_PCT_TOTAL %q", a)
   353  		}
   354  		if r.MinPctTotal < 0 || r.MinPctTotal > 100 {
   355  			return fmt.Errorf("memsys: invalid AIS_MINMEM_PCT_TOTAL %q", a)
   356  		}
   357  	}
   358  	if a := os.Getenv("AIS_MINMEM_PCT_FREE"); a != "" {
   359  		if r.MinPctFree, err = strconv.Atoi(a); err != nil {
   360  			return fmt.Errorf("memsys: cannot parse AIS_MINMEM_PCT_FREE %q", a)
   361  		}
   362  		if r.MinPctFree < 0 || r.MinPctFree > 100 {
   363  			return fmt.Errorf("memsys: invalid AIS_MINMEM_PCT_FREE %q", a)
   364  		}
   365  	}
   366  	return
   367  }