github.com/prattmic/llgo-embedded@v0.0.0-20150820070356-41cfecea0e1e/third_party/gofrontend/libgo/runtime/mgc0.c

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Garbage collector (GC).
     6  //
     7  // GC is:
     8  // - mark&sweep
     9  // - mostly precise (with the exception of some C-allocated objects, assembly frames/arguments, etc)
    10  // - parallel (up to MaxGcproc threads)
    11  // - partially concurrent (mark is stop-the-world, while sweep is concurrent)
    12  // - non-moving/non-compacting
    13  // - full (non-partial)
    14  //
    15  // GC rate.
    16  // Next GC is after we've allocated an extra amount of memory proportional to
    17  // the amount already in use. The proportion is controlled by the GOGC environment variable
    18  // (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
    19  // (this mark is tracked in next_gc variable). This keeps the GC cost in linear
    20  // proportion to the allocation cost. Adjusting GOGC just changes the linear constant
    21  // (and also the amount of extra memory used).
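        //
        // Spelling out the arithmetic (a rough sketch, not the exact code used
        // below): with "live" bytes in use after a collection,
        //
        //	next_gc = live + live*GOGC/100   (approximately)
        //
        // so 4M live with GOGC=100 gives 8M, and GOGC=200 would give 12M.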
    22  //
    23  // Concurrent sweep.
    24  // The sweep phase proceeds concurrently with normal program execution.
    25  // The heap is swept span-by-span both lazily (when a goroutine needs another span)
    26  // and concurrently in a background goroutine (this helps programs that are not CPU bound).
    27  // However, at the end of the stop-the-world GC phase we don't know the size of the live heap,
    28  // and so the next_gc calculation is tricky and happens as follows.
    29  // At the end of the stop-the-world phase next_gc is conservatively set based on total
    30  // heap size; all spans are marked as "needs sweeping".
    31  // Whenever a span is swept, next_gc is decremented by GOGC*newly_freed_memory.
    32  // The background sweeper goroutine simply sweeps spans one-by-one bringing next_gc
    33  // closer to the target value. However, this is not enough to avoid over-allocating memory.
    34  // Consider that a goroutine wants to allocate a new span for a large object and
    35  // there are no free swept spans, but there are small-object unswept spans.
    36  // If the goroutine naively allocates a new span, it can surpass the yet-unknown
    37  // target next_gc value. To prevent such cases, (1) when a goroutine needs
    38  // to allocate a new small-object span, it sweeps small-object spans for the same
    39  // object size until it frees at least one object; (2) when a goroutine needs to
    40  // allocate a large-object span from the heap, it sweeps spans until it frees at least
    41  // that many pages into the heap. Together these two measures ensure that we don't surpass
    42  // target next_gc value by a large margin. There is an exception: if a goroutine sweeps
    43  // and frees two nonadjacent one-page spans to the heap, it will allocate a new two-page span,
    44  // but there can still be other one-page unswept spans which could be combined into a two-page span.
    45  // It's critical to ensure that no operations proceed on unswept spans (that would corrupt
    46  // mark bits in the GC bitmap). During GC all mcaches are flushed into the central cache,
    47  // so they are empty. When a goroutine grabs a new span into mcache, it sweeps it.
    48  // When a goroutine explicitly frees an object or sets a finalizer, it ensures that
    49  // the span is swept (either by sweeping it, or by waiting for the concurrent sweep to finish).
    50  // The finalizer goroutine is kicked off only when all spans are swept.
    51  // When the next GC starts, it sweeps all not-yet-swept spans (if any).
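        //
        // In pseudo-C, pacing rules (1) and (2) above amount to roughly the
        // following (an illustrative sketch only: the helper names are made up;
        // the real logic lives in the span allocator, which uses
        // runtime_MSpan_Sweep, defined later in this file):
        //
        //	// (1) Small-object allocation: sweep unswept spans of the same
        //	// size class until at least one object has been freed.
        //	while(nfreed == 0 && (s = next_unswept_span(sizeclass)) != nil)
        //		nfreed += sweep_and_count_freed_objects(s);
        //
        //	// (2) Large-object allocation of npages: sweep until at least
        //	// npages pages have been returned to the heap.
        //	while(pagesfreed < npages && (s = next_unswept_span_any()) != nil)
        //		pagesfreed += sweep_and_count_freed_pages(s);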
    52  
    53  #include <unistd.h>
    54  
    55  #include "runtime.h"
    56  #include "arch.h"
    57  #include "malloc.h"
    58  #include "mgc0.h"
    59  #include "chan.h"
    60  #include "go-type.h"
    61  
    62  // Map gccgo field names to gc field names.
    63  // Slice aka __go_open_array.
    64  #define array __values
    65  #define cap __capacity
    66  // Iface aka __go_interface
    67  #define tab __methods
    68  // Hmap aka __go_map
    69  typedef struct __go_map Hmap;
    70  // Type aka __go_type_descriptor
    71  #define string __reflection
    72  #define KindPtr GO_PTR
    73  #define KindNoPointers GO_NO_POINTERS
    74  #define kindMask GO_CODE_MASK
    75  // PtrType aka __go_ptr_type
    76  #define elem __element_type
    77  
    78  #ifdef USING_SPLIT_STACK
    79  
    80  extern void * __splitstack_find (void *, void *, size_t *, void **, void **,
    81  				 void **);
    82  
    83  extern void * __splitstack_find_context (void *context[10], size_t *, void **,
    84  					 void **, void **);
    85  
    86  #endif
    87  
    88  enum {
    89  	Debug = 0,
    90  	CollectStats = 0,
    91  	ConcurrentSweep = 1,
    92  
    93  	WorkbufSize	= 16*1024,
    94  	FinBlockSize	= 4*1024,
    95  
    96  	handoffThreshold = 4,
    97  	IntermediateBufferCapacity = 64,
    98  
    99  	// Bits in type information
   100  	PRECISE = 1,
   101  	LOOP = 2,
   102  	PC_BITS = PRECISE | LOOP,
   103  
   104  	RootData	= 0,
   105  	RootBss		= 1,
   106  	RootFinalizers	= 2,
   107  	RootSpanTypes	= 3,
   108  	RootFlushCaches = 4,
   109  	RootCount	= 5,
   110  };
   111  
   112  #define GcpercentUnknown (-2)
   113  
   114  // Initialized from $GOGC.  GOGC=off means no gc.
   115  static int32 gcpercent = GcpercentUnknown;
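        // The usual convention here (a sketch; the parsing itself happens
        // elsewhere in the runtime): an unset GOGC leaves the default of 100,
        // GOGC=off yields a negative value that disables collection, and any
        // other value is the percentage used in the next_gc calculation.
        // GcpercentUnknown simply means the variable has not been read yet.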
   116  
   117  static FuncVal* poolcleanup;
   118  
   119  void sync_runtime_registerPoolCleanup(FuncVal*)
   120    __asm__ (GOSYM_PREFIX "sync.runtime_registerPoolCleanup");
   121  
   122  void
   123  sync_runtime_registerPoolCleanup(FuncVal *f)
   124  {
   125  	poolcleanup = f;
   126  }
   127  
   128  static void
   129  clearpools(void)
   130  {
   131  	P *p, **pp;
   132  	MCache *c;
   133  
   134  	// clear sync.Pools
   135  	if(poolcleanup != nil) {
   136  		__builtin_call_with_static_chain(poolcleanup->fn(),
   137  						 poolcleanup);
   138  	}
   139  
   140  	for(pp=runtime_allp; (p=*pp) != nil; pp++) {
   141  		// clear tinyalloc pool
   142  		c = p->mcache;
   143  		if(c != nil) {
   144  			c->tiny = nil;
   145  			c->tinysize = 0;
   146  		}
   147  		// clear defer pools
   148  		p->deferpool = nil;
   149  	}
   150  }
   151  
   152  // Holding worldsema grants an M the right to try to stop the world.
   153  // The procedure is:
   154  //
   155  //	runtime_semacquire(&runtime_worldsema);
   156  //	m->gcing = 1;
   157  //	runtime_stoptheworld();
   158  //
   159  //	... do stuff ...
   160  //
   161  //	m->gcing = 0;
   162  //	runtime_semrelease(&runtime_worldsema);
   163  //	runtime_starttheworld();
   164  //
   165  uint32 runtime_worldsema = 1;
   166  
   167  typedef struct Workbuf Workbuf;
   168  struct Workbuf
   169  {
   170  #define SIZE (WorkbufSize-sizeof(LFNode)-sizeof(uintptr))
   171  	LFNode  node; // must be first
   172  	uintptr nobj;
   173  	Obj     obj[SIZE/sizeof(Obj) - 1];
   174  	uint8   _padding[SIZE%sizeof(Obj) + sizeof(Obj)];
   175  #undef SIZE
   176  };
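        // To make the layout concrete, a worked example assuming a 64-bit target
        // where LFNode is two words and Obj is three words (24 bytes):
        // SIZE = 16384 - 16 - 8 = 16360, so obj holds 16360/24 - 1 = 680 entries,
        // the padding is 16360%24 + 24 = 40 bytes, and sizeof(Workbuf) comes back
        // to exactly 16 + 8 + 680*24 + 40 = 16384 = WorkbufSize.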
   177  
   178  typedef struct Finalizer Finalizer;
   179  struct Finalizer
   180  {
   181  	FuncVal *fn;
   182  	void *arg;
   183  	const struct __go_func_type *ft;
   184  	const PtrType *ot;
   185  };
   186  
   187  typedef struct FinBlock FinBlock;
   188  struct FinBlock
   189  {
   190  	FinBlock *alllink;
   191  	FinBlock *next;
   192  	int32 cnt;
   193  	int32 cap;
   194  	Finalizer fin[1];
   195  };
   196  
   197  static Lock	finlock;	// protects the following variables
   198  static FinBlock	*finq;		// list of finalizers that are to be executed
   199  static FinBlock	*finc;		// cache of free blocks
   200  static FinBlock	*allfin;	// list of all blocks
   201  bool	runtime_fingwait;
   202  bool	runtime_fingwake;
   203  
   204  static Lock	gclock;
   205  static G*	fing;
   206  
   207  static void	runfinq(void*);
   208  static void	bgsweep(void*);
   209  static Workbuf* getempty(Workbuf*);
   210  static Workbuf* getfull(Workbuf*);
   211  static void	putempty(Workbuf*);
   212  static Workbuf* handoff(Workbuf*);
   213  static void	gchelperstart(void);
   214  static void	flushallmcaches(void);
   215  static void	addstackroots(G *gp, Workbuf **wbufp);
   216  
   217  static struct {
   218  	uint64	full;  // lock-free list of full blocks
   219  	uint64	empty; // lock-free list of empty blocks
   220  	byte	pad0[CacheLineSize]; // prevents false-sharing between full/empty and nproc/nwait
   221  	uint32	nproc;
   222  	int64	tstart;
   223  	volatile uint32	nwait;
   224  	volatile uint32	ndone;
   225  	Note	alldone;
   226  	ParFor	*markfor;
   227  
   228  	Lock	lock;
   229  	byte	*chunk;
   230  	uintptr	nchunk;
   231  } work __attribute__((aligned(8)));
   232  
   233  enum {
   234  	GC_DEFAULT_PTR = GC_NUM_INSTR,
   235  	GC_CHAN,
   236  
   237  	GC_NUM_INSTR2
   238  };
   239  
   240  static struct {
   241  	struct {
   242  		uint64 sum;
   243  		uint64 cnt;
   244  	} ptr;
   245  	uint64 nbytes;
   246  	struct {
   247  		uint64 sum;
   248  		uint64 cnt;
   249  		uint64 notype;
   250  		uint64 typelookup;
   251  	} obj;
   252  	uint64 rescan;
   253  	uint64 rescanbytes;
   254  	uint64 instr[GC_NUM_INSTR2];
   255  	uint64 putempty;
   256  	uint64 getfull;
   257  	struct {
   258  		uint64 foundbit;
   259  		uint64 foundword;
   260  		uint64 foundspan;
   261  	} flushptrbuf;
   262  	struct {
   263  		uint64 foundbit;
   264  		uint64 foundword;
   265  		uint64 foundspan;
   266  	} markonly;
   267  	uint32 nbgsweep;
   268  	uint32 npausesweep;
   269  } gcstats;
   270  
   271  // markonly marks an object. It returns true if the object
   272  // has been marked by this function, false otherwise.
   273  // This function doesn't append the object to any buffer.
   274  static bool
   275  markonly(const void *obj)
   276  {
   277  	byte *p;
   278  	uintptr *bitp, bits, shift, x, xbits, off, j;
   279  	MSpan *s;
   280  	PageID k;
   281  
   282  	// Words outside the arena cannot be pointers.
   283  	if((const byte*)obj < runtime_mheap.arena_start || (const byte*)obj >= runtime_mheap.arena_used)
   284  		return false;
   285  
   286  	// obj may be a pointer to a live object.
   287  	// Try to find the beginning of the object.
   288  
   289  	// Round down to word boundary.
   290  	obj = (const void*)((uintptr)obj & ~((uintptr)PtrSize-1));
   291  
   292  	// Find bits for this word.
   293  	off = (const uintptr*)obj - (uintptr*)runtime_mheap.arena_start;
   294  	bitp = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1;
   295  	shift = off % wordsPerBitmapWord;
   296  	xbits = *bitp;
   297  	bits = xbits >> shift;
   298  
   299  	// Pointing at the beginning of a block?
   300  	if((bits & (bitAllocated|bitBlockBoundary)) != 0) {
   301  		if(CollectStats)
   302  			runtime_xadd64(&gcstats.markonly.foundbit, 1);
   303  		goto found;
   304  	}
   305  
   306  	// Pointing just past the beginning?
   307  	// Scan backward a little to find a block boundary.
   308  	for(j=shift; j-->0; ) {
   309  		if(((xbits>>j) & (bitAllocated|bitBlockBoundary)) != 0) {
   310  			shift = j;
   311  			bits = xbits>>shift;
   312  			if(CollectStats)
   313  				runtime_xadd64(&gcstats.markonly.foundword, 1);
   314  			goto found;
   315  		}
   316  	}
   317  
   318  	// Otherwise consult span table to find beginning.
   319  	// (Manually inlined copy of MHeap_LookupMaybe.)
   320  	k = (uintptr)obj>>PageShift;
   321  	x = k;
   322  	x -= (uintptr)runtime_mheap.arena_start>>PageShift;
   323  	s = runtime_mheap.spans[x];
   324  	if(s == nil || k < s->start || (const byte*)obj >= s->limit || s->state != MSpanInUse)
   325  		return false;
   326  	p = (byte*)((uintptr)s->start<<PageShift);
   327  	if(s->sizeclass == 0) {
   328  		obj = p;
   329  	} else {
   330  		uintptr size = s->elemsize;
   331  		int32 i = ((const byte*)obj - p)/size;
   332  		obj = p+i*size;
   333  	}
   334  
   335  	// Now that we know the object header, reload bits.
   336  	off = (const uintptr*)obj - (uintptr*)runtime_mheap.arena_start;
   337  	bitp = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1;
   338  	shift = off % wordsPerBitmapWord;
   339  	xbits = *bitp;
   340  	bits = xbits >> shift;
   341  	if(CollectStats)
   342  		runtime_xadd64(&gcstats.markonly.foundspan, 1);
   343  
   344  found:
   345  	// Now we have bits, bitp, and shift correct for
   346  	// obj pointing at the base of the object.
   347  	// Only care about allocated and not marked.
   348  	if((bits & (bitAllocated|bitMarked)) != bitAllocated)
   349  		return false;
   350  	if(work.nproc == 1)
   351  		*bitp |= bitMarked<<shift;
   352  	else {
   353  		for(;;) {
   354  			x = *bitp;
   355  			if(x & (bitMarked<<shift))
   356  				return false;
   357  			if(runtime_casp((void**)bitp, (void*)x, (void*)(x|(bitMarked<<shift))))
   358  				break;
   359  		}
   360  	}
   361  
   362  	// The object is now marked
   363  	return true;
   364  }
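        // A worked example of the bitmap addressing used above, assuming a 64-bit
        // target where wordsPerBitmapWord is 16: for obj at arena_start + 0x100,
        // off = 0x100/8 = 32 words, so its bits live in the bitmap word at
        // arena_start - 32/16 - 1 = arena_start - 3 (in word units; the bitmap
        // grows down from arena_start), at bit offset shift = 32%16 = 0.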
   365  
   366  // PtrTarget is a structure used by intermediate buffers.
   367  // The intermediate buffers hold GC data before it
   368  // is moved/flushed to the work buffer (Workbuf).
   369  // The size of an intermediate buffer is very small,
   370  // such as 32 or 64 elements.
   371  typedef struct PtrTarget PtrTarget;
   372  struct PtrTarget
   373  {
   374  	void *p;
   375  	uintptr ti;
   376  };
   377  
   378  typedef	struct Scanbuf Scanbuf;
   379  struct	Scanbuf
   380  {
   381  	struct {
   382  		PtrTarget *begin;
   383  		PtrTarget *end;
   384  		PtrTarget *pos;
   385  	} ptr;
   386  	struct {
   387  		Obj *begin;
   388  		Obj *end;
   389  		Obj *pos;
   390  	} obj;
   391  	Workbuf *wbuf;
   392  	Obj *wp;
   393  	uintptr nobj;
   394  };
   395  
   396  typedef struct BufferList BufferList;
   397  struct BufferList
   398  {
   399  	PtrTarget ptrtarget[IntermediateBufferCapacity];
   400  	Obj obj[IntermediateBufferCapacity];
   401  	uint32 busy;
   402  	byte pad[CacheLineSize];
   403  };
   404  static BufferList bufferList[MaxGcproc];
   405  
   406  static void enqueue(Obj obj, Workbuf **_wbuf, Obj **_wp, uintptr *_nobj);
   407  
   408  // flushptrbuf moves data from the PtrTarget buffer to the work buffer.
   409  // The PtrTarget buffer contains blocks irrespective of whether the blocks have been marked or scanned,
   410  // while the work buffer contains blocks which have been marked
   411  // and are prepared to be scanned by the garbage collector.
   412  //
   413  // _wp, _wbuf, _nobj are input/output parameters and are specifying the work buffer.
   414  //
    415  // A simplified drawing explaining how the todo-list moves from one structure to another:
   416  //
   417  //     scanblock
   418  //  (find pointers)
   419  //    Obj ------> PtrTarget (pointer targets)
   420  //     ↑          |
   421  //     |          |
   422  //     `----------'
   423  //     flushptrbuf
   424  //  (find block start, mark and enqueue)
   425  static void
   426  flushptrbuf(Scanbuf *sbuf)
   427  {
   428  	byte *p, *arena_start, *obj;
   429  	uintptr size, *bitp, bits, shift, j, x, xbits, off, nobj, ti, n;
   430  	MSpan *s;
   431  	PageID k;
   432  	Obj *wp;
   433  	Workbuf *wbuf;
   434  	PtrTarget *ptrbuf;
   435  	PtrTarget *ptrbuf_end;
   436  
   437  	arena_start = runtime_mheap.arena_start;
   438  
   439  	wp = sbuf->wp;
   440  	wbuf = sbuf->wbuf;
   441  	nobj = sbuf->nobj;
   442  
   443  	ptrbuf = sbuf->ptr.begin;
   444  	ptrbuf_end = sbuf->ptr.pos;
   445  	n = ptrbuf_end - sbuf->ptr.begin;
   446  	sbuf->ptr.pos = sbuf->ptr.begin;
   447  
   448  	if(CollectStats) {
   449  		runtime_xadd64(&gcstats.ptr.sum, n);
   450  		runtime_xadd64(&gcstats.ptr.cnt, 1);
   451  	}
   452  
   453  	// If buffer is nearly full, get a new one.
   454  	if(wbuf == nil || nobj+n >= nelem(wbuf->obj)) {
   455  		if(wbuf != nil)
   456  			wbuf->nobj = nobj;
   457  		wbuf = getempty(wbuf);
   458  		wp = wbuf->obj;
   459  		nobj = 0;
   460  
   461  		if(n >= nelem(wbuf->obj))
   462  			runtime_throw("ptrbuf has to be smaller than WorkBuf");
   463  	}
   464  
   465  	while(ptrbuf < ptrbuf_end) {
   466  		obj = ptrbuf->p;
   467  		ti = ptrbuf->ti;
   468  		ptrbuf++;
   469  
   470  		// obj belongs to interval [mheap.arena_start, mheap.arena_used).
   471  		if(Debug > 1) {
   472  			if(obj < runtime_mheap.arena_start || obj >= runtime_mheap.arena_used)
   473  				runtime_throw("object is outside of mheap");
   474  		}
   475  
   476  		// obj may be a pointer to a live object.
   477  		// Try to find the beginning of the object.
   478  
   479  		// Round down to word boundary.
   480  		if(((uintptr)obj & ((uintptr)PtrSize-1)) != 0) {
   481  			obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1));
   482  			ti = 0;
   483  		}
   484  
   485  		// Find bits for this word.
   486  		off = (uintptr*)obj - (uintptr*)arena_start;
   487  		bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
   488  		shift = off % wordsPerBitmapWord;
   489  		xbits = *bitp;
   490  		bits = xbits >> shift;
   491  
   492  		// Pointing at the beginning of a block?
   493  		if((bits & (bitAllocated|bitBlockBoundary)) != 0) {
   494  			if(CollectStats)
   495  				runtime_xadd64(&gcstats.flushptrbuf.foundbit, 1);
   496  			goto found;
   497  		}
   498  
   499  		ti = 0;
   500  
   501  		// Pointing just past the beginning?
   502  		// Scan backward a little to find a block boundary.
   503  		for(j=shift; j-->0; ) {
   504  			if(((xbits>>j) & (bitAllocated|bitBlockBoundary)) != 0) {
   505  				obj = (byte*)obj - (shift-j)*PtrSize;
   506  				shift = j;
   507  				bits = xbits>>shift;
   508  				if(CollectStats)
   509  					runtime_xadd64(&gcstats.flushptrbuf.foundword, 1);
   510  				goto found;
   511  			}
   512  		}
   513  
   514  		// Otherwise consult span table to find beginning.
   515  		// (Manually inlined copy of MHeap_LookupMaybe.)
   516  		k = (uintptr)obj>>PageShift;
   517  		x = k;
   518  		x -= (uintptr)arena_start>>PageShift;
   519  		s = runtime_mheap.spans[x];
   520  		if(s == nil || k < s->start || obj >= s->limit || s->state != MSpanInUse)
   521  			continue;
   522  		p = (byte*)((uintptr)s->start<<PageShift);
   523  		if(s->sizeclass == 0) {
   524  			obj = p;
   525  		} else {
   526  			size = s->elemsize;
   527  			int32 i = ((byte*)obj - p)/size;
   528  			obj = p+i*size;
   529  		}
   530  
   531  		// Now that we know the object header, reload bits.
   532  		off = (uintptr*)obj - (uintptr*)arena_start;
   533  		bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
   534  		shift = off % wordsPerBitmapWord;
   535  		xbits = *bitp;
   536  		bits = xbits >> shift;
   537  		if(CollectStats)
   538  			runtime_xadd64(&gcstats.flushptrbuf.foundspan, 1);
   539  
   540  	found:
   541  		// Now we have bits, bitp, and shift correct for
   542  		// obj pointing at the base of the object.
   543  		// Only care about allocated and not marked.
   544  		if((bits & (bitAllocated|bitMarked)) != bitAllocated)
   545  			continue;
   546  		if(work.nproc == 1)
   547  			*bitp |= bitMarked<<shift;
   548  		else {
   549  			for(;;) {
   550  				x = *bitp;
   551  				if(x & (bitMarked<<shift))
   552  					goto continue_obj;
   553  				if(runtime_casp((void**)bitp, (void*)x, (void*)(x|(bitMarked<<shift))))
   554  					break;
   555  			}
   556  		}
   557  
   558  		// If object has no pointers, don't need to scan further.
   559  		if((bits & bitScan) == 0)
   560  			continue;
   561  
   562  		// Ask span about size class.
   563  		// (Manually inlined copy of MHeap_Lookup.)
   564  		x = (uintptr)obj >> PageShift;
   565  		x -= (uintptr)arena_start>>PageShift;
   566  		s = runtime_mheap.spans[x];
   567  
   568  		PREFETCH(obj);
   569  
   570  		*wp = (Obj){obj, s->elemsize, ti};
   571  		wp++;
   572  		nobj++;
   573  	continue_obj:;
   574  	}
   575  
   576  	// If another proc wants a pointer, give it some.
   577  	if(work.nwait > 0 && nobj > handoffThreshold && work.full == 0) {
   578  		wbuf->nobj = nobj;
   579  		wbuf = handoff(wbuf);
   580  		nobj = wbuf->nobj;
   581  		wp = wbuf->obj + nobj;
   582  	}
   583  
   584  	sbuf->wp = wp;
   585  	sbuf->wbuf = wbuf;
   586  	sbuf->nobj = nobj;
   587  }
   588  
   589  static void
   590  flushobjbuf(Scanbuf *sbuf)
   591  {
   592  	uintptr nobj, off;
   593  	Obj *wp, obj;
   594  	Workbuf *wbuf;
   595  	Obj *objbuf;
   596  	Obj *objbuf_end;
   597  
   598  	wp = sbuf->wp;
   599  	wbuf = sbuf->wbuf;
   600  	nobj = sbuf->nobj;
   601  
   602  	objbuf = sbuf->obj.begin;
   603  	objbuf_end = sbuf->obj.pos;
   604  	sbuf->obj.pos = sbuf->obj.begin;
   605  
   606  	while(objbuf < objbuf_end) {
   607  		obj = *objbuf++;
   608  
   609  		// Align obj.b to a word boundary.
   610  		off = (uintptr)obj.p & (PtrSize-1);
   611  		if(off != 0) {
   612  			obj.p += PtrSize - off;
   613  			obj.n -= PtrSize - off;
   614  			obj.ti = 0;
   615  		}
   616  
   617  		if(obj.p == nil || obj.n == 0)
   618  			continue;
   619  
   620  		// If buffer is full, get a new one.
   621  		if(wbuf == nil || nobj >= nelem(wbuf->obj)) {
   622  			if(wbuf != nil)
   623  				wbuf->nobj = nobj;
   624  			wbuf = getempty(wbuf);
   625  			wp = wbuf->obj;
   626  			nobj = 0;
   627  		}
   628  
   629  		*wp = obj;
   630  		wp++;
   631  		nobj++;
   632  	}
   633  
   634  	// If another proc wants a pointer, give it some.
   635  	if(work.nwait > 0 && nobj > handoffThreshold && work.full == 0) {
   636  		wbuf->nobj = nobj;
   637  		wbuf = handoff(wbuf);
   638  		nobj = wbuf->nobj;
   639  		wp = wbuf->obj + nobj;
   640  	}
   641  
   642  	sbuf->wp = wp;
   643  	sbuf->wbuf = wbuf;
   644  	sbuf->nobj = nobj;
   645  }
   646  
   647  // Program that scans the whole block and treats every block element as a potential pointer
   648  static uintptr defaultProg[2] = {PtrSize, GC_DEFAULT_PTR};
   649  
   650  // Hchan program
   651  static uintptr chanProg[2] = {0, GC_CHAN};
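        //
        // Layout of these programs, as interpreted by scanblock below: word 0 is
        // the element size, followed by instructions and their operands. For
        // illustration only (the struct, its offset, and the type-info word objti
        // are hypothetical), a program for a struct whose only pointer field is at
        // offset 0 could look like:
        //
        //	static uintptr exampleProg[] = {sizeof(Example), GC_PTR, 0, objti, GC_END};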
   652  
   653  // Local variables of a program fragment or loop
   654  typedef struct Frame Frame;
   655  struct Frame {
   656  	uintptr count, elemsize, b;
   657  	const uintptr *loop_or_ret;
   658  };
   659  
   660  // Sanity check for the derived type info objti.
   661  static void
   662  checkptr(void *obj, uintptr objti)
   663  {
   664  	uintptr *pc1, type, tisize, i, j, x;
   665  	const uintptr *pc2;
   666  	byte *objstart;
   667  	Type *t;
   668  	MSpan *s;
   669  
   670  	if(!Debug)
   671  		runtime_throw("checkptr is debug only");
   672  
   673  	if((byte*)obj < runtime_mheap.arena_start || (byte*)obj >= runtime_mheap.arena_used)
   674  		return;
   675  	type = runtime_gettype(obj);
   676  	t = (Type*)(type & ~(uintptr)(PtrSize-1));
   677  	if(t == nil)
   678  		return;
   679  	x = (uintptr)obj >> PageShift;
   680  	x -= (uintptr)(runtime_mheap.arena_start)>>PageShift;
   681  	s = runtime_mheap.spans[x];
   682  	objstart = (byte*)((uintptr)s->start<<PageShift);
   683  	if(s->sizeclass != 0) {
   684  		i = ((byte*)obj - objstart)/s->elemsize;
   685  		objstart += i*s->elemsize;
   686  	}
   687  	tisize = *(uintptr*)objti;
   688  	// Sanity check for object size: it should fit into the memory block.
   689  	if((byte*)obj + tisize > objstart + s->elemsize) {
   690  		runtime_printf("object of type '%S' at %p/%p does not fit in block %p/%p\n",
   691  			       *t->string, obj, tisize, objstart, s->elemsize);
   692  		runtime_throw("invalid gc type info");
   693  	}
   694  	if(obj != objstart)
   695  		return;
   696  	// If obj points to the beginning of the memory block,
   697  	// check type info as well.
   698  	if(t->string == nil ||
   699  		// Gob allocates unsafe pointers for indirection.
   700  		(runtime_strcmp((const char *)t->string->str, (const char*)"unsafe.Pointer") &&
   701  		// Runtime and gc think differently about closures.
   702  		 runtime_strstr((const char *)t->string->str, (const char*)"struct { F uintptr") != (const char *)t->string->str)) {
   703  		pc1 = (uintptr*)objti;
   704  		pc2 = (const uintptr*)t->__gc;
   705  		// A simple best-effort check until first GC_END.
   706  		for(j = 1; pc1[j] != GC_END && pc2[j] != GC_END; j++) {
   707  			if(pc1[j] != pc2[j]) {
   708  				runtime_printf("invalid gc type info for '%s', type info %p [%d]=%p, block info %p [%d]=%p\n",
   709  					       t->string ? (const int8*)t->string->str : (const int8*)"?", pc1, (int32)j, pc1[j], pc2, (int32)j, pc2[j]);
   710  				runtime_throw("invalid gc type info");
   711  			}
   712  		}
   713  	}
    714  }
   715  
   716  // scanblock scans a block of n bytes starting at pointer b for references
   717  // to other objects, scanning any it finds recursively until there are no
   718  // unscanned objects left.  Instead of using an explicit recursion, it keeps
   719  // a work list in the Workbuf* structures and loops in the main function
   720  // body.  Keeping an explicit work list is easier on the stack allocator and
   721  // more efficient.
   722  static void
   723  scanblock(Workbuf *wbuf, bool keepworking)
   724  {
   725  	byte *b, *arena_start, *arena_used;
   726  	uintptr n, i, end_b, elemsize, size, ti, objti, count, type, nobj;
   727  	uintptr precise_type, nominal_size;
   728  	const uintptr *pc, *chan_ret;
   729  	uintptr chancap;
   730  	void *obj;
   731  	const Type *t, *et;
   732  	Slice *sliceptr;
   733  	String *stringptr;
   734  	Frame *stack_ptr, stack_top, stack[GC_STACK_CAPACITY+4];
   735  	BufferList *scanbuffers;
   736  	Scanbuf sbuf;
   737  	Eface *eface;
   738  	Iface *iface;
   739  	Hchan *chan;
   740  	const ChanType *chantype;
   741  	Obj *wp;
   742  
   743  	if(sizeof(Workbuf) % WorkbufSize != 0)
   744  		runtime_throw("scanblock: size of Workbuf is suboptimal");
   745  
   746  	// Memory arena parameters.
   747  	arena_start = runtime_mheap.arena_start;
   748  	arena_used = runtime_mheap.arena_used;
   749  
   750  	stack_ptr = stack+nelem(stack)-1;
   751  
   752  	precise_type = false;
   753  	nominal_size = 0;
   754  
   755  	if(wbuf) {
   756  		nobj = wbuf->nobj;
   757  		wp = &wbuf->obj[nobj];
   758  	} else {
   759  		nobj = 0;
   760  		wp = nil;
   761  	}
   762  
   763  	// Initialize sbuf
   764  	scanbuffers = &bufferList[runtime_m()->helpgc];
   765  
   766  	sbuf.ptr.begin = sbuf.ptr.pos = &scanbuffers->ptrtarget[0];
   767  	sbuf.ptr.end = sbuf.ptr.begin + nelem(scanbuffers->ptrtarget);
   768  
   769  	sbuf.obj.begin = sbuf.obj.pos = &scanbuffers->obj[0];
   770  	sbuf.obj.end = sbuf.obj.begin + nelem(scanbuffers->obj);
   771  
   772  	sbuf.wbuf = wbuf;
   773  	sbuf.wp = wp;
   774  	sbuf.nobj = nobj;
   775  
   776  	// (Silence the compiler)
   777  	chan = nil;
   778  	chantype = nil;
   779  	chan_ret = nil;
   780  
   781  	goto next_block;
   782  
   783  	for(;;) {
   784  		// Each iteration scans the block b of length n, queueing pointers in
   785  		// the work buffer.
   786  
   787  		if(CollectStats) {
   788  			runtime_xadd64(&gcstats.nbytes, n);
   789  			runtime_xadd64(&gcstats.obj.sum, sbuf.nobj);
   790  			runtime_xadd64(&gcstats.obj.cnt, 1);
   791  		}
   792  
   793  		if(ti != 0) {
   794  			if(Debug > 1) {
   795  				runtime_printf("scanblock %p %D ti %p\n", b, (int64)n, ti);
   796  			}
   797  			pc = (uintptr*)(ti & ~(uintptr)PC_BITS);
   798  			precise_type = (ti & PRECISE);
   799  			stack_top.elemsize = pc[0];
   800  			if(!precise_type)
   801  				nominal_size = pc[0];
   802  			if(ti & LOOP) {
   803  				stack_top.count = 0;	// 0 means an infinite number of iterations
   804  				stack_top.loop_or_ret = pc+1;
   805  			} else {
   806  				stack_top.count = 1;
   807  			}
   808  			if(Debug) {
   809  				// Simple sanity check for provided type info ti:
    810  				// The declared size of the object must not be larger than the actual size
   811  				// (it can be smaller due to inferior pointers).
   812  				// It's difficult to make a comprehensive check due to inferior pointers,
   813  				// reflection, gob, etc.
   814  				if(pc[0] > n) {
   815  					runtime_printf("invalid gc type info: type info size %p, block size %p\n", pc[0], n);
   816  					runtime_throw("invalid gc type info");
   817  				}
   818  			}
   819  		} else if(UseSpanType) {
   820  			if(CollectStats)
   821  				runtime_xadd64(&gcstats.obj.notype, 1);
   822  
   823  			type = runtime_gettype(b);
   824  			if(type != 0) {
   825  				if(CollectStats)
   826  					runtime_xadd64(&gcstats.obj.typelookup, 1);
   827  
   828  				t = (Type*)(type & ~(uintptr)(PtrSize-1));
   829  				switch(type & (PtrSize-1)) {
   830  				case TypeInfo_SingleObject:
   831  					pc = (const uintptr*)t->__gc;
   832  					precise_type = true;  // type information about 'b' is precise
   833  					stack_top.count = 1;
   834  					stack_top.elemsize = pc[0];
   835  					break;
   836  				case TypeInfo_Array:
   837  					pc = (const uintptr*)t->__gc;
   838  					if(pc[0] == 0)
   839  						goto next_block;
   840  					precise_type = true;  // type information about 'b' is precise
   841  					stack_top.count = 0;  // 0 means an infinite number of iterations
   842  					stack_top.elemsize = pc[0];
   843  					stack_top.loop_or_ret = pc+1;
   844  					break;
   845  				case TypeInfo_Chan:
   846  					chan = (Hchan*)b;
   847  					chantype = (const ChanType*)t;
   848  					chan_ret = nil;
   849  					pc = chanProg;
   850  					break;
   851  				default:
   852  					if(Debug > 1)
   853  						runtime_printf("scanblock %p %D type %p %S\n", b, (int64)n, type, *t->string);
   854  					runtime_throw("scanblock: invalid type");
   855  					return;
   856  				}
   857  				if(Debug > 1)
   858  					runtime_printf("scanblock %p %D type %p %S pc=%p\n", b, (int64)n, type, *t->string, pc);
   859  			} else {
   860  				pc = defaultProg;
   861  				if(Debug > 1)
   862  					runtime_printf("scanblock %p %D unknown type\n", b, (int64)n);
   863  			}
   864  		} else {
   865  			pc = defaultProg;
   866  			if(Debug > 1)
   867  				runtime_printf("scanblock %p %D no span types\n", b, (int64)n);
   868  		}
   869  
   870  		if(IgnorePreciseGC)
   871  			pc = defaultProg;
   872  
   873  		pc++;
   874  		stack_top.b = (uintptr)b;
   875  		end_b = (uintptr)b + n - PtrSize;
   876  
   877  	for(;;) {
   878  		if(CollectStats)
   879  			runtime_xadd64(&gcstats.instr[pc[0]], 1);
   880  
   881  		obj = nil;
   882  		objti = 0;
   883  		switch(pc[0]) {
   884  		case GC_PTR:
   885  			obj = *(void**)(stack_top.b + pc[1]);
   886  			objti = pc[2];
   887  			if(Debug > 2)
   888  				runtime_printf("gc_ptr @%p: %p ti=%p\n", stack_top.b+pc[1], obj, objti);
   889  			pc += 3;
   890  			if(Debug)
   891  				checkptr(obj, objti);
   892  			break;
   893  
   894  		case GC_SLICE:
   895  			sliceptr = (Slice*)(stack_top.b + pc[1]);
   896  			if(Debug > 2)
   897  				runtime_printf("gc_slice @%p: %p/%D/%D\n", sliceptr, sliceptr->array, (int64)sliceptr->__count, (int64)sliceptr->cap);
   898  			if(sliceptr->cap != 0) {
   899  				obj = sliceptr->array;
   900  				// Can't use slice element type for scanning,
   901  				// because if it points to an array embedded
   902  				// in the beginning of a struct,
   903  				// we will scan the whole struct as the slice.
   904  				// So just obtain type info from heap.
   905  			}
   906  			pc += 3;
   907  			break;
   908  
   909  		case GC_APTR:
   910  			obj = *(void**)(stack_top.b + pc[1]);
   911  			if(Debug > 2)
   912  				runtime_printf("gc_aptr @%p: %p\n", stack_top.b+pc[1], obj);
   913  			pc += 2;
   914  			break;
   915  
   916  		case GC_STRING:
   917  			stringptr = (String*)(stack_top.b + pc[1]);
   918  			if(Debug > 2)
   919  				runtime_printf("gc_string @%p: %p/%D\n", stack_top.b+pc[1], stringptr->str, (int64)stringptr->len);
   920  			if(stringptr->len != 0)
   921  				markonly(stringptr->str);
   922  			pc += 2;
   923  			continue;
   924  
   925  		case GC_EFACE:
   926  			eface = (Eface*)(stack_top.b + pc[1]);
   927  			pc += 2;
   928  			if(Debug > 2)
   929  				runtime_printf("gc_eface @%p: %p %p\n", stack_top.b+pc[1], eface->__type_descriptor, eface->__object);
   930  			if(eface->__type_descriptor == nil)
   931  				continue;
   932  
   933  			// eface->type
   934  			t = eface->__type_descriptor;
   935  			if((const byte*)t >= arena_start && (const byte*)t < arena_used) {
   936  				union { const Type *tc; Type *tr; } u;
   937  				u.tc = t;
   938  				*sbuf.ptr.pos++ = (PtrTarget){u.tr, 0};
   939  				if(sbuf.ptr.pos == sbuf.ptr.end)
   940  					flushptrbuf(&sbuf);
   941  			}
   942  
   943  			// eface->__object
   944  			if((byte*)eface->__object >= arena_start && (byte*)eface->__object < arena_used) {
   945  				if(__go_is_pointer_type(t)) {
   946  					if((t->__code & KindNoPointers))
   947  						continue;
   948  
   949  					obj = eface->__object;
   950  					if((t->__code & kindMask) == KindPtr) {
   951  						// Only use type information if it is a pointer-containing type.
   952  						// This matches the GC programs written by cmd/gc/reflect.c's
   953  						// dgcsym1 in case TPTR32/case TPTR64. See rationale there.
   954  						et = ((const PtrType*)t)->elem;
   955  						if(!(et->__code & KindNoPointers))
   956  							objti = (uintptr)((const PtrType*)t)->elem->__gc;
   957  					}
   958  				} else {
   959  					obj = eface->__object;
   960  					objti = (uintptr)t->__gc;
   961  				}
   962  			}
   963  			break;
   964  
   965  		case GC_IFACE:
   966  			iface = (Iface*)(stack_top.b + pc[1]);
   967  			pc += 2;
   968  			if(Debug > 2)
   969  				runtime_printf("gc_iface @%p: %p/%p %p\n", stack_top.b+pc[1], iface->__methods[0], nil, iface->__object);
   970  			if(iface->tab == nil)
   971  				continue;
   972  			
   973  			// iface->tab
   974  			if((byte*)iface->tab >= arena_start && (byte*)iface->tab < arena_used) {
   975  				*sbuf.ptr.pos++ = (PtrTarget){iface->tab, 0};
   976  				if(sbuf.ptr.pos == sbuf.ptr.end)
   977  					flushptrbuf(&sbuf);
   978  			}
   979  
   980  			// iface->data
   981  			if((byte*)iface->__object >= arena_start && (byte*)iface->__object < arena_used) {
   982  				t = (const Type*)iface->tab[0];
   983  				if(__go_is_pointer_type(t)) {
   984  					if((t->__code & KindNoPointers))
   985  						continue;
   986  
   987  					obj = iface->__object;
   988  					if((t->__code & kindMask) == KindPtr) {
   989  						// Only use type information if it is a pointer-containing type.
   990  						// This matches the GC programs written by cmd/gc/reflect.c's
   991  						// dgcsym1 in case TPTR32/case TPTR64. See rationale there.
   992  						et = ((const PtrType*)t)->elem;
   993  						if(!(et->__code & KindNoPointers))
   994  							objti = (uintptr)((const PtrType*)t)->elem->__gc;
   995  					}
   996  				} else {
   997  					obj = iface->__object;
   998  					objti = (uintptr)t->__gc;
   999  				}
  1000  			}
  1001  			break;
  1002  
  1003  		case GC_DEFAULT_PTR:
  1004  			while(stack_top.b <= end_b) {
  1005  				obj = *(byte**)stack_top.b;
  1006  				if(Debug > 2)
  1007  					runtime_printf("gc_default_ptr @%p: %p\n", stack_top.b, obj);
  1008  				stack_top.b += PtrSize;
  1009  				if((byte*)obj >= arena_start && (byte*)obj < arena_used) {
  1010  					*sbuf.ptr.pos++ = (PtrTarget){obj, 0};
  1011  					if(sbuf.ptr.pos == sbuf.ptr.end)
  1012  						flushptrbuf(&sbuf);
  1013  				}
  1014  			}
  1015  			goto next_block;
  1016  
  1017  		case GC_END:
  1018  			if(--stack_top.count != 0) {
  1019  				// Next iteration of a loop if possible.
  1020  				stack_top.b += stack_top.elemsize;
  1021  				if(stack_top.b + stack_top.elemsize <= end_b+PtrSize) {
  1022  					pc = stack_top.loop_or_ret;
  1023  					continue;
  1024  				}
  1025  				i = stack_top.b;
  1026  			} else {
  1027  				// Stack pop if possible.
  1028  				if(stack_ptr+1 < stack+nelem(stack)) {
  1029  					pc = stack_top.loop_or_ret;
  1030  					stack_top = *(++stack_ptr);
  1031  					continue;
  1032  				}
  1033  				i = (uintptr)b + nominal_size;
  1034  			}
  1035  			if(!precise_type) {
  1036  				// Quickly scan [b+i,b+n) for possible pointers.
  1037  				for(; i<=end_b; i+=PtrSize) {
  1038  					if(*(byte**)i != nil) {
  1039  						// Found a value that may be a pointer.
  1040  						// Do a rescan of the entire block.
  1041  						enqueue((Obj){b, n, 0}, &sbuf.wbuf, &sbuf.wp, &sbuf.nobj);
  1042  						if(CollectStats) {
  1043  							runtime_xadd64(&gcstats.rescan, 1);
  1044  							runtime_xadd64(&gcstats.rescanbytes, n);
  1045  						}
  1046  						break;
  1047  					}
  1048  				}
  1049  			}
  1050  			goto next_block;
  1051  
  1052  		case GC_ARRAY_START:
  1053  			i = stack_top.b + pc[1];
  1054  			count = pc[2];
  1055  			elemsize = pc[3];
  1056  			pc += 4;
  1057  
  1058  			// Stack push.
  1059  			*stack_ptr-- = stack_top;
  1060  			stack_top = (Frame){count, elemsize, i, pc};
  1061  			continue;
  1062  
  1063  		case GC_ARRAY_NEXT:
  1064  			if(--stack_top.count != 0) {
  1065  				stack_top.b += stack_top.elemsize;
  1066  				pc = stack_top.loop_or_ret;
  1067  			} else {
  1068  				// Stack pop.
  1069  				stack_top = *(++stack_ptr);
  1070  				pc += 1;
  1071  			}
  1072  			continue;
  1073  
  1074  		case GC_CALL:
  1075  			// Stack push.
  1076  			*stack_ptr-- = stack_top;
  1077  			stack_top = (Frame){1, 0, stack_top.b + pc[1], pc+3 /*return address*/};
  1078  			pc = (const uintptr*)((const byte*)pc + *(const int32*)(pc+2));  // target of the CALL instruction
  1079  			continue;
  1080  
  1081  		case GC_REGION:
  1082  			obj = (void*)(stack_top.b + pc[1]);
  1083  			size = pc[2];
  1084  			objti = pc[3];
  1085  			pc += 4;
  1086  
  1087  			if(Debug > 2)
  1088  				runtime_printf("gc_region @%p: %D %p\n", stack_top.b+pc[1], (int64)size, objti);
  1089  			*sbuf.obj.pos++ = (Obj){obj, size, objti};
  1090  			if(sbuf.obj.pos == sbuf.obj.end)
  1091  				flushobjbuf(&sbuf);
  1092  			continue;
  1093  
  1094  		case GC_CHAN_PTR:
  1095  			chan = *(Hchan**)(stack_top.b + pc[1]);
  1096  			if(Debug > 2 && chan != nil)
  1097  				runtime_printf("gc_chan_ptr @%p: %p/%D/%D %p\n", stack_top.b+pc[1], chan, (int64)chan->qcount, (int64)chan->dataqsiz, pc[2]);
  1098  			if(chan == nil) {
  1099  				pc += 3;
  1100  				continue;
  1101  			}
  1102  			if(markonly(chan)) {
  1103  				chantype = (ChanType*)pc[2];
  1104  				if(!(chantype->elem->__code & KindNoPointers)) {
  1105  					// Start chanProg.
  1106  					chan_ret = pc+3;
  1107  					pc = chanProg+1;
  1108  					continue;
  1109  				}
  1110  			}
  1111  			pc += 3;
  1112  			continue;
  1113  
  1114  		case GC_CHAN:
  1115  			// There are no heap pointers in struct Hchan,
  1116  			// so we can ignore the leading sizeof(Hchan) bytes.
  1117  			if(!(chantype->elem->__code & KindNoPointers)) {
  1118  				// Channel's buffer follows Hchan immediately in memory.
  1119  				// Size of buffer (cap(c)) is second int in the chan struct.
  1120  				chancap = ((uintgo*)chan)[1];
  1121  				if(chancap > 0) {
  1122  					// TODO(atom): split into two chunks so that only the
  1123  					// in-use part of the circular buffer is scanned.
  1124  					// (Channel routines zero the unused part, so the current
  1125  					// code does not lead to leaks, it's just a little inefficient.)
  1126  					*sbuf.obj.pos++ = (Obj){(byte*)chan+runtime_Hchansize, chancap*chantype->elem->__size,
  1127  						(uintptr)chantype->elem->__gc | PRECISE | LOOP};
  1128  					if(sbuf.obj.pos == sbuf.obj.end)
  1129  						flushobjbuf(&sbuf);
  1130  				}
  1131  			}
  1132  			if(chan_ret == nil)
  1133  				goto next_block;
  1134  			pc = chan_ret;
  1135  			continue;
  1136  
  1137  		default:
  1138  			runtime_printf("runtime: invalid GC instruction %p at %p\n", pc[0], pc);
  1139  			runtime_throw("scanblock: invalid GC instruction");
  1140  			return;
  1141  		}
  1142  
  1143  		if((byte*)obj >= arena_start && (byte*)obj < arena_used) {
  1144  			*sbuf.ptr.pos++ = (PtrTarget){obj, objti};
  1145  			if(sbuf.ptr.pos == sbuf.ptr.end)
  1146  				flushptrbuf(&sbuf);
  1147  		}
  1148  	}
  1149  
  1150  	next_block:
  1151  		// Done scanning [b, b+n).  Prepare for the next iteration of
  1152  		// the loop by setting b, n, ti to the parameters for the next block.
  1153  
  1154  		if(sbuf.nobj == 0) {
  1155  			flushptrbuf(&sbuf);
  1156  			flushobjbuf(&sbuf);
  1157  
  1158  			if(sbuf.nobj == 0) {
  1159  				if(!keepworking) {
  1160  					if(sbuf.wbuf)
  1161  						putempty(sbuf.wbuf);
  1162  					return;
  1163  				}
  1164  				// Emptied our buffer: refill.
  1165  				sbuf.wbuf = getfull(sbuf.wbuf);
  1166  				if(sbuf.wbuf == nil)
  1167  					return;
  1168  				sbuf.nobj = sbuf.wbuf->nobj;
  1169  				sbuf.wp = sbuf.wbuf->obj + sbuf.wbuf->nobj;
  1170  			}
  1171  		}
  1172  
  1173  		// Fetch b from the work buffer.
  1174  		--sbuf.wp;
  1175  		b = sbuf.wp->p;
  1176  		n = sbuf.wp->n;
  1177  		ti = sbuf.wp->ti;
  1178  		sbuf.nobj--;
  1179  	}
  1180  }
  1181  
  1182  static struct root_list* roots;
  1183  
  1184  void
  1185  __go_register_gc_roots (struct root_list* r)
  1186  {
  1187  	// FIXME: This needs locking if multiple goroutines can call
  1188  	// dlopen simultaneously.
  1189  	r->next = roots;
  1190  	roots = r;
  1191  }
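        // Illustrative use (normally this call is emitted by the compiler for each
        // package; the exact layout of struct root_list is declared in runtime.h,
        // and the names below are made up). The roots array is terminated by an
        // entry with a nil decl, which is what the RootData case in markroot
        // below relies on:
        //
        //	static struct root_list example_roots = {
        //		nil,
        //		{ { &some_global, sizeof some_global }, { nil, 0 } },
        //	};
        //
        //	__go_register_gc_roots(&example_roots);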
  1192  
  1193  // Append obj to the work buffer.
  1194  // _wbuf, _wp, _nobj are input/output parameters and are specifying the work buffer.
  1195  static void
  1196  enqueue(Obj obj, Workbuf **_wbuf, Obj **_wp, uintptr *_nobj)
  1197  {
  1198  	uintptr nobj, off;
  1199  	Obj *wp;
  1200  	Workbuf *wbuf;
  1201  
  1202  	if(Debug > 1)
  1203  		runtime_printf("append obj(%p %D %p)\n", obj.p, (int64)obj.n, obj.ti);
  1204  
  1205  	// Align obj.b to a word boundary.
  1206  	off = (uintptr)obj.p & (PtrSize-1);
  1207  	if(off != 0) {
  1208  		obj.p += PtrSize - off;
  1209  		obj.n -= PtrSize - off;
  1210  		obj.ti = 0;
  1211  	}
  1212  
  1213  	if(obj.p == nil || obj.n == 0)
  1214  		return;
  1215  
  1216  	// Load work buffer state
  1217  	wp = *_wp;
  1218  	wbuf = *_wbuf;
  1219  	nobj = *_nobj;
  1220  
  1221  	// If another proc wants a pointer, give it some.
  1222  	if(work.nwait > 0 && nobj > handoffThreshold && work.full == 0) {
  1223  		wbuf->nobj = nobj;
  1224  		wbuf = handoff(wbuf);
  1225  		nobj = wbuf->nobj;
  1226  		wp = wbuf->obj + nobj;
  1227  	}
  1228  
  1229  	// If buffer is full, get a new one.
  1230  	if(wbuf == nil || nobj >= nelem(wbuf->obj)) {
  1231  		if(wbuf != nil)
  1232  			wbuf->nobj = nobj;
  1233  		wbuf = getempty(wbuf);
  1234  		wp = wbuf->obj;
  1235  		nobj = 0;
  1236  	}
  1237  
  1238  	*wp = obj;
  1239  	wp++;
  1240  	nobj++;
  1241  
  1242  	// Save work buffer state
  1243  	*_wp = wp;
  1244  	*_wbuf = wbuf;
  1245  	*_nobj = nobj;
  1246  }
  1247  
  1248  static void
  1249  enqueue1(Workbuf **wbufp, Obj obj)
  1250  {
  1251  	Workbuf *wbuf;
  1252  
  1253  	wbuf = *wbufp;
  1254  	if(wbuf->nobj >= nelem(wbuf->obj))
  1255  		*wbufp = wbuf = getempty(wbuf);
  1256  	wbuf->obj[wbuf->nobj++] = obj;
  1257  }
  1258  
  1259  static void
  1260  markroot(ParFor *desc, uint32 i)
  1261  {
  1262  	Workbuf *wbuf;
  1263  	FinBlock *fb;
  1264  	MHeap *h;
  1265  	MSpan **allspans, *s;
  1266  	uint32 spanidx, sg;
  1267  	G *gp;
  1268  	void *p;
  1269  
  1270  	USED(&desc);
  1271  	wbuf = getempty(nil);
  1272  	// Note: if you add a case here, please also update heapdump.c:dumproots.
  1273  	switch(i) {
  1274  	case RootData:
  1275  		// For gccgo this is both data and bss.
  1276  		{
  1277  			struct root_list *pl;
  1278  
  1279  			for(pl = roots; pl != nil; pl = pl->next) {
  1280  				struct root *pr = &pl->roots[0];
  1281  				while(1) {
  1282  					void *decl = pr->decl;
  1283  					if(decl == nil)
  1284  						break;
  1285  					enqueue1(&wbuf, (Obj){decl, pr->size, 0});
  1286  					pr++;
  1287  				}
  1288  			}
  1289  		}
  1290  		break;
  1291  
  1292  	case RootBss:
  1293  		// For gccgo we use this for all the other global roots.
  1294  		enqueue1(&wbuf, (Obj){(byte*)&runtime_m0, sizeof runtime_m0, 0});
  1295  		enqueue1(&wbuf, (Obj){(byte*)&runtime_g0, sizeof runtime_g0, 0});
  1296  		enqueue1(&wbuf, (Obj){(byte*)&runtime_allg, sizeof runtime_allg, 0});
  1297  		enqueue1(&wbuf, (Obj){(byte*)&runtime_allm, sizeof runtime_allm, 0});
  1298  		enqueue1(&wbuf, (Obj){(byte*)&runtime_allp, sizeof runtime_allp, 0});
  1299  		enqueue1(&wbuf, (Obj){(byte*)&work, sizeof work, 0});
  1300  		runtime_proc_scan(&wbuf, enqueue1);
  1301  		runtime_MProf_Mark(&wbuf, enqueue1);
  1302  		runtime_time_scan(&wbuf, enqueue1);
  1303  		runtime_netpoll_scan(&wbuf, enqueue1);
  1304  		break;
  1305  
  1306  	case RootFinalizers:
  1307  		for(fb=allfin; fb; fb=fb->alllink)
  1308  			enqueue1(&wbuf, (Obj){(byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]), 0});
  1309  		break;
  1310  
  1311  	case RootSpanTypes:
  1312  		// mark span types and MSpan.specials (to walk spans only once)
  1313  		h = &runtime_mheap;
  1314  		sg = h->sweepgen;
  1315  		allspans = h->allspans;
  1316  		for(spanidx=0; spanidx<runtime_mheap.nspan; spanidx++) {
  1317  			Special *sp;
  1318  			SpecialFinalizer *spf;
  1319  
  1320  			s = allspans[spanidx];
  1321  			if(s->sweepgen != sg) {
  1322  				runtime_printf("sweep %d %d\n", s->sweepgen, sg);
  1323  				runtime_throw("gc: unswept span");
  1324  			}
  1325  			if(s->state != MSpanInUse)
  1326  				continue;
  1327  			// The garbage collector ignores type pointers stored in MSpan.types:
  1328  			//  - Compiler-generated types are stored outside of heap.
  1329  			//  - The reflect package has runtime-generated types cached in its data structures.
  1330  			//    The garbage collector relies on finding the references via that cache.
  1331  			if(s->types.compression == MTypes_Words || s->types.compression == MTypes_Bytes)
  1332  				markonly((byte*)s->types.data);
  1333  			for(sp = s->specials; sp != nil; sp = sp->next) {
  1334  				if(sp->kind != KindSpecialFinalizer)
  1335  					continue;
  1336  				// don't mark finalized object, but scan it so we
  1337  				// retain everything it points to.
  1338  				spf = (SpecialFinalizer*)sp;
  1339  				// A finalizer can be set for an inner byte of an object; find the object's beginning.
  1340  				p = (void*)((s->start << PageShift) + spf->special.offset/s->elemsize*s->elemsize);
  1341  				enqueue1(&wbuf, (Obj){p, s->elemsize, 0});
  1342  				enqueue1(&wbuf, (Obj){(void*)&spf->fn, PtrSize, 0});
  1343  				enqueue1(&wbuf, (Obj){(void*)&spf->ft, PtrSize, 0});
  1344  				enqueue1(&wbuf, (Obj){(void*)&spf->ot, PtrSize, 0});
  1345  			}
  1346  		}
  1347  		break;
  1348  
  1349  	case RootFlushCaches:
  1350  		flushallmcaches();
  1351  		break;
  1352  
  1353  	default:
  1354  		// the rest is scanning goroutine stacks
  1355  		if(i - RootCount >= runtime_allglen)
  1356  			runtime_throw("markroot: bad index");
  1357  		gp = runtime_allg[i - RootCount];
  1358  		// remember when we've first observed the G blocked
  1359  		// needed only to output in traceback
  1360  		if((gp->status == Gwaiting || gp->status == Gsyscall) && gp->waitsince == 0)
  1361  			gp->waitsince = work.tstart;
  1362  		addstackroots(gp, &wbuf);
  1363  		break;
  1364  		
  1365  	}
  1366  
  1367  	if(wbuf)
  1368  		scanblock(wbuf, false);
  1369  }
  1370  
  1371  // Get an empty work buffer off the work.empty list,
  1372  // allocating new buffers as needed.
  1373  static Workbuf*
  1374  getempty(Workbuf *b)
  1375  {
  1376  	if(b != nil)
  1377  		runtime_lfstackpush(&work.full, &b->node);
  1378  	b = (Workbuf*)runtime_lfstackpop(&work.empty);
  1379  	if(b == nil) {
  1380  		// Need to allocate.
  1381  		runtime_lock(&work.lock);
  1382  		if(work.nchunk < sizeof *b) {
  1383  			work.nchunk = 1<<20;
  1384  			work.chunk = runtime_SysAlloc(work.nchunk, &mstats.gc_sys);
  1385  			if(work.chunk == nil)
  1386  				runtime_throw("runtime: cannot allocate memory");
  1387  		}
  1388  		b = (Workbuf*)work.chunk;
  1389  		work.chunk += sizeof *b;
  1390  		work.nchunk -= sizeof *b;
  1391  		runtime_unlock(&work.lock);
  1392  	}
  1393  	b->nobj = 0;
  1394  	return b;
  1395  }
  1396  
  1397  static void
  1398  putempty(Workbuf *b)
  1399  {
  1400  	if(CollectStats)
  1401  		runtime_xadd64(&gcstats.putempty, 1);
  1402  
  1403  	runtime_lfstackpush(&work.empty, &b->node);
  1404  }
  1405  
  1406  // Get a full work buffer off the work.full list, or return nil.
  1407  static Workbuf*
  1408  getfull(Workbuf *b)
  1409  {
  1410  	M *m;
  1411  	int32 i;
  1412  
  1413  	if(CollectStats)
  1414  		runtime_xadd64(&gcstats.getfull, 1);
  1415  
  1416  	if(b != nil)
  1417  		runtime_lfstackpush(&work.empty, &b->node);
  1418  	b = (Workbuf*)runtime_lfstackpop(&work.full);
  1419  	if(b != nil || work.nproc == 1)
  1420  		return b;
  1421  
  1422  	m = runtime_m();
  1423  	runtime_xadd(&work.nwait, +1);
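  	// No work immediately available: back off gradually (spin, then yield
  	// the OS thread, then sleep), and give up once every proc is waiting,
  	// since that means there is no more work anywhere.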
  1424  	for(i=0;; i++) {
  1425  		if(work.full != 0) {
  1426  			runtime_xadd(&work.nwait, -1);
  1427  			b = (Workbuf*)runtime_lfstackpop(&work.full);
  1428  			if(b != nil)
  1429  				return b;
  1430  			runtime_xadd(&work.nwait, +1);
  1431  		}
  1432  		if(work.nwait == work.nproc)
  1433  			return nil;
  1434  		if(i < 10) {
  1435  			m->gcstats.nprocyield++;
  1436  			runtime_procyield(20);
  1437  		} else if(i < 20) {
  1438  			m->gcstats.nosyield++;
  1439  			runtime_osyield();
  1440  		} else {
  1441  			m->gcstats.nsleep++;
  1442  			runtime_usleep(100);
  1443  		}
  1444  	}
  1445  }
  1446  
  1447  static Workbuf*
  1448  handoff(Workbuf *b)
  1449  {
  1450  	M *m;
  1451  	int32 n;
  1452  	Workbuf *b1;
  1453  
  1454  	m = runtime_m();
  1455  
  1456  	// Make new buffer with half of b's pointers.
  1457  	b1 = getempty(nil);
  1458  	n = b->nobj/2;
  1459  	b->nobj -= n;
  1460  	b1->nobj = n;
  1461  	runtime_memmove(b1->obj, b->obj+b->nobj, n*sizeof b1->obj[0]);
  1462  	m->gcstats.nhandoff++;
  1463  	m->gcstats.nhandoffcnt += n;
  1464  
  1465  	// Put b on full list - let first half of b get stolen.
  1466  	runtime_lfstackpush(&work.full, &b->node);
  1467  	return b1;
  1468  }
  1469  
  1470  static void
  1471  addstackroots(G *gp, Workbuf **wbufp)
  1472  {
  1473  	switch(gp->status){
  1474  	default:
  1475  		runtime_printf("unexpected G.status %d (goroutine %p %D)\n", gp->status, gp, gp->goid);
  1476  		runtime_throw("mark - bad status");
  1477  	case Gdead:
  1478  		return;
  1479  	case Grunning:
  1480  		runtime_throw("mark - world not stopped");
  1481  	case Grunnable:
  1482  	case Gsyscall:
  1483  	case Gwaiting:
  1484  		break;
  1485  	}
  1486  
  1487  #ifdef USING_SPLIT_STACK
  1488  	M *mp;
  1489  	void* sp;
  1490  	size_t spsize;
  1491  	void* next_segment;
  1492  	void* next_sp;
  1493  	void* initial_sp;
  1494  
  1495  	if(gp == runtime_g()) {
  1496  		// Scanning our own stack.
  1497  		sp = __splitstack_find(nil, nil, &spsize, &next_segment,
  1498  				       &next_sp, &initial_sp);
  1499  	} else if((mp = gp->m) != nil && mp->helpgc) {
  1500  		// gchelper's stack is in active use and has no interesting pointers.
  1501  		return;
  1502  	} else {
  1503  		// Scanning another goroutine's stack.
  1504  		// The goroutine is usually asleep (the world is stopped).
  1505  
  1506  		// The exception is that if the goroutine is about to enter or might
  1507  		// have just exited a system call, it may be executing code such
  1508  		// as schedlock and may have needed to start a new stack segment.
  1509  		// Use the stack segment and stack pointer at the time of
  1510  		// the system call instead, since that won't change underfoot.
  1511  		if(gp->gcstack != nil) {
  1512  			sp = gp->gcstack;
  1513  			spsize = gp->gcstack_size;
  1514  			next_segment = gp->gcnext_segment;
  1515  			next_sp = gp->gcnext_sp;
  1516  			initial_sp = gp->gcinitial_sp;
  1517  		} else {
  1518  			sp = __splitstack_find_context(&gp->stack_context[0],
  1519  						       &spsize, &next_segment,
  1520  						       &next_sp, &initial_sp);
  1521  		}
  1522  	}
  1523  	if(sp != nil) {
  1524  		enqueue1(wbufp, (Obj){sp, spsize, 0});
  1525  		while((sp = __splitstack_find(next_segment, next_sp,
  1526  					      &spsize, &next_segment,
  1527  					      &next_sp, &initial_sp)) != nil)
  1528  			enqueue1(wbufp, (Obj){sp, spsize, 0});
  1529  	}
  1530  #else
  1531  	M *mp;
  1532  	byte* bottom;
  1533  	byte* top;
  1534  
  1535  	if(gp == runtime_g()) {
  1536  		// Scanning our own stack.
  1537  		bottom = (byte*)&gp;
  1538  	} else if((mp = gp->m) != nil && mp->helpgc) {
  1539  		// gchelper's stack is in active use and has no interesting pointers.
  1540  		return;
  1541  	} else {
  1542  		// Scanning another goroutine's stack.
  1543  		// The goroutine is usually asleep (the world is stopped).
  1544  		bottom = (byte*)gp->gcnext_sp;
  1545  		if(bottom == nil)
  1546  			return;
  1547  	}
  1548  	top = (byte*)gp->gcinitial_sp + gp->gcstack_size;
  1549  	if(top > bottom)
  1550  		enqueue1(wbufp, (Obj){bottom, top - bottom, 0});
  1551  	else
  1552  		enqueue1(wbufp, (Obj){top, bottom - top, 0});
  1553  #endif
  1554  }
  1555  
  1556  void
  1557  runtime_queuefinalizer(void *p, FuncVal *fn, const FuncType *ft, const PtrType *ot)
  1558  {
  1559  	FinBlock *block;
  1560  	Finalizer *f;
  1561  
  1562  	runtime_lock(&finlock);
  1563  	if(finq == nil || finq->cnt == finq->cap) {
  1564  		if(finc == nil) {
  1565  			finc = runtime_persistentalloc(FinBlockSize, 0, &mstats.gc_sys);
  1566  			finc->cap = (FinBlockSize - sizeof(FinBlock)) / sizeof(Finalizer) + 1;
  1567  			finc->alllink = allfin;
  1568  			allfin = finc;
  1569  		}
  1570  		block = finc;
  1571  		finc = block->next;
  1572  		block->next = finq;
  1573  		finq = block;
  1574  	}
  1575  	f = &finq->fin[finq->cnt];
  1576  	finq->cnt++;
  1577  	f->fn = fn;
  1578  	f->ft = ft;
  1579  	f->ot = ot;
  1580  	f->arg = p;
  1581  	runtime_fingwake = true;
  1582  	runtime_unlock(&finlock);
  1583  }
  1584  
  1585  void
  1586  runtime_iterate_finq(void (*callback)(FuncVal*, void*, const FuncType*, const PtrType*))
  1587  {
  1588  	FinBlock *fb;
  1589  	Finalizer *f;
  1590  	int32 i;
  1591  
  1592  	for(fb = allfin; fb; fb = fb->alllink) {
  1593  		for(i = 0; i < fb->cnt; i++) {
  1594  			f = &fb->fin[i];
  1595  			callback(f->fn, f->arg, f->ft, f->ot);
  1596  		}
  1597  	}
  1598  }
  1599  
  1600  void
  1601  runtime_MSpan_EnsureSwept(MSpan *s)
  1602  {
  1603  	M *m = runtime_m();
  1604  	G *g = runtime_g();
  1605  	uint32 sg;
  1606  
  1607  	// Caller must disable preemption.
  1608  	// Otherwise when this function returns the span can become unswept again
  1609  	// (if GC is triggered on another goroutine).
  1610  	if(m->locks == 0 && m->mallocing == 0 && g != m->g0)
  1611  		runtime_throw("MSpan_EnsureSwept: m is not locked");
  1612  
  1613  	sg = runtime_mheap.sweepgen;
  1614  	if(runtime_atomicload(&s->sweepgen) == sg)
  1615  		return;
  1616  	if(runtime_cas(&s->sweepgen, sg-2, sg-1)) {
  1617  		runtime_MSpan_Sweep(s);
  1618  		return;
  1619  	}
  1620  	// unfortunate condition, and we don't have efficient means to wait
  1621  	while(runtime_atomicload(&s->sweepgen) != sg)
  1622  		runtime_osyield();
  1623  }
  1624  
  1625  // Sweep frees or collects finalizers for blocks not marked in the mark phase.
  1626  // It clears the mark bits in preparation for the next GC round.
  1627  // Returns true if the span was returned to heap.
  1628  bool
  1629  runtime_MSpan_Sweep(MSpan *s)
  1630  {
  1631  	M *m;
  1632  	int32 cl, n, npages, nfree;
  1633  	uintptr size, off, *bitp, shift, bits;
  1634  	uint32 sweepgen;
  1635  	byte *p;
  1636  	MCache *c;
  1637  	byte *arena_start;
  1638  	MLink head, *end;
  1639  	byte *type_data;
  1640  	byte compression;
  1641  	uintptr type_data_inc;
  1642  	MLink *x;
  1643  	Special *special, **specialp, *y;
  1644  	bool res, sweepgenset;
  1645  
  1646  	m = runtime_m();
  1647  
  1648  	// It's critical that we enter this function with preemption disabled;
  1649  	// GC must not start while we are in the middle of it.
  1650  	if(m->locks == 0 && m->mallocing == 0 && runtime_g() != m->g0)
  1651  		runtime_throw("MSpan_Sweep: m is not locked");
  1652  	sweepgen = runtime_mheap.sweepgen;
  1653  	if(s->state != MSpanInUse || s->sweepgen != sweepgen-1) {
  1654  		runtime_printf("MSpan_Sweep: state=%d sweepgen=%d mheap.sweepgen=%d\n",
  1655  			s->state, s->sweepgen, sweepgen);
  1656  		runtime_throw("MSpan_Sweep: bad span state");
  1657  	}
  1658  	arena_start = runtime_mheap.arena_start;
  1659  	cl = s->sizeclass;
  1660  	size = s->elemsize;
  1661  	if(cl == 0) {
  1662  		n = 1;
  1663  	} else {
  1664  		// Chunk full of small blocks.
  1665  		npages = runtime_class_to_allocnpages[cl];
  1666  		n = (npages << PageShift) / size;
  1667  	}
  1668  	res = false;
  1669  	nfree = 0;
  1670  	end = &head;
  1671  	c = m->mcache;
  1672  	sweepgenset = false;
  1673  
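        	// Bitmap addressing used here and throughout this file: the GC bitmap
        	// sits just below arena_start and grows downward.  A heap word at word
        	// offset off from arena_start has its bits in bitmap word
        	// arena_start[-(off/wordsPerBitmapWord)-1], at bit offset
        	// shift = off%wordsPerBitmapWord within each bit plane
        	// (bitAllocated, bitScan, bitMarked, ...).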
  1674  	// mark any free objects in this span so we don't collect them
  1675  	for(x = s->freelist; x != nil; x = x->next) {
  1676  		// This is markonly(x) but faster because we don't need
  1677  		// atomic access and we're guaranteed to be pointing at
  1678  		// the head of a valid object.
  1679  		off = (uintptr*)x - (uintptr*)runtime_mheap.arena_start;
  1680  		bitp = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1;
  1681  		shift = off % wordsPerBitmapWord;
  1682  		*bitp |= bitMarked<<shift;
  1683  	}
  1684  
  1685  	// Unlink & free special records for any objects we're about to free.
  1686  	specialp = &s->specials;
  1687  	special = *specialp;
  1688  	while(special != nil) {
  1689  		// A finalizer can be set for an inner byte of an object, find object beginning.
  1690  		p = (byte*)(s->start << PageShift) + special->offset/size*size;
  1691  		off = (uintptr*)p - (uintptr*)arena_start;
  1692  		bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
  1693  		shift = off % wordsPerBitmapWord;
  1694  		bits = *bitp>>shift;
  1695  		if((bits & (bitAllocated|bitMarked)) == bitAllocated) {
  1696  			// Find the exact byte for which the special was set up
  1697  			// (as opposed to object beginning).
  1698  			p = (byte*)(s->start << PageShift) + special->offset;
  1699  			// about to free object: splice out special record
  1700  			y = special;
  1701  			special = special->next;
  1702  			*specialp = special;
  1703  			if(!runtime_freespecial(y, p, size, false)) {
  1704  				// stop freeing of object if it has a finalizer
  1705  				*bitp |= bitMarked << shift;
  1706  			}
  1707  		} else {
  1708  			// object is still live: keep special record
  1709  			specialp = &special->next;
  1710  			special = *specialp;
  1711  		}
  1712  	}
  1713  
  1714  	type_data = (byte*)s->types.data;
  1715  	type_data_inc = sizeof(uintptr);
  1716  	compression = s->types.compression;
  1717  	switch(compression) {
  1718  	case MTypes_Bytes:
  1719  		type_data += 8*sizeof(uintptr);
  1720  		type_data_inc = 1;
  1721  		break;
  1722  	}
  1723  
  1724  	// Sweep through n objects of given size starting at p.
  1725  	// This thread owns the span now, so it can manipulate
  1726  	// the block bitmap without atomic operations.
  1727  	p = (byte*)(s->start << PageShift);
  1728  	for(; n > 0; n--, p += size, type_data+=type_data_inc) {
  1729  		off = (uintptr*)p - (uintptr*)arena_start;
  1730  		bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
  1731  		shift = off % wordsPerBitmapWord;
  1732  		bits = *bitp>>shift;
  1733  
  1734  		if((bits & bitAllocated) == 0)
  1735  			continue;
  1736  
  1737  		if((bits & bitMarked) != 0) {
  1738  			*bitp &= ~(bitMarked<<shift);
  1739  			continue;
  1740  		}
  1741  
  1742  		if(runtime_debug.allocfreetrace)
  1743  			runtime_tracefree(p, size);
  1744  
  1745  		// Clear mark and scan bits.
  1746  		*bitp &= ~((bitScan|bitMarked)<<shift);
  1747  
  1748  		if(cl == 0) {
  1749  			// Free large span.
  1750  			runtime_unmarkspan(p, 1<<PageShift);
  1751  			s->needzero = 1;
  1752  			// important to set sweepgen before returning the span to the heap
  1753  			runtime_atomicstore(&s->sweepgen, sweepgen);
  1754  			sweepgenset = true;
  1755  			// See note about SysFault vs SysFree in malloc.goc.
  1756  			if(runtime_debug.efence)
  1757  				runtime_SysFault(p, size);
  1758  			else
  1759  				runtime_MHeap_Free(&runtime_mheap, s, 1);
  1760  			c->local_nlargefree++;
  1761  			c->local_largefree += size;
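        			// next_gc was set in gc() to roughly heap*(gcpercent+100)/100
        			// on the assumption that this memory was live, so freeing
        			// size bytes lowers the target by size*(gcpercent+100)/100
        			// (the small object path below makes the same adjustment).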
  1762  			runtime_xadd64(&mstats.next_gc, -(uint64)(size * (gcpercent + 100)/100));
  1763  			res = true;
  1764  		} else {
  1765  			// Free small object.
  1766  			switch(compression) {
  1767  			case MTypes_Words:
  1768  				*(uintptr*)type_data = 0;
  1769  				break;
  1770  			case MTypes_Bytes:
  1771  				*(byte*)type_data = 0;
  1772  				break;
  1773  			}
  1774  			if(size > 2*sizeof(uintptr))
  1775  				((uintptr*)p)[1] = (uintptr)0xdeaddeaddeaddeadll;	// mark as "needs to be zeroed"
  1776  			else if(size > sizeof(uintptr))
  1777  				((uintptr*)p)[1] = 0;
  1778  
  1779  			end->next = (MLink*)p;
  1780  			end = (MLink*)p;
  1781  			nfree++;
  1782  		}
  1783  	}
  1784  
  1785  	// We need to set s->sweepgen = h->sweepgen only when all blocks are swept,
  1786  	// because of the potential for a concurrent free/SetFinalizer.
  1787  	// But we need to set it before we make the span available for allocation
  1788  	// (return it to heap or mcentral), because allocation code assumes that a
  1789  	// span is already swept if available for allocation.
  1790  
  1791  	if(!sweepgenset && nfree == 0) {
  1792  		// The span must be in our exclusive ownership until we update sweepgen;
  1793  		// check for potential races.
  1794  		if(s->state != MSpanInUse || s->sweepgen != sweepgen-1) {
  1795  			runtime_printf("MSpan_Sweep: state=%d sweepgen=%d mheap.sweepgen=%d\n",
  1796  				s->state, s->sweepgen, sweepgen);
  1797  			runtime_throw("MSpan_Sweep: bad span state after sweep");
  1798  		}
  1799  		runtime_atomicstore(&s->sweepgen, sweepgen);
  1800  	}
  1801  	if(nfree > 0) {
  1802  		c->local_nsmallfree[cl] += nfree;
  1803  		c->local_cachealloc -= nfree * size;
  1804  		runtime_xadd64(&mstats.next_gc, -(uint64)(nfree * size * (gcpercent + 100)/100));
  1805  		res = runtime_MCentral_FreeSpan(&runtime_mheap.central[cl].mcentral, s, nfree, head.next, end);
  1806  		// MCentral_FreeSpan updates sweepgen.
  1807  	}
  1808  	return res;
  1809  }
  1810  
  1811  // State of background sweep.
  1812  // Protected by gclock.
  1813  static struct
  1814  {
  1815  	G*	g;
  1816  	bool	parked;
  1817  
  1818  	MSpan**	spans;
  1819  	uint32	nspan;
  1820  	uint32	spanidx;
  1821  } sweep;
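        // sweep.spans and sweep.nspan snapshot runtime_mheap.allspans while the
        // world is stopped in gc(); sweep.spanidx is the next index to sweep and
        // is claimed with an atomic add in runtime_sweepone.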
  1822  
  1823  // background sweeping goroutine
  1824  static void
  1825  bgsweep(void* dummy __attribute__ ((unused)))
  1826  {
  1827  	runtime_g()->issystem = 1;
  1828  	for(;;) {
  1829  		while(runtime_sweepone() != (uintptr)-1) {
  1830  			gcstats.nbgsweep++;
  1831  			runtime_gosched();
  1832  		}
  1833  		runtime_lock(&gclock);
  1834  		if(!runtime_mheap.sweepdone) {
  1835  			// This can happen if a GC ran between sweepone
  1836  			// returning -1 and our acquiring gclock.
  1837  			runtime_unlock(&gclock);
  1838  			continue;
  1839  		}
  1840  		sweep.parked = true;
  1841  		runtime_g()->isbackground = true;
  1842  		runtime_parkunlock(&gclock, "GC sweep wait");
  1843  		runtime_g()->isbackground = false;
  1844  	}
  1845  }
  1846  
  1847  // sweeps one span
  1848  // returns number of pages returned to heap, or -1 if there is nothing to sweep
  1849  uintptr
  1850  runtime_sweepone(void)
  1851  {
  1852  	M *m = runtime_m();
  1853  	MSpan *s;
  1854  	uint32 idx, sg;
  1855  	uintptr npages;
  1856  
  1857  	// increment locks to ensure that the goroutine is not preempted
  1858  	// in the middle of sweep, which would leave the span in an inconsistent state for the next GC
  1859  	m->locks++;
  1860  	sg = runtime_mheap.sweepgen;
  1861  	for(;;) {
  1862  		idx = runtime_xadd(&sweep.spanidx, 1) - 1;
  1863  		if(idx >= sweep.nspan) {
  1864  			runtime_mheap.sweepdone = true;
  1865  			m->locks--;
  1866  			return (uintptr)-1;
  1867  		}
  1868  		s = sweep.spans[idx];
  1869  		if(s->state != MSpanInUse) {
  1870  			s->sweepgen = sg;
  1871  			continue;
  1872  		}
  1873  		if(s->sweepgen != sg-2 || !runtime_cas(&s->sweepgen, sg-2, sg-1))
  1874  			continue;
  1875  		if(s->incache)
  1876  			runtime_throw("sweep of incache span");
  1877  		npages = s->npages;
  1878  		if(!runtime_MSpan_Sweep(s))
  1879  			npages = 0;
  1880  		m->locks--;
  1881  		return npages;
  1882  	}
  1883  }
  1884  
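        // dumpspan prints the contents of span allspans[idx], one word per cell
        // and eight cells per line; allocated objects are wrapped in "(...)" and
        // free blocks in "[...]".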
  1885  static void
  1886  dumpspan(uint32 idx)
  1887  {
  1888  	int32 sizeclass, n, npages, i, column;
  1889  	uintptr size;
  1890  	byte *p;
  1891  	byte *arena_start;
  1892  	MSpan *s;
  1893  	bool allocated;
  1894  
  1895  	s = runtime_mheap.allspans[idx];
  1896  	if(s->state != MSpanInUse)
  1897  		return;
  1898  	arena_start = runtime_mheap.arena_start;
  1899  	p = (byte*)(s->start << PageShift);
  1900  	sizeclass = s->sizeclass;
  1901  	size = s->elemsize;
  1902  	if(sizeclass == 0) {
  1903  		n = 1;
  1904  	} else {
  1905  		npages = runtime_class_to_allocnpages[sizeclass];
  1906  		n = (npages << PageShift) / size;
  1907  	}
  1908  	
  1909  	runtime_printf("%p .. %p:\n", p, p+n*size);
  1910  	column = 0;
  1911  	for(; n>0; n--, p+=size) {
  1912  		uintptr off, *bitp, shift, bits;
  1913  
  1914  		off = (uintptr*)p - (uintptr*)arena_start;
  1915  		bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
  1916  		shift = off % wordsPerBitmapWord;
  1917  		bits = *bitp>>shift;
  1918  
  1919  		allocated = ((bits & bitAllocated) != 0);
  1920  
  1921  		for(i=0; (uint32)i<size; i+=sizeof(void*)) {
  1922  			if(column == 0) {
  1923  				runtime_printf("\t");
  1924  			}
  1925  			if(i == 0) {
  1926  				runtime_printf(allocated ? "(" : "[");
  1927  				runtime_printf("%p: ", p+i);
  1928  			} else {
  1929  				runtime_printf(" ");
  1930  			}
  1931  
  1932  			runtime_printf("%p", *(void**)(p+i));
  1933  
  1934  			if(i+sizeof(void*) >= size) {
  1935  				runtime_printf(allocated ? ") " : "] ");
  1936  			}
  1937  
  1938  			column++;
  1939  			if(column == 8) {
  1940  				runtime_printf("\n");
  1941  				column = 0;
  1942  			}
  1943  		}
  1944  	}
  1945  	runtime_printf("\n");
  1946  }
  1947  
  1948  // A debugging function to dump the contents of memory
  1949  void
  1950  runtime_memorydump(void)
  1951  {
  1952  	uint32 spanidx;
  1953  
  1954  	for(spanidx=0; spanidx<runtime_mheap.nspan; spanidx++) {
  1955  		dumpspan(spanidx);
  1956  	}
  1957  }
  1958  
  1959  void
  1960  runtime_gchelper(void)
  1961  {
  1962  	uint32 nproc;
  1963  
  1964  	runtime_m()->traceback = 2;
  1965  	gchelperstart();
  1966  
  1967  	// parallel mark over gc roots
  1968  	runtime_parfordo(work.markfor);
  1969  
  1970  	// help other threads scan secondary blocks
  1971  	scanblock(nil, true);
  1972  
  1973  	bufferList[runtime_m()->helpgc].busy = 0;
  1974  	nproc = work.nproc;  // work.nproc can change right after we increment work.ndone
  1975  	if(runtime_xadd(&work.ndone, +1) == nproc-1)
  1976  		runtime_notewakeup(&work.alldone);
  1977  	runtime_m()->traceback = 0;
  1978  }
  1979  
  1980  static void
  1981  cachestats(void)
  1982  {
  1983  	MCache *c;
  1984  	P *p, **pp;
  1985  
  1986  	for(pp=runtime_allp; (p=*pp) != nil; pp++) {
  1987  		c = p->mcache;
  1988  		if(c==nil)
  1989  			continue;
  1990  		runtime_purgecachedstats(c);
  1991  	}
  1992  }
  1993  
  1994  static void
  1995  flushallmcaches(void)
  1996  {
  1997  	P *p, **pp;
  1998  	MCache *c;
  1999  
  2000  	// Flush MCaches to MCentral.
  2001  	for(pp=runtime_allp; (p=*pp) != nil; pp++) {
  2002  		c = p->mcache;
  2003  		if(c==nil)
  2004  			continue;
  2005  		runtime_MCache_ReleaseAll(c);
  2006  	}
  2007  }
  2008  
  2009  void
  2010  runtime_updatememstats(GCStats *stats)
  2011  {
  2012  	M *mp;
  2013  	MSpan *s;
  2014  	uint32 i;
  2015  	uint64 stacks_inuse, smallfree;
  2016  	uint64 *src, *dst;
  2017  
  2018  	if(stats)
  2019  		runtime_memclr((byte*)stats, sizeof(*stats));
  2020  	stacks_inuse = 0;
  2021  	for(mp=runtime_allm; mp; mp=mp->alllink) {
  2022  		//stacks_inuse += mp->stackinuse*FixedStack;
  2023  		if(stats) {
  2024  			src = (uint64*)&mp->gcstats;
  2025  			dst = (uint64*)stats;
  2026  			for(i=0; i<sizeof(*stats)/sizeof(uint64); i++)
  2027  				dst[i] += src[i];
  2028  			runtime_memclr((byte*)&mp->gcstats, sizeof(mp->gcstats));
  2029  		}
  2030  	}
  2031  	mstats.stacks_inuse = stacks_inuse;
  2032  	mstats.mcache_inuse = runtime_mheap.cachealloc.inuse;
  2033  	mstats.mspan_inuse = runtime_mheap.spanalloc.inuse;
  2034  	mstats.sys = mstats.heap_sys + mstats.stacks_sys + mstats.mspan_sys +
  2035  		mstats.mcache_sys + mstats.buckhash_sys + mstats.gc_sys + mstats.other_sys;
  2036  	
  2037  	// Calculate memory allocator stats.
  2038  	// During program execution we only count number of frees and amount of freed memory.
  2039  	// The current number of live objects in the heap and the amount of live heap memory
  2040  	// are calculated by scanning all spans.
  2041  	// The total number of mallocs is calculated as the number of frees plus the number of live objects.
  2042  	// Similarly, the total amount of allocated memory is calculated as the amount of freed memory
  2043  	// plus the amount of live heap memory.
  2044  	mstats.alloc = 0;
  2045  	mstats.total_alloc = 0;
  2046  	mstats.nmalloc = 0;
  2047  	mstats.nfree = 0;
  2048  	for(i = 0; i < nelem(mstats.by_size); i++) {
  2049  		mstats.by_size[i].nmalloc = 0;
  2050  		mstats.by_size[i].nfree = 0;
  2051  	}
  2052  
  2053  	// Flush MCaches to MCentral.
  2054  	flushallmcaches();
  2055  
  2056  	// Aggregate local stats.
  2057  	cachestats();
  2058  
  2059  	// Scan all spans and count the number of live objects.
  2060  	for(i = 0; i < runtime_mheap.nspan; i++) {
  2061  		s = runtime_mheap.allspans[i];
  2062  		if(s->state != MSpanInUse)
  2063  			continue;
  2064  		if(s->sizeclass == 0) {
  2065  			mstats.nmalloc++;
  2066  			mstats.alloc += s->elemsize;
  2067  		} else {
  2068  			mstats.nmalloc += s->ref;
  2069  			mstats.by_size[s->sizeclass].nmalloc += s->ref;
  2070  			mstats.alloc += s->ref*s->elemsize;
  2071  		}
  2072  	}
  2073  
  2074  	// Aggregate by size class.
  2075  	smallfree = 0;
  2076  	mstats.nfree = runtime_mheap.nlargefree;
  2077  	for(i = 0; i < nelem(mstats.by_size); i++) {
  2078  		mstats.nfree += runtime_mheap.nsmallfree[i];
  2079  		mstats.by_size[i].nfree = runtime_mheap.nsmallfree[i];
  2080  		mstats.by_size[i].nmalloc += runtime_mheap.nsmallfree[i];
  2081  		smallfree += runtime_mheap.nsmallfree[i] * runtime_class_to_size[i];
  2082  	}
  2083  	mstats.nmalloc += mstats.nfree;
  2084  
  2085  	// Calculate derived stats.
  2086  	mstats.total_alloc = mstats.alloc + runtime_mheap.largefree + smallfree;
  2087  	mstats.heap_alloc = mstats.alloc;
  2088  	mstats.heap_objects = mstats.nmalloc - mstats.nfree;
  2089  }
  2090  
  2091  // Structure of arguments passed to function gc().
  2092  // This allows the arguments to be passed via runtime_mcall.
  2093  struct gc_args
  2094  {
  2095  	int64 start_time; // start time of GC in ns (just before stoptheworld)
  2096  	bool  eagersweep;
  2097  };
  2098  
  2099  static void gc(struct gc_args *args);
  2100  static void mgc(G *gp);
  2101  
  2102  static int32
  2103  readgogc(void)
  2104  {
  2105  	const byte *p;
  2106  
  2107  	p = runtime_getenv("GOGC");
  2108  	if(p == nil || p[0] == '\0')
  2109  		return 100;
  2110  	if(runtime_strcmp((const char *)p, "off") == 0)
  2111  		return -1;
  2112  	return runtime_atoi(p);
  2113  }
  2114  
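        // Note on readgogc above: a missing or empty GOGC keeps the default of 100,
        // and GOGC=off maps to -1, which runtime_gc treats as "collection disabled".
        //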
  2115  // force = 1 - do GC regardless of current heap usage
  2116  // force = 2 - do GC and eager sweep
  2117  void
  2118  runtime_gc(int32 force)
  2119  {
  2120  	M *m;
  2121  	G *g;
  2122  	struct gc_args a;
  2123  	int32 i;
  2124  
  2125  	// The atomic operations are not atomic if the uint64s
  2126  	// are not aligned on uint64 boundaries. This has been
  2127  	// a problem in the past.
  2128  	if((((uintptr)&work.empty) & 7) != 0)
  2129  		runtime_throw("runtime: gc work buffer is misaligned");
  2130  	if((((uintptr)&work.full) & 7) != 0)
  2131  		runtime_throw("runtime: gc work buffer is misaligned");
  2132  
  2133  	// Make sure all registers are saved on stack so that
  2134  	// scanstack sees them.
  2135  	__builtin_unwind_init();
  2136  
  2137  	// The gc is turned off (via enablegc) until
  2138  	// the bootstrap has completed.
  2139  	// Also, malloc gets called in the guts
  2140  	// of a number of libraries that might be
  2141  	// holding locks.  To avoid priority inversion
  2142  	// problems, don't bother trying to run gc
  2143  	// while holding a lock.  The next mallocgc
  2144  	// without a lock will do the gc instead.
  2145  	m = runtime_m();
  2146  	if(!mstats.enablegc || runtime_g() == m->g0 || m->locks > 0 || runtime_panicking)
  2147  		return;
  2148  
  2149  	if(gcpercent == GcpercentUnknown) {	// first time through
  2150  		runtime_lock(&runtime_mheap.lock);
  2151  		if(gcpercent == GcpercentUnknown)
  2152  			gcpercent = readgogc();
  2153  		runtime_unlock(&runtime_mheap.lock);
  2154  	}
  2155  	if(gcpercent < 0)
  2156  		return;
  2157  
  2158  	runtime_semacquire(&runtime_worldsema, false);
  2159  	if(force==0 && mstats.heap_alloc < mstats.next_gc) {
  2160  		// typically threads which lost the race to grab
  2161  		// worldsema exit here when gc is done.
  2162  		runtime_semrelease(&runtime_worldsema);
  2163  		return;
  2164  	}
  2165  
  2166  	// Ok, we're doing it!  Stop everybody else
  2167  	a.start_time = runtime_nanotime();
  2168  	a.eagersweep = force >= 2;
  2169  	m->gcing = 1;
  2170  	runtime_stoptheworld();
  2171  	
  2172  	clearpools();
  2173  
  2174  	// Run gc on the g0 stack.  We do this so that the g stack
  2175  	// we're currently running on will no longer change.  Cuts
  2176  	// the root set down a bit (g0 stacks are not scanned, and
  2177  	// we don't need to scan gc's internal state).  Also an
  2178  	// enabler for copyable stacks.
  2179  	for(i = 0; i < (runtime_debug.gctrace > 1 ? 2 : 1); i++) {
  2180  		if(i > 0)
  2181  			a.start_time = runtime_nanotime();
  2182  		// switch to g0, call gc(&a), then switch back
  2183  		g = runtime_g();
  2184  		g->param = &a;
  2185  		g->status = Gwaiting;
  2186  		g->waitreason = "garbage collection";
  2187  		runtime_mcall(mgc);
  2188  		m = runtime_m();
  2189  	}
  2190  
  2191  	// all done
  2192  	m->gcing = 0;
  2193  	m->locks++;
  2194  	runtime_semrelease(&runtime_worldsema);
  2195  	runtime_starttheworld();
  2196  	m->locks--;
  2197  
  2198  	// now that gc is done, kick off finalizer thread if needed
  2199  	if(!ConcurrentSweep) {
  2200  		// give the queued finalizers, if any, a chance to run
  2201  		runtime_gosched();
  2202  	} else {
  2203  		// For gccgo, let other goroutines run.
  2204  		runtime_gosched();
  2205  	}
  2206  }
  2207  
  2208  static void
  2209  mgc(G *gp)
  2210  {
  2211  	gc(gp->param);
  2212  	gp->param = nil;
  2213  	gp->status = Grunning;
  2214  	runtime_gogo(gp);
  2215  }
  2216  
  2217  static void
  2218  gc(struct gc_args *args)
  2219  {
  2220  	M *m;
  2221  	int64 t0, t1, t2, t3, t4;
  2222  	uint64 heap0, heap1, obj, ninstr;
  2223  	GCStats stats;
  2224  	uint32 i;
  2225  	// Eface eface;
  2226  
  2227  	m = runtime_m();
  2228  
  2229  	if(runtime_debug.allocfreetrace)
  2230  		runtime_tracegc();
  2231  
  2232  	m->traceback = 2;
  2233  	t0 = args->start_time;
  2234  	work.tstart = args->start_time; 
  2235  
  2236  	if(CollectStats)
  2237  		runtime_memclr((byte*)&gcstats, sizeof(gcstats));
  2238  
  2239  	m->locks++;	// disable gc during mallocs in parforalloc
  2240  	if(work.markfor == nil)
  2241  		work.markfor = runtime_parforalloc(MaxGcproc);
  2242  	m->locks--;
  2243  
  2244  	t1 = 0;
  2245  	if(runtime_debug.gctrace)
  2246  		t1 = runtime_nanotime();
  2247  
  2248  	// Sweep whatever bgsweep has not yet swept.
  2249  	while(runtime_sweepone() != (uintptr)-1)
  2250  		gcstats.npausesweep++;
  2251  
  2252  	work.nwait = 0;
  2253  	work.ndone = 0;
  2254  	work.nproc = runtime_gcprocs();
  2255  	runtime_parforsetup(work.markfor, work.nproc, RootCount + runtime_allglen, nil, false, markroot);
  2256  	if(work.nproc > 1) {
  2257  		runtime_noteclear(&work.alldone);
  2258  		runtime_helpgc(work.nproc);
  2259  	}
  2260  
  2261  	t2 = 0;
  2262  	if(runtime_debug.gctrace)
  2263  		t2 = runtime_nanotime();
  2264  
  2265  	gchelperstart();
  2266  	runtime_parfordo(work.markfor);
  2267  	scanblock(nil, true);
  2268  
  2269  	t3 = 0;
  2270  	if(runtime_debug.gctrace)
  2271  		t3 = runtime_nanotime();
  2272  
  2273  	bufferList[m->helpgc].busy = 0;
  2274  	if(work.nproc > 1)
  2275  		runtime_notesleep(&work.alldone);
  2276  
  2277  	cachestats();
  2278  	// The next_gc calculation is tricky with concurrent sweep since we don't know the size of the live heap.
  2279  	// Estimate what the live heap size was after the previous GC (for tracing only).
  2280  	heap0 = mstats.next_gc*100/(gcpercent+100);
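        	// (approximately inverts next_gc = heap*(gcpercent+100)/100 set at the
        	// end of the previous cycle)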
  2281  	// Conservatively set next_gc to a high value, assuming that everything is live;
  2282  	// concurrent/lazy sweep will reduce this number as it discovers new garbage.
  2283  	mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*gcpercent/100;
  2284  
  2285  	t4 = runtime_nanotime();
  2286  	mstats.last_gc = runtime_unixnanotime();  // must be Unix time to make sense to user
  2287  	mstats.pause_ns[mstats.numgc%nelem(mstats.pause_ns)] = t4 - t0;
  2288  	mstats.pause_total_ns += t4 - t0;
  2289  	mstats.numgc++;
  2290  	if(mstats.debuggc)
  2291  		runtime_printf("pause %D\n", t4-t0);
  2292  
  2293  	if(runtime_debug.gctrace) {
  2294  		heap1 = mstats.heap_alloc;
  2295  		runtime_updatememstats(&stats);
  2296  		if(heap1 != mstats.heap_alloc) {
  2297  			runtime_printf("runtime: mstats skew: heap=%D/%D\n", heap1, mstats.heap_alloc);
  2298  			runtime_throw("mstats skew");
  2299  		}
  2300  		obj = mstats.nmalloc - mstats.nfree;
  2301  
  2302  		stats.nprocyield += work.markfor->nprocyield;
  2303  		stats.nosyield += work.markfor->nosyield;
  2304  		stats.nsleep += work.markfor->nsleep;
  2305  
  2306  		runtime_printf("gc%d(%d): %D+%D+%D+%D us, %D -> %D MB, %D (%D-%D) objects,"
  2307  				" %d/%d/%d sweeps,"
  2308  				" %D(%D) handoff, %D(%D) steal, %D/%D/%D yields\n",
  2309  			mstats.numgc, work.nproc, (t1-t0)/1000, (t2-t1)/1000, (t3-t2)/1000, (t4-t3)/1000,
  2310  			heap0>>20, heap1>>20, obj,
  2311  			mstats.nmalloc, mstats.nfree,
  2312  			sweep.nspan, gcstats.nbgsweep, gcstats.npausesweep,
  2313  			stats.nhandoff, stats.nhandoffcnt,
  2314  			work.markfor->nsteal, work.markfor->nstealcnt,
  2315  			stats.nprocyield, stats.nosyield, stats.nsleep);
  2316  		gcstats.nbgsweep = gcstats.npausesweep = 0;
  2317  		if(CollectStats) {
  2318  			runtime_printf("scan: %D bytes, %D objects, %D untyped, %D types from MSpan\n",
  2319  				gcstats.nbytes, gcstats.obj.cnt, gcstats.obj.notype, gcstats.obj.typelookup);
  2320  			if(gcstats.ptr.cnt != 0)
  2321  				runtime_printf("avg ptrbufsize: %D (%D/%D)\n",
  2322  					gcstats.ptr.sum/gcstats.ptr.cnt, gcstats.ptr.sum, gcstats.ptr.cnt);
  2323  			if(gcstats.obj.cnt != 0)
  2324  				runtime_printf("avg nobj: %D (%D/%D)\n",
  2325  					gcstats.obj.sum/gcstats.obj.cnt, gcstats.obj.sum, gcstats.obj.cnt);
  2326  			runtime_printf("rescans: %D, %D bytes\n", gcstats.rescan, gcstats.rescanbytes);
  2327  
  2328  			runtime_printf("instruction counts:\n");
  2329  			ninstr = 0;
  2330  			for(i=0; i<nelem(gcstats.instr); i++) {
  2331  				runtime_printf("\t%d:\t%D\n", i, gcstats.instr[i]);
  2332  				ninstr += gcstats.instr[i];
  2333  			}
  2334  			runtime_printf("\ttotal:\t%D\n", ninstr);
  2335  
  2336  			runtime_printf("putempty: %D, getfull: %D\n", gcstats.putempty, gcstats.getfull);
  2337  
  2338  			runtime_printf("markonly base lookup: bit %D word %D span %D\n", gcstats.markonly.foundbit, gcstats.markonly.foundword, gcstats.markonly.foundspan);
  2339  			runtime_printf("flushptrbuf base lookup: bit %D word %D span %D\n", gcstats.flushptrbuf.foundbit, gcstats.flushptrbuf.foundword, gcstats.flushptrbuf.foundspan);
  2340  		}
  2341  	}
  2342  
  2343  	// We cache the current runtime_mheap.allspans array in sweep.spans
  2344  	// because the former can be resized and freed.
  2345  	// Otherwise we would need to take the heap lock every time
  2346  	// we want to convert a span index to a span pointer.
  2347  
  2348  	// Free the old cached array if necessary.
  2349  	if(sweep.spans && sweep.spans != runtime_mheap.allspans)
  2350  		runtime_SysFree(sweep.spans, sweep.nspan*sizeof(sweep.spans[0]), &mstats.other_sys);
  2351  	// Cache the current array.
  2352  	runtime_mheap.sweepspans = runtime_mheap.allspans;
  2353  	runtime_mheap.sweepgen += 2;
  2354  	runtime_mheap.sweepdone = false;
  2355  	sweep.spans = runtime_mheap.allspans;
  2356  	sweep.nspan = runtime_mheap.nspan;
  2357  	sweep.spanidx = 0;
  2358  
  2359  	// Temporarily disable concurrent sweep, because we see failures on builders.
  2360  	if(ConcurrentSweep && !args->eagersweep) {
  2361  		runtime_lock(&gclock);
  2362  		if(sweep.g == nil)
  2363  			sweep.g = __go_go(bgsweep, nil);
  2364  		else if(sweep.parked) {
  2365  			sweep.parked = false;
  2366  			runtime_ready(sweep.g);
  2367  		}
  2368  		runtime_unlock(&gclock);
  2369  	} else {
  2370  		// Sweep all spans eagerly.
  2371  		while(runtime_sweepone() != (uintptr)-1)
  2372  			gcstats.npausesweep++;
  2373  		// Do an additional mProf_GC, because all 'free' events are now real as well.
  2374  		runtime_MProf_GC();
  2375  	}
  2376  
  2377  	runtime_MProf_GC();
  2378  	m->traceback = 0;
  2379  }
  2380  
  2381  extern uintptr runtime_sizeof_C_MStats
  2382    __asm__ (GOSYM_PREFIX "runtime.Sizeof_C_MStats");
  2383  
  2384  void runtime_ReadMemStats(MStats *)
  2385    __asm__ (GOSYM_PREFIX "runtime.ReadMemStats");
  2386  
  2387  void
  2388  runtime_ReadMemStats(MStats *stats)
  2389  {
  2390  	M *m;
  2391  
  2392  	// Have to acquire worldsema to stop the world,
  2393  	// because stoptheworld can only be used by
  2394  	// one goroutine at a time, and there might be
  2395  	// a pending garbage collection already calling it.
  2396  	runtime_semacquire(&runtime_worldsema, false);
  2397  	m = runtime_m();
  2398  	m->gcing = 1;
  2399  	runtime_stoptheworld();
  2400  	runtime_updatememstats(nil);
  2401  	// The size of the trailing by_size array differs between Go and C:
  2402  	// NumSizeClasses was changed, but we cannot change the Go struct because of backward compatibility.
  2403  	runtime_memmove(stats, &mstats, runtime_sizeof_C_MStats);
  2404  	m->gcing = 0;
  2405  	m->locks++;
  2406  	runtime_semrelease(&runtime_worldsema);
  2407  	runtime_starttheworld();
  2408  	m->locks--;
  2409  }
  2410  
  2411  void runtime_debug_readGCStats(Slice*)
  2412    __asm__("runtime_debug.readGCStats");
  2413  
  2414  void
  2415  runtime_debug_readGCStats(Slice *pauses)
  2416  {
  2417  	uint64 *p;
  2418  	uint32 i, n;
  2419  
  2420  	// Calling code in runtime/debug should make the slice large enough.
  2421  	if((size_t)pauses->cap < nelem(mstats.pause_ns)+3)
  2422  		runtime_throw("runtime: short slice passed to readGCStats");
  2423  
  2424  	// Pass back: pauses, last gc (absolute time), number of gc, total pause ns.
  2425  	p = (uint64*)pauses->array;
  2426  	runtime_lock(&runtime_mheap.lock);
  2427  	n = mstats.numgc;
  2428  	if(n > nelem(mstats.pause_ns))
  2429  		n = nelem(mstats.pause_ns);
  2430  	
  2431  	// The pause buffer is circular. The most recent pause is at
  2432  	// pause_ns[(numgc-1)%nelem(pause_ns)]; walking backward from
  2433  	// there goes farther back in time. We deliver the times
  2434  	// most recent first (in p[0]).
  2435  	for(i=0; i<n; i++)
  2436  		p[i] = mstats.pause_ns[(mstats.numgc-1-i)%nelem(mstats.pause_ns)];
  2437  
  2438  	p[n] = mstats.last_gc;
  2439  	p[n+1] = mstats.numgc;
  2440  	p[n+2] = mstats.pause_total_ns;	
  2441  	runtime_unlock(&runtime_mheap.lock);
  2442  	pauses->__count = n+3;
  2443  }
  2444  
  2445  int32
  2446  runtime_setgcpercent(int32 in) {
  2447  	int32 out;
  2448  
  2449  	runtime_lock(&runtime_mheap.lock);
  2450  	if(gcpercent == GcpercentUnknown)
  2451  		gcpercent = readgogc();
  2452  	out = gcpercent;
  2453  	if(in < 0)
  2454  		in = -1;
  2455  	gcpercent = in;
  2456  	runtime_unlock(&runtime_mheap.lock);
  2457  	return out;
  2458  }
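        // The new percentage set above is only consulted the next time next_gc is
        // recomputed (at the end of a collection and as spans are swept);
        // runtime_setgcpercent itself does not adjust the current target.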
  2459  
  2460  static void
  2461  gchelperstart(void)
  2462  {
  2463  	M *m;
  2464  
  2465  	m = runtime_m();
  2466  	if(m->helpgc < 0 || m->helpgc >= MaxGcproc)
  2467  		runtime_throw("gchelperstart: bad m->helpgc");
  2468  	if(runtime_xchg(&bufferList[m->helpgc].busy, 1))
  2469  		runtime_throw("gchelperstart: already busy");
  2470  	if(runtime_g() != m->g0)
  2471  		runtime_throw("gchelper not running on g0 stack");
  2472  }
  2473  
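        // runfinq runs queued finalizers on the dedicated finalizer goroutine
        // (fing, started by runtime_createfing below).  When finq is empty it
        // parks on finlock; runtime_wakefing returns the goroutine for readying
        // once queuefinalizer has set runtime_fingwake.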
  2474  static void
  2475  runfinq(void* dummy __attribute__ ((unused)))
  2476  {
  2477  	Finalizer *f;
  2478  	FinBlock *fb, *next;
  2479  	uint32 i;
  2480  	Eface ef;
  2481  	Iface iface;
  2482  
  2483  	// This function blocks for long periods of time, and because it is written in C
  2484  	// we have no liveness information. Zero everything so that uninitialized pointers
  2485  	// do not cause memory leaks.
  2486  	f = nil;
  2487  	fb = nil;
  2488  	next = nil;
  2489  	i = 0;
  2490  	ef.__type_descriptor = nil;
  2491  	ef.__object = nil;
  2492  	
  2493  	// force flush to memory
  2494  	USED(&f);
  2495  	USED(&fb);
  2496  	USED(&next);
  2497  	USED(&i);
  2498  	USED(&ef);
  2499  
  2500  	for(;;) {
  2501  		runtime_lock(&finlock);
  2502  		fb = finq;
  2503  		finq = nil;
  2504  		if(fb == nil) {
  2505  			runtime_fingwait = true;
  2506  			runtime_g()->isbackground = true;
  2507  			runtime_parkunlock(&finlock, "finalizer wait");
  2508  			runtime_g()->isbackground = false;
  2509  			continue;
  2510  		}
  2511  		runtime_unlock(&finlock);
  2512  		for(; fb; fb=next) {
  2513  			next = fb->next;
  2514  			for(i=0; i<(uint32)fb->cnt; i++) {
  2515  				const Type *fint;
  2516  				void *param;
  2517  
  2518  				f = &fb->fin[i];
  2519  				fint = ((const Type**)f->ft->__in.array)[0];
  2520  				if((fint->__code & kindMask) == KindPtr) {
  2521  					// direct use of pointer
  2522  					param = &f->arg;
  2523  				} else if(((const InterfaceType*)fint)->__methods.__count == 0) {
  2524  					// convert to empty interface
  2525  					ef.__type_descriptor = (const Type*)f->ot;
  2526  					ef.__object = f->arg;
  2527  					param = &ef;
  2528  				} else {
  2529  					// convert to interface with methods
  2530  					iface.__methods = __go_convert_interface_2((const Type*)fint,
  2531  										   (const Type*)f->ot,
  2532  										   1);
  2533  					iface.__object = f->arg;
  2534  					if(iface.__methods == nil)
  2535  						runtime_throw("invalid type conversion in runfinq");
  2536  					param = &iface;
  2537  				}
  2538  				reflect_call(f->ft, f->fn, 0, 0, &param, nil);
  2539  				f->fn = nil;
  2540  				f->arg = nil;
  2541  				f->ot = nil;
  2542  			}
  2543  			fb->cnt = 0;
  2544  			runtime_lock(&finlock);
  2545  			fb->next = finc;
  2546  			finc = fb;
  2547  			runtime_unlock(&finlock);
  2548  		}
  2549  
  2550  		// Zero everything that's dead, to avoid memory leaks.
  2551  		// See comment at top of function.
  2552  		f = nil;
  2553  		fb = nil;
  2554  		next = nil;
  2555  		i = 0;
  2556  		ef.__type_descriptor = nil;
  2557  		ef.__object = nil;
  2558  		runtime_gc(1);	// trigger another gc to clean up the finalized objects, if possible
  2559  	}
  2560  }
  2561  
  2562  void
  2563  runtime_createfing(void)
  2564  {
  2565  	if(fing != nil)
  2566  		return;
  2567  	// Here we use gclock instead of finlock,
  2568  	// because newproc1 can allocate, which can cause on-demand span sweep,
  2569  	// which can queue finalizers, which would deadlock.
  2570  	runtime_lock(&gclock);
  2571  	if(fing == nil)
  2572  		fing = __go_go(runfinq, nil);
  2573  	runtime_unlock(&gclock);
  2574  }
  2575  
  2576  G*
  2577  runtime_wakefing(void)
  2578  {
  2579  	G *res;
  2580  
  2581  	res = nil;
  2582  	runtime_lock(&finlock);
  2583  	if(runtime_fingwait && runtime_fingwake) {
  2584  		runtime_fingwait = false;
  2585  		runtime_fingwake = false;
  2586  		res = fing;
  2587  	}
  2588  	runtime_unlock(&finlock);
  2589  	return res;
  2590  }
  2591  
  2592  void
  2593  runtime_marknogc(void *v)
  2594  {
  2595  	uintptr *b, off, shift;
  2596  
  2597  	off = (uintptr*)v - (uintptr*)runtime_mheap.arena_start;  // word offset
  2598  	b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1;
  2599  	shift = off % wordsPerBitmapWord;
  2600  	*b = (*b & ~(bitAllocated<<shift)) | bitBlockBoundary<<shift;
  2601  }
  2602  
  2603  void
  2604  runtime_markscan(void *v)
  2605  {
  2606  	uintptr *b, off, shift;
  2607  
  2608  	off = (uintptr*)v - (uintptr*)runtime_mheap.arena_start;  // word offset
  2609  	b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1;
  2610  	shift = off % wordsPerBitmapWord;
  2611  	*b |= bitScan<<shift;
  2612  }
  2613  
  2614  // mark the block at v as freed.
  2615  void
  2616  runtime_markfreed(void *v)
  2617  {
  2618  	uintptr *b, off, shift;
  2619  
  2620  	if(0)
  2621  		runtime_printf("markfreed %p\n", v);
  2622  
  2623  	if((byte*)v > (byte*)runtime_mheap.arena_used || (byte*)v < runtime_mheap.arena_start)
  2624  		runtime_throw("markfreed: bad pointer");
  2625  
  2626  	off = (uintptr*)v - (uintptr*)runtime_mheap.arena_start;  // word offset
  2627  	b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1;
  2628  	shift = off % wordsPerBitmapWord;
  2629  	*b = (*b & ~(bitMask<<shift)) | (bitAllocated<<shift);
  2630  }
  2631  
  2632  // check that the block at v of size n is marked freed.
  2633  void
  2634  runtime_checkfreed(void *v, uintptr n)
  2635  {
  2636  	uintptr *b, bits, off, shift;
  2637  
  2638  	if(!runtime_checking)
  2639  		return;
  2640  
  2641  	if((byte*)v+n > (byte*)runtime_mheap.arena_used || (byte*)v < runtime_mheap.arena_start)
  2642  		return;	// not allocated, so okay
  2643  
  2644  	off = (uintptr*)v - (uintptr*)runtime_mheap.arena_start;  // word offset
  2645  	b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1;
  2646  	shift = off % wordsPerBitmapWord;
  2647  
  2648  	bits = *b>>shift;
  2649  	if((bits & bitAllocated) != 0) {
  2650  		runtime_printf("checkfreed %p+%p: off=%p have=%p\n",
  2651  			v, n, off, bits & bitMask);
  2652  		runtime_throw("checkfreed: not freed");
  2653  	}
  2654  }
  2655  
  2656  // mark the span of memory at v as having n blocks of the given size.
  2657  // if leftover is true, there is left over space at the end of the span.
  2658  void
  2659  runtime_markspan(void *v, uintptr size, uintptr n, bool leftover)
  2660  {
  2661  	uintptr *b, *b0, off, shift, i, x;
  2662  	byte *p;
  2663  
  2664  	if((byte*)v+size*n > (byte*)runtime_mheap.arena_used || (byte*)v < runtime_mheap.arena_start)
  2665  		runtime_throw("markspan: bad pointer");
  2666  
  2667  	if(runtime_checking) {
  2668  		// bits should be all zero at the start
  2669  		off = (byte*)v + size - runtime_mheap.arena_start;
  2670  		b = (uintptr*)(runtime_mheap.arena_start - off/wordsPerBitmapWord);
  2671  		for(i = 0; i < size/PtrSize/wordsPerBitmapWord; i++) {
  2672  			if(b[i] != 0)
  2673  				runtime_throw("markspan: span bits not zero");
  2674  		}
  2675  	}
  2676  
  2677  	p = v;
  2678  	if(leftover)	// mark a boundary just past end of last block too
  2679  		n++;
  2680  
  2681  	b0 = nil;
  2682  	x = 0;
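        	// Accumulate the bits for consecutive blocks in x and flush them one
        	// bitmap word at a time; b0 tracks the bitmap word currently being built.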
  2683  	for(; n-- > 0; p += size) {
  2684  		// Okay to use non-atomic ops here, because we control
  2685  		// the entire span, and each bitmap word has bits for only
  2686  		// one span, so no other goroutines are changing these
  2687  		// bitmap words.
  2688  		off = (uintptr*)p - (uintptr*)runtime_mheap.arena_start;  // word offset
  2689  		b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1;
  2690  		shift = off % wordsPerBitmapWord;
  2691  		if(b0 != b) {
  2692  			if(b0 != nil)
  2693  				*b0 = x;
  2694  			b0 = b;
  2695  			x = 0;
  2696  		}
  2697  		x |= bitAllocated<<shift;
  2698  	}
  2699  	*b0 = x;
  2700  }
  2701  
  2702  // unmark the span of memory at v of length n bytes.
  2703  void
  2704  runtime_unmarkspan(void *v, uintptr n)
  2705  {
  2706  	uintptr *p, *b, off;
  2707  
  2708  	if((byte*)v+n > (byte*)runtime_mheap.arena_used || (byte*)v < runtime_mheap.arena_start)
  2709  		runtime_throw("unmarkspan: bad pointer");
  2710  
  2711  	p = v;
  2712  	off = p - (uintptr*)runtime_mheap.arena_start;  // word offset
  2713  	if(off % wordsPerBitmapWord != 0)
  2714  		runtime_throw("unmarkspan: unaligned pointer");
  2715  	b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1;
  2716  	n /= PtrSize;
  2717  	if(n%wordsPerBitmapWord != 0)
  2718  		runtime_throw("unmarkspan: unaligned length");
  2719  	// Okay to use non-atomic ops here, because we control
  2720  	// the entire span, and each bitmap word has bits for only
  2721  	// one span, so no other goroutines are changing these
  2722  	// bitmap words.
  2723  	n /= wordsPerBitmapWord;
  2724  	while(n-- > 0)
  2725  		*b-- = 0;
  2726  }
  2727  
  2728  void
  2729  runtime_MHeap_MapBits(MHeap *h)
  2730  {
  2731  	size_t page_size;
  2732  
  2733  	// Caller has added extra mappings to the arena.
  2734  	// Add extra mappings of bitmap words as needed.
  2735  	// We allocate extra bitmap pieces in chunks of bitmapChunk.
  2736  	enum {
  2737  		bitmapChunk = 8192
  2738  	};
  2739  	uintptr n;
  2740  
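        	// One bitmap word describes wordsPerBitmapWord heap words, so the bitmap
        	// needs (arena bytes)/wordsPerBitmapWord bytes; round that up to chunk and
        	// page granularity before comparing against what is already mapped.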
  2741  	n = (h->arena_used - h->arena_start) / wordsPerBitmapWord;
  2742  	n = ROUND(n, bitmapChunk);
  2743  	n = ROUND(n, PageSize);
  2744  	page_size = getpagesize();
  2745  	n = ROUND(n, page_size);
  2746  	if(h->bitmap_mapped >= n)
  2747  		return;
  2748  
  2749  	runtime_SysMap(h->arena_start - n, n - h->bitmap_mapped, h->arena_reserved, &mstats.gc_sys);
  2750  	h->bitmap_mapped = n;
  2751  }