github.com/4ad/go@v0.0.0-20161219182952-69a12818b605/src/runtime/mbitmap.go

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Garbage collector: type and heap bitmaps.
//
// Stack, data, and bss bitmaps
//
// Stack frames and global variables in the data and bss sections are described
// by 1-bit bitmaps in which 0 means uninteresting and 1 means live pointer
// to be visited during GC. The bits in each byte are consumed starting with
// the low bit: 1<<0, 1<<1, and so on.
//
// Heap bitmap
//
// The allocated heap comes from a subset of the memory in the range [start, used),
// where start == mheap_.arena_start and used == mheap_.arena_used.
// The heap bitmap comprises 2 bits for each pointer-sized word in that range,
// stored in bytes indexed backward in memory from start.
// That is, the byte at address start-1 holds the 2-bit entries for the four words
// start through start+3*ptrSize, the byte at start-2 holds the entries for
// start+4*ptrSize through start+7*ptrSize, and so on.
//
// In each 2-bit entry, the lower bit holds the same information as in the 1-bit
// bitmaps: 0 means uninteresting and 1 means live pointer to be visited during GC.
// The meaning of the high bit depends on the position of the word being described
// in its allocated object. In all words *except* the second word, the
// high bit indicates that the object is still being described. In
// these words, if a bit pair with a high bit 0 is encountered, the
// low bit can also be assumed to be 0, and the object description is
// over. This 00 is called the ``dead'' encoding: it signals that the
// rest of the words in the object are uninteresting to the garbage
// collector.
//
// In the second word, the high bit is the GC ``checkmarked'' bit (see below).
//
// The 2-bit entries are split when written into the byte, so that the top half
// of the byte contains 4 high bits and the bottom half contains 4 low (pointer)
// bits.
// This form allows a copy from the 1-bit to the 4-bit form to keep the
// pointer bits contiguous, instead of having to space them out.
//
// The code makes use of the fact that the zero value for a heap bitmap
// has no live pointer bit set and is (depending on position) not used,
// not checkmarked, and the dead encoding.
// These properties must be preserved when modifying the encoding.
//
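// For example (illustrative): a 4-word object whose words are
// {pointer, scalar, pointer, scalar} is described, writing each entry as
// (high bit, low bit), by the pairs 11, x0, 11, 00: pointer+scan for the
// first word, scalar for the second (its high bit x is the checkmark,
// not a scan bit), pointer+scan for the third, and the ``dead'' encoding
// for the fourth, ending the description.
//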
// Checkmarks
//
// In a concurrent garbage collector, one worries about failing to mark
// a live object due to mutations without write barriers or bugs in the
// collector implementation. As a sanity check, the GC has a 'checkmark'
// mode that retraverses the object graph with the world stopped, to make
// sure that everything that should be marked is marked.
// In checkmark mode, in the heap bitmap, the high bit of the 2-bit entry
// for the second word of the object holds the checkmark bit.
// When not in checkmark mode, this bit is set to 1.
//
// The smallest possible allocation is 8 bytes. On a 32-bit machine, that
// means every allocated object has two words, so there is room for the
// checkmark bit. On a 64-bit machine, however, the 8-byte allocation is
// just one word, so the second bit pair is not available for encoding the
// checkmark. However, because non-pointer allocations are combined
// into larger 16-byte (maxTinySize) allocations, a plain 8-byte allocation
// must be a pointer, so the type bit in the first word is not actually needed.
// It is still used in general, except that in checkmark mode the type bit is
// repurposed as the checkmark bit and then reinitialized (to 1) as the type
// bit when finished.
//

package runtime

import (
	"runtime/internal/atomic"
	"runtime/internal/sys"
	"unsafe"
)

const (
	bitPointer = 1 << 0
	bitMarked  = 1 << 4 // TODO: Rename bitScan.

	heapBitsShift   = 1                     // shift offset between successive bitPointer or bitMarked entries
	heapBitmapScale = sys.PtrSize * (8 / 2) // number of data bytes described by one heap bitmap byte

	// all mark/pointer bits in a byte
	bitMarkedAll  = bitMarked | bitMarked<<heapBitsShift | bitMarked<<(2*heapBitsShift) | bitMarked<<(3*heapBitsShift)
	bitPointerAll = bitPointer | bitPointer<<heapBitsShift | bitPointer<<(2*heapBitsShift) | bitPointer<<(3*heapBitsShift)
)

// addb returns the byte pointer p+n.
//go:nowritebarrier
//go:nosplit
func addb(p *byte, n uintptr) *byte {
	// Note: wrote out full expression instead of calling add(p, n)
	// to reduce the number of temporaries generated by the
	// compiler for this trivial expression during inlining.
	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + n))
}

// subtractb returns the byte pointer p-n.
// subtractb is typically used when traversing the pointer tables referred to by hbits
// which are arranged in reverse order.
//go:nowritebarrier
//go:nosplit
func subtractb(p *byte, n uintptr) *byte {
	// Note: wrote out full expression instead of calling add(p, -n)
	// to reduce the number of temporaries generated by the
	// compiler for this trivial expression during inlining.
	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - n))
}

// add1 returns the byte pointer p+1.
//go:nowritebarrier
//go:nosplit
func add1(p *byte) *byte {
	// Note: wrote out full expression instead of calling addb(p, 1)
	// to reduce the number of temporaries generated by the
	// compiler for this trivial expression during inlining.
	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + 1))
}

// subtract1 returns the byte pointer p-1.
// subtract1 is typically used when traversing the pointer tables referred to by hbits
// which are arranged in reverse order.
//go:nowritebarrier
//
// nosplit because it is used during write barriers and must not be preempted.
//go:nosplit
func subtract1(p *byte) *byte {
	// Note: wrote out full expression instead of calling subtractb(p, 1)
	// to reduce the number of temporaries generated by the
	// compiler for this trivial expression during inlining.
	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - 1))
}

// mapBits is called each time arena_used is extended.
// It maps any additional bitmap memory needed for the new arena memory.
// It must be called with the expected new value of arena_used,
// *before* h.arena_used has been updated.
// Waiting to update arena_used until after the memory has been mapped
// avoids faults when other threads try to access the bitmap immediately
// after observing the change to arena_used.
//
//go:nowritebarrier
func (h *mheap) mapBits(arena_used uintptr) {
	// Caller has added extra mappings to the arena.
	// Add extra mappings of bitmap words as needed.
	// We allocate extra bitmap pieces in chunks of bitmapChunk.
	const bitmapChunk = 8192

	n := (arena_used - mheap_.arena_start) / heapBitmapScale
	n = round(n, bitmapChunk)
	n = round(n, physPageSize)
	if h.bitmap_mapped >= n {
		return
	}

	sysMap(unsafe.Pointer(h.bitmap-n), n-h.bitmap_mapped, h.arena_reserved, &memstats.gc_sys)
	h.bitmap_mapped = n
}

// heapBits provides access to the bitmap bits for a single heap word.
// The methods on heapBits take value receivers so that the compiler
// can more easily inline calls to those methods and registerize the
// struct fields independently.
type heapBits struct {
	bitp  *uint8
	shift uint32
}

// markBits provides access to the mark bit for an object in the heap.
// bytep points to the byte holding the mark bit.
// mask is a byte with a single bit set that can be &ed with *bytep
// to see if the bit has been set.
// *m.bytep&m.mask != 0 indicates the mark bit is set.
// index can be used along with span information to generate
// the address of the object in the heap.
// We maintain one set of mark bits for allocation and one for
// marking purposes.
type markBits struct {
	bytep *uint8
	mask  uint8
	index uintptr
}

//go:nosplit
func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits {
	whichByte := allocBitIndex / 8
	whichBit := allocBitIndex % 8
	bytePtr := addb(s.allocBits, whichByte)
	return markBits{bytePtr, uint8(1 << whichBit), allocBitIndex}
}

// refillAllocCache takes 8 bytes of s.allocBits starting at whichByte
// and negates them so that ctz (count trailing zeros) instructions
// can be used. It then places these 8 bytes into the cached 64 bit
// s.allocCache.
func (s *mspan) refillAllocCache(whichByte uintptr) {
	bytes := (*[8]uint8)(unsafe.Pointer(addb(s.allocBits, whichByte)))
	aCache := uint64(0)
	aCache |= uint64(bytes[0])
	aCache |= uint64(bytes[1]) << (1 * 8)
	aCache |= uint64(bytes[2]) << (2 * 8)
	aCache |= uint64(bytes[3]) << (3 * 8)
	aCache |= uint64(bytes[4]) << (4 * 8)
	aCache |= uint64(bytes[5]) << (5 * 8)
	aCache |= uint64(bytes[6]) << (6 * 8)
	aCache |= uint64(bytes[7]) << (7 * 8)
	s.allocCache = ^aCache
}
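
// For illustration (not in the original source): if the first byte of
// s.allocBits is 0b00000111 (objects 0, 1 and 2 allocated, the rest free),
// refillAllocCache(0) stores its negation in s.allocCache, so
// sys.Ctz64(s.allocCache) == 3, the index of the first free object.
// A cache of all zeros (Ctz64 == 64) means the cached 64 bits contain no
// free slot, and nextFreeIndex below refills from the next 8 bytes.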

// nextFreeIndex returns the index of the next free object in s at
// or after s.freeindex.
// There are hardware instructions that can be used to make this
// faster if profiling warrants it.
func (s *mspan) nextFreeIndex() uintptr {
	sfreeindex := s.freeindex
	snelems := s.nelems
	if sfreeindex == snelems {
		return sfreeindex
	}
	if sfreeindex > snelems {
		throw("s.freeindex > s.nelems")
	}

	aCache := s.allocCache

	bitIndex := sys.Ctz64(aCache)
	for bitIndex == 64 {
		// Nothing available in cached bits:
		// move index to start of next cached bits,
		// grab the next 8 bytes and try again.
		sfreeindex = (sfreeindex + 64) &^ (64 - 1)
		if sfreeindex >= snelems {
			s.freeindex = snelems
			return snelems
		}
		whichByte := sfreeindex / 8
		// Refill s.allocCache with the next 64 alloc bits.
		s.refillAllocCache(whichByte)
		aCache = s.allocCache
		bitIndex = sys.Ctz64(aCache)
	}
	result := sfreeindex + uintptr(bitIndex)
	if result >= snelems {
		s.freeindex = snelems
		return snelems
	}

	s.allocCache >>= (bitIndex + 1)
	sfreeindex = result + 1

	if sfreeindex%64 == 0 && sfreeindex != snelems {
		// We just incremented s.freeindex so it isn't 0.
		// As each 1 in s.allocCache was encountered and used for allocation
		// it was shifted away. At this point s.allocCache contains all 0s.
		// Refill s.allocCache so that it corresponds
		// to the bits at s.allocBits starting at s.freeindex.
		whichByte := sfreeindex / 8
		s.refillAllocCache(whichByte)
	}
	s.freeindex = sfreeindex
	return result
}

func (s *mspan) isFree(index uintptr) bool {
	whichByte := index / 8
	whichBit := index % 8
	byteVal := *addb(s.allocBits, whichByte)
	return byteVal&uint8(1<<whichBit) == 0
}

func (s *mspan) objIndex(p uintptr) uintptr {
	byteOffset := p - s.base()
	if byteOffset == 0 {
		return 0
	}
	if s.baseMask != 0 {
		// s.baseMask is non-0, elemsize is a power of two, so shift by s.divShift
		return byteOffset >> s.divShift
	}
	return uintptr(((uint64(byteOffset) >> s.divShift) * uint64(s.divMul)) >> s.divShift2)
}
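
// The divMul/divShift values above are precomputed per size class so that
// the multiply-and-shift equals byteOffset/s.elemsize without a hardware
// divide. A minimal hypothetical sketch of the same trick (the constants
// here are chosen for elemsize 48 = 16*3 and 32-bit quotients; the
// runtime's actual tables may differ):
//
//	func div48(n uint64) uint64 {
//		// n/48 == (n>>4)/3, and for q < 1<<32, q/3 == (q*0xAAAAAAAB)>>33.
//		return ((n >> 4) * 0xAAAAAAAB) >> 33
//	}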

func markBitsForAddr(p uintptr) markBits {
	s := spanOf(p)
	objIndex := s.objIndex(p)
	return s.markBitsForIndex(objIndex)
}

func (s *mspan) markBitsForIndex(objIndex uintptr) markBits {
	whichByte := objIndex / 8
	bitMask := uint8(1 << (objIndex % 8)) // low 3 bits hold the bit index
	bytePtr := addb(s.gcmarkBits, whichByte)
	return markBits{bytePtr, bitMask, objIndex}
}

func (s *mspan) markBitsForBase() markBits {
	return markBits{s.gcmarkBits, uint8(1), 0}
}

// isMarked reports whether mark bit m is set.
func (m markBits) isMarked() bool {
	return *m.bytep&m.mask != 0
}

// setMarked sets the marked bit in the markbits, atomically. Some compilers
// are not able to inline the atomic.Or8 function, so if it appears as a hot
// spot consider inlining it manually.
func (m markBits) setMarked() {
	// Might be racing with other updates, so use atomic update always.
	// We used to be clever here and use a non-atomic update in certain
	// cases, but it's not worth the risk.
	atomic.Or8(m.bytep, m.mask)
}

// setMarkedNonAtomic sets the marked bit in the markbits, non-atomically.
func (m markBits) setMarkedNonAtomic() {
	*m.bytep |= m.mask
}

// clearMarked clears the marked bit in the markbits, atomically.
func (m markBits) clearMarked() {
	// Might be racing with other updates, so use atomic update always.
	// We used to be clever here and use a non-atomic update in certain
	// cases, but it's not worth the risk.
	atomic.And8(m.bytep, ^m.mask)
}

// clearMarkedNonAtomic clears the marked bit non-atomically.
// The bit is assumed to be set (the xor toggles it).
func (m markBits) clearMarkedNonAtomic() {
	*m.bytep ^= m.mask
}

// markBitsForSpan returns the markBits for the span base address base.
func markBitsForSpan(base uintptr) (mbits markBits) {
	if base < mheap_.arena_start || base >= mheap_.arena_used {
		throw("markBitsForSpan: base out of range")
	}
	mbits = markBitsForAddr(base)
	if mbits.mask != 1 {
		throw("markBitsForSpan: unaligned start")
	}
	return mbits
}

// advance advances the markBits to the next object in the span.
func (m *markBits) advance() {
	if m.mask == 1<<7 {
		m.bytep = (*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(m.bytep)) + 1))
		m.mask = 1
	} else {
		m.mask = m.mask << 1
	}
	m.index++
}

// heapBitsForAddr returns the heapBits for the address addr.
// The caller must have already checked that addr is in the range [mheap_.arena_start, mheap_.arena_used).
//
// nosplit because it is used during write barriers and must not be preempted.
//go:nosplit
func heapBitsForAddr(addr uintptr) heapBits {
	// 2 bits per word, 4 pairs per byte, and the mask is hard coded.
	off := (addr - mheap_.arena_start) / sys.PtrSize
	return heapBits{(*uint8)(unsafe.Pointer(mheap_.bitmap - off/4 - 1)), uint32(off & 3)}
}
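
// For illustration (not in the original source): with sys.PtrSize == 8,
// the word at mheap_.arena_start+40 has off = 5, so its entry lives in
// the byte at mheap_.bitmap - 5/4 - 1 = mheap_.bitmap - 2 at shift
// 5&3 == 1: its pointer bit is bitPointer<<1 and its scan/mark bit is
// bitMarked<<1.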

// heapBitsForSpan returns the heapBits for the span base address base.
func heapBitsForSpan(base uintptr) (hbits heapBits) {
	if base < mheap_.arena_start || base >= mheap_.arena_used {
		throw("heapBitsForSpan: base out of range")
	}
	return heapBitsForAddr(base)
}

// heapBitsForObject returns the base address for the heap object
// containing the address p, the heapBits for base,
// the object's span, and the index of the object in s.
// If p does not point into a heap object, it returns base == 0;
// otherwise it returns the base of the object.
//
// refBase and refOff optionally give the base address of the object
// in which the pointer p was found and the byte offset at which it
// was found. These are used for error reporting.
func heapBitsForObject(p, refBase, refOff uintptr) (base uintptr, hbits heapBits, s *mspan, objIndex uintptr) {
	arenaStart := mheap_.arena_start
	if p < arenaStart || p >= mheap_.arena_used {
		return
	}
	off := p - arenaStart
	idx := off >> _PageShift
	// p points into the heap, but possibly to the middle of an object.
	// Consult the span table to find the block beginning.
	s = h_spans[idx]
	if s == nil || p < s.base() || p >= s.limit || s.state != mSpanInUse {
		if s == nil || s.state == _MSpanStack {
			// If s is nil, the virtual address has never been part of the heap.
			// This pointer may be to some mmap'd region, so we allow it.
			// Pointers into stacks are also ok, the runtime manages these explicitly.
			return
		}

		// The following ensures that we are rigorous about what data
		// structures hold valid pointers.
		if debug.invalidptr != 0 {
			// Typically this indicates an incorrect use
			// of unsafe or cgo to store a bad pointer in
			// the Go heap. It may also indicate a runtime
			// bug.
			//
			// TODO(austin): We could be more aggressive
			// and detect pointers to unallocated objects
			// in allocated spans.
			printlock()
			print("runtime: pointer ", hex(p))
			if s.state != mSpanInUse {
				print(" to unallocated span")
			} else {
				print(" to unused region of span")
			}
			print(" idx=", hex(idx), " span.base()=", hex(s.base()), " span.limit=", hex(s.limit), " span.state=", s.state, "\n")
			if refBase != 0 {
				print("runtime: found in object at *(", hex(refBase), "+", hex(refOff), ")\n")
				gcDumpObject("object", refBase, refOff)
			}
			throw("found bad pointer in Go heap (incorrect use of unsafe or cgo?)")
		}
		return
	}
	// If this span holds objects of a power-of-2 size, just mask off the bits to
	// the interior of the object. Otherwise use the size to get the base.
	if s.baseMask != 0 {
		// optimize for power of 2 sized objects.
		base = s.base()
		base = base + (p-base)&s.baseMask
		objIndex = (base - s.base()) >> s.divShift
		// base = p & s.baseMask is faster for small spans,
		// but doesn't work for large spans.
		// Overall, it's faster to use the more general computation above.
	} else {
		base = s.base()
		if p-base >= s.elemsize {
			// n := (p - base) / s.elemsize, using division by multiplication
			objIndex = uintptr(uint64(p-base) >> s.divShift * uint64(s.divMul) >> s.divShift2)
			base += objIndex * s.elemsize
		}
	}
	// Now that we know the actual base, compute heapBits to return to caller.
	hbits = heapBitsForAddr(base)
	return
}

// prefetch the bits.
func (h heapBits) prefetch() {
	prefetchnta(uintptr(unsafe.Pointer((h.bitp))))
}

// next returns the heapBits describing the next pointer-sized word in memory.
// That is, if h describes address p, h.next() describes p+ptrSize.
// Note that next does not modify h. The caller must record the result.
//
// nosplit because it is used during write barriers and must not be preempted.
//go:nosplit
func (h heapBits) next() heapBits {
	if h.shift < 3*heapBitsShift {
		return heapBits{h.bitp, h.shift + heapBitsShift}
	}
	return heapBits{subtract1(h.bitp), 0}
}

// forward returns the heapBits describing n pointer-sized words ahead of h in memory.
// That is, if h describes address p, h.forward(n) describes p+n*ptrSize.
// h.forward(1) is equivalent to h.next(), just slower.
// Note that forward does not modify h. The caller must record the result.
func (h heapBits) forward(n uintptr) heapBits {
	n += uintptr(h.shift) / heapBitsShift
	return heapBits{subtractb(h.bitp, n/4), uint32(n%4) * heapBitsShift}
}
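
// For example (illustrative): if h has shift 2, h.forward(3) computes
// n = 3 + 2 = 5, so the result is one bitmap byte back (subtractb(h.bitp, 1))
// at shift (5%4)*heapBitsShift == 1 -- the same position that
// h.next().next().next() reaches one step at a time.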

// bits returns the heap bits for the current word.
// The caller can test isMarked and isPointer by &-ing with bitMarked and bitPointer.
// The result includes in its higher bits the bits for subsequent words
// described by the same bitmap byte.
func (h heapBits) bits() uint32 {
	// The (shift & 31) eliminates a test and conditional branch
	// from the generated code.
	return uint32(*h.bitp) >> (h.shift & 31)
}

// morePointers reports whether this word or any later word in the object
// may contain a pointer; it returns false once the remaining words are all
// scalars (the 00 ``dead'' encoding).
// h must not describe the second word of the object.
func (h heapBits) morePointers() bool {
	return h.bits()&bitMarked != 0
}

// isPointer reports whether the heap bits describe a pointer word.
//
// nosplit because it is used during write barriers and must not be preempted.
//go:nosplit
func (h heapBits) isPointer() bool {
	return h.bits()&bitPointer != 0
}

// hasPointers reports whether the given object has any pointers.
// It must be told how large the object at h is for efficiency.
// h must describe the initial word of the object.
func (h heapBits) hasPointers(size uintptr) bool {
	if size == sys.PtrSize { // 1-word objects are always pointers
		return true
	}
	return (*h.bitp>>h.shift)&bitMarked != 0
}

// isCheckmarked reports whether the heap bits have the checkmarked bit set.
// It must be told how large the object at h is, because the encoding of the
// checkmark bit varies by size.
// h must describe the initial word of the object.
func (h heapBits) isCheckmarked(size uintptr) bool {
	if size == sys.PtrSize {
		return (*h.bitp>>h.shift)&bitPointer != 0
	}
	// All multiword objects are 2-word aligned,
	// so we know that the initial word's 2-bit pair
	// and the second word's 2-bit pair are in the
	// same heap bitmap byte, *h.bitp.
	return (*h.bitp>>(heapBitsShift+h.shift))&bitMarked != 0
}

// setCheckmarked sets the checkmarked bit.
// It must be told how large the object at h is, because the encoding of the
// checkmark bit varies by size.
// h must describe the initial word of the object.
func (h heapBits) setCheckmarked(size uintptr) {
	if size == sys.PtrSize {
		atomic.Or8(h.bitp, bitPointer<<h.shift)
		return
	}
	atomic.Or8(h.bitp, bitMarked<<(heapBitsShift+h.shift))
}

// heapBitsBulkBarrier executes writebarrierptr_nostore
// for every pointer slot in the memory range [p, p+size),
// using the heap, data, or BSS bitmap to locate those pointer slots.
// This executes the write barriers necessary after a memmove.
// Both p and size must be pointer-aligned.
// The range [p, p+size) must lie within a single object.
//
// Callers should call heapBitsBulkBarrier immediately after
// calling memmove(p, src, size). This function is marked nosplit
// to avoid being preempted; the GC must not stop the goroutine
// between the memmove and the execution of the barriers.
//
// The heap bitmap is not maintained for allocations containing
// no pointers at all; any caller of heapBitsBulkBarrier must first
// make sure the underlying allocation contains pointers, usually
// by checking typ.kind&kindNoPointers.
//
//go:nosplit
func heapBitsBulkBarrier(p, size uintptr) {
	if (p|size)&(sys.PtrSize-1) != 0 {
		throw("heapBitsBulkBarrier: unaligned arguments")
	}
	if !writeBarrier.needed {
		return
	}
	if !inheap(p) {
		// If p is on the stack and in a higher frame than the
		// caller, we either need to execute write barriers on
		// it (which is what happens for normal stack writes
		// through pointers to higher frames), or we need to
		// force the mark termination stack scan to scan the
		// frame containing p.
		//
		// Executing write barriers on p is complicated in the
		// general case because we either need to unwind the
		// stack to get the stack map, or we need the type's
		// bitmap, which may be a GC program.
		//
		// Hence, we opt for forcing the re-scan to scan the
		// frame containing p, which we can do by simply
		// unwinding the stack barriers between the current SP
		// and p's frame.
		gp := getg().m.curg
		if gp != nil && gp.stack.lo <= p && p < gp.stack.hi {
			// Run on the system stack to give it more
			// stack space.
			systemstack(func() {
				gcUnwindBarriers(gp, p)
			})
			return
		}

		// If p is a global, use the data or BSS bitmaps to
		// execute write barriers.
		for datap := &firstmoduledata; datap != nil; datap = datap.next {
			if datap.data <= p && p < datap.edata {
				bulkBarrierBitmap(p, size, p-datap.data, datap.gcdatamask.bytedata)
				return
			}
		}
		for datap := &firstmoduledata; datap != nil; datap = datap.next {
			if datap.bss <= p && p < datap.ebss {
				bulkBarrierBitmap(p, size, p-datap.bss, datap.gcbssmask.bytedata)
				return
			}
		}
		return
	}

	h := heapBitsForAddr(p)
	for i := uintptr(0); i < size; i += sys.PtrSize {
		if h.isPointer() {
			x := (*uintptr)(unsafe.Pointer(p + i))
			writebarrierptr_nostore(x, *x)
		}
		h = h.next()
	}
}
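
// A typical caller looks like typedmemmove (a sketch reconstructed from
// the contract above, not code quoted from this file):
//
//	memmove(dst, src, typ.size)
//	if typ.kind&kindNoPointers == 0 {
//		heapBitsBulkBarrier(uintptr(dst), typ.size)
//	}
//
// i.e. the barriers run immediately after the copy, with no preemption
// point in between.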

// bulkBarrierBitmap executes write barriers for [p, p+size) using a
// 1-bit pointer bitmap. p is assumed to start maskOffset bytes into
// the data covered by the bitmap in bits.
//
// This is used by heapBitsBulkBarrier for writes to data and BSS.
//
//go:nosplit
func bulkBarrierBitmap(p, size, maskOffset uintptr, bits *uint8) {
	word := maskOffset / sys.PtrSize
	bits = addb(bits, word/8)
	mask := uint8(1) << (word % 8)

	for i := uintptr(0); i < size; i += sys.PtrSize {
		if mask == 0 {
			bits = addb(bits, 1)
			if *bits == 0 {
				// Skip 8 words.
				i += 7 * sys.PtrSize
				continue
			}
			mask = 1
		}
		if *bits&mask != 0 {
			x := (*uintptr)(unsafe.Pointer(p + i))
			writebarrierptr_nostore(x, *x)
		}
		mask <<= 1
	}
}

// typeBitsBulkBarrier executes writebarrierptr_nostore
// for every pointer slot in the memory range [p, p+size),
// using the type bitmap to locate those pointer slots.
// The type typ must correspond exactly to [p, p+size).
// This executes the write barriers necessary after a copy.
// Both p and size must be pointer-aligned.
// The type typ must have a plain bitmap, not a GC program.
// The only use of this function is in channel sends, and the
// 64 kB channel element limit takes care of this for us.
//
// Must not be preempted because it typically runs right after memmove,
// and the GC must not complete between those two.
//
//go:nosplit
func typeBitsBulkBarrier(typ *_type, p, size uintptr) {
	if typ == nil {
		throw("runtime: typeBitsBulkBarrier without type")
	}
	if typ.size != size {
		println("runtime: typeBitsBulkBarrier with type ", typ.string(), " of size ", typ.size, " but memory size", size)
		throw("runtime: invalid typeBitsBulkBarrier")
	}
	if typ.kind&kindGCProg != 0 {
		println("runtime: typeBitsBulkBarrier with type ", typ.string(), " with GC prog")
		throw("runtime: invalid typeBitsBulkBarrier")
	}
	if !writeBarrier.needed {
		return
	}
	ptrmask := typ.gcdata
	var bits uint32
	for i := uintptr(0); i < typ.ptrdata; i += sys.PtrSize {
		if i&(sys.PtrSize*8-1) == 0 {
			bits = uint32(*ptrmask)
			ptrmask = addb(ptrmask, 1)
		} else {
			bits = bits >> 1
		}
		if bits&1 != 0 {
			x := (*uintptr)(unsafe.Pointer(p + i))
			writebarrierptr_nostore(x, *x)
		}
	}
}

// The methods operating on spans all require that h has been returned
// by heapBitsForSpan and that size, n, total are the span layout description
// returned by the mspan's layout method.
// If total > size*n, it means that there is extra leftover memory in the span,
// usually due to rounding.
//
// TODO(rsc): Perhaps introduce a different heapBitsSpan type.

// initSpan initializes the heap bitmap for a span.
// It clears all checkmark bits.
// If this is a span of pointer-sized objects, it initializes all
// words to pointer/scan.
// Otherwise, it initializes all words to scalar/dead.
func (h heapBits) initSpan(s *mspan) {
	size, n, total := s.layout()

	// Init the markbit structures
	s.freeindex = 0
	s.allocCache = ^uint64(0) // all 1s indicating all free.
	s.nelems = n
	s.allocBits = nil
	s.gcmarkBits = nil
	s.gcmarkBits = newMarkBits(s.nelems)
	s.allocBits = newAllocBits(s.nelems)

	// Clear bits corresponding to objects.
	if total%heapBitmapScale != 0 {
		throw("initSpan: unaligned length")
	}
	nbyte := total / heapBitmapScale
	if sys.PtrSize == 8 && size == sys.PtrSize {
		end := h.bitp
		bitp := subtractb(end, nbyte-1)
		for {
			*bitp = bitPointerAll | bitMarkedAll
			if bitp == end {
				break
			}
			bitp = add1(bitp)
		}
		return
	}
	memclr(unsafe.Pointer(subtractb(h.bitp, nbyte-1)), nbyte)
}

// initCheckmarkSpan initializes a span for being checkmarked.
// It clears the checkmark bits, which are set to 1 in normal operation.
func (h heapBits) initCheckmarkSpan(size, n, total uintptr) {
	// The sys.PtrSize == 8 test is a compile-time constant, so on 32-bit
	// systems this code is eliminated entirely.
	if sys.PtrSize == 8 && size == sys.PtrSize {
		// Checkmark bit is type bit, bottom bit of every 2-bit entry.
		// Only possible on 64-bit system, since minimum size is 8.
		// Must clear type bit (checkmark bit) of every word.
		// The type bit is the lower of every two-bit pair.
		bitp := h.bitp
		for i := uintptr(0); i < n; i += 4 {
			*bitp &^= bitPointerAll
			bitp = subtract1(bitp)
		}
		return
	}
	for i := uintptr(0); i < n; i++ {
		*h.bitp &^= bitMarked << (heapBitsShift + h.shift)
		h = h.forward(size / sys.PtrSize)
	}
}

// clearCheckmarkSpan undoes all the checkmarking in a span.
// The actual checkmark bits are ignored, so the only work to do
// is to fix the pointer bits. (Pointer bits are ignored by scanobject
// but consulted by typedmemmove.)
func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) {
	// The sys.PtrSize == 8 test is a compile-time constant, so on 32-bit
	// systems this code is eliminated entirely.
	if sys.PtrSize == 8 && size == sys.PtrSize {
		// Checkmark bit is type bit, bottom bit of every 2-bit entry.
		// Only possible on 64-bit system, since minimum size is 8.
		// Must set type bit (checkmark bit) of every word back to 1.
		// The type bit is the lower of every two-bit pair.
		bitp := h.bitp
		for i := uintptr(0); i < n; i += 4 {
			*bitp |= bitPointerAll
			bitp = subtract1(bitp)
		}
	}
}

// oneBitCount is indexed by byte and produces the
// number of 1 bits in that byte. For example, 128 has 1 bit set
// and oneBitCount[128] will hold 1.
var oneBitCount = [256]uint8{
	0, 1, 1, 2, 1, 2, 2, 3,
	1, 2, 2, 3, 2, 3, 3, 4,
	1, 2, 2, 3, 2, 3, 3, 4,
	2, 3, 3, 4, 3, 4, 4, 5,
	1, 2, 2, 3, 2, 3, 3, 4,
	2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	1, 2, 2, 3, 2, 3, 3, 4,
	2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6,
	4, 5, 5, 6, 5, 6, 6, 7,
	1, 2, 2, 3, 2, 3, 3, 4,
	2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6,
	4, 5, 5, 6, 5, 6, 6, 7,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6,
	4, 5, 5, 6, 5, 6, 6, 7,
	3, 4, 4, 5, 4, 5, 5, 6,
	4, 5, 5, 6, 5, 6, 6, 7,
	4, 5, 5, 6, 5, 6, 6, 7,
	5, 6, 6, 7, 6, 7, 7, 8}

// countFree runs through the mark bits in a span and counts the number of free objects
// in the span.
// TODO(rlh): Use popcount intrinsic.
func (s *mspan) countFree() int {
	count := 0
	maxIndex := s.nelems / 8
	for i := uintptr(0); i < maxIndex; i++ {
		mrkBits := *addb(s.gcmarkBits, i)
		count += int(oneBitCount[mrkBits])
	}
	if bitsInLastByte := s.nelems % 8; bitsInLastByte != 0 {
		mrkBits := *addb(s.gcmarkBits, maxIndex)
		mask := uint8((1 << bitsInLastByte) - 1)
		bits := mrkBits & mask
		count += int(oneBitCount[bits])
	}
	return int(s.nelems) - count
}

// heapBitsSetType records that the new allocation [x, x+size)
// holds in [x, x+dataSize) one or more values of type typ.
// (The number of values is given by dataSize / typ.size.)
// If dataSize < size, the fragment [x+dataSize, x+size) is
// recorded as non-pointer data.
// It is known that the type has pointers somewhere;
// malloc does not call heapBitsSetType when there are no pointers,
// because all free objects are marked as noscan during
// heapBitsSweepSpan.
//
// There can only be one allocation from a given span active at a time,
// and the bitmap for a span always falls on byte boundaries,
// so there are no write-write races for access to the heap bitmap.
// Hence, heapBitsSetType can access the bitmap without atomics.
//
// There can be read-write races between heapBitsSetType and things
// that read the heap bitmap like scanobject. However, since
// heapBitsSetType is only used for objects that have not yet been
// made reachable, readers will ignore bits being modified by this
// function. This does mean this function cannot transiently modify
// bits that belong to neighboring objects. Also, on weakly-ordered
// machines, callers must execute a store/store (publication) barrier
// between calling this function and making the object reachable.
//
// TODO: This still has atomic accesses left over from when it could
// race with GC accessing mark bits in the bitmap. Remove these.
func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
	const doubleCheck = false // slow but helpful; enable to test modifications to this code

	// dataSize is always size rounded up to the next malloc size class,
	// except in the case of allocating a defer block, in which case
	// size is sizeof(_defer{}) (at least 6 words) and dataSize may be
	// arbitrarily larger.
	//
	// The checks for size == sys.PtrSize and size == 2*sys.PtrSize can therefore
	// assume that dataSize == size without checking it explicitly.

	if sys.PtrSize == 8 && size == sys.PtrSize {
		// It's one word and it has pointers, so it must be a pointer.
		// In general we'd need an atomic update here if the
		// concurrent GC were marking objects in this span,
		// because each bitmap byte describes 3 other objects
		// in addition to the one being allocated.
		// However, since all allocated one-word objects are pointers
		// (non-pointers are aggregated into tinySize allocations),
		// initSpan sets the pointer bits for us. Nothing to do here.
		if doubleCheck {
			h := heapBitsForAddr(x)
			if !h.isPointer() {
				throw("heapBitsSetType: pointer bit missing")
			}
			if !h.morePointers() {
				throw("heapBitsSetType: scan bit missing")
			}
		}
		return
	}

	h := heapBitsForAddr(x)
	ptrmask := typ.gcdata // start of 1-bit pointer mask (or GC program, handled below)

	// Heap bitmap bits for a 2-word object are only 4 bits,
	// so they are shared with the bits for the neighboring objects;
	// use atomic updates. This is called out as a special case
	// primarily for 32-bit systems, so that on 32-bit systems the
	// code below can assume all objects are 4-word aligned (because
	// they're all 16-byte aligned).
	if size == 2*sys.PtrSize {
		if typ.size == sys.PtrSize {
			// We're allocating a block big enough to hold two pointers.
			// On 64-bit, that means the actual object must be two pointers,
			// or else we'd have used the one-pointer-sized block.
			// On 32-bit, however, this is the 8-byte block, the smallest one.
			// So it could be that we're allocating one pointer and this was
			// just the smallest block available. Distinguish by checking dataSize.
			// (In general the number of instances of typ being allocated is
			// dataSize/typ.size.)
			if sys.PtrSize == 4 && dataSize == sys.PtrSize {
				// 1 pointer object. On 32-bit machines clear the bit for the
				// unused second word.
				if gcphase == _GCoff {
					*h.bitp &^= (bitPointer | bitMarked | ((bitPointer | bitMarked) << heapBitsShift)) << h.shift
					*h.bitp |= (bitPointer | bitMarked) << h.shift
				} else {
					atomic.And8(h.bitp, ^uint8((bitPointer|bitMarked|((bitPointer|bitMarked)<<heapBitsShift))<<h.shift))
					atomic.Or8(h.bitp, (bitPointer|bitMarked)<<h.shift)
				}
			} else {
				// 2-element slice of pointer.
				if gcphase == _GCoff {
					*h.bitp |= (bitPointer | bitMarked | bitPointer<<heapBitsShift) << h.shift
				} else {
					atomic.Or8(h.bitp, (bitPointer|bitMarked|bitPointer<<heapBitsShift)<<h.shift)
				}
			}
			return
		}
		// Otherwise typ.size must be 2*sys.PtrSize,
		// and typ.kind&kindGCProg == 0.
		if doubleCheck {
			if typ.size != 2*sys.PtrSize || typ.kind&kindGCProg != 0 {
				print("runtime: heapBitsSetType size=", size, " but typ.size=", typ.size, " gcprog=", typ.kind&kindGCProg != 0, "\n")
				throw("heapBitsSetType")
			}
		}
		b := uint32(*ptrmask)
		hb := (b & 3) | bitMarked
		if gcphase == _GCoff {
			// bitPointer == 1, bitMarked is 1 << 4, heapBitsShift is 1.
			// The mask 110011 (binary) is shifted left by h.shift and
			// complemented. This clears out the bits that are about to be
			// ored into *h.bitp in the next instructions.
			*h.bitp &^= (bitPointer | bitMarked | ((bitPointer | bitMarked) << heapBitsShift)) << h.shift
			*h.bitp |= uint8(hb << h.shift)
		} else {
			// TODO(rlh): since the GC is not concurrently setting the
			// mark bits in the heap map anymore and malloc
			// owns the span we are allocating in why does this have
			// to be atomic?

			atomic.And8(h.bitp, ^uint8((bitPointer|bitMarked|((bitPointer|bitMarked)<<heapBitsShift))<<h.shift))
			atomic.Or8(h.bitp, uint8(hb<<h.shift))
		}
		return
	}

	// Copy from 1-bit ptrmask into 2-bit bitmap.
	// The basic approach is to use a single uintptr as a bit buffer,
	// alternating between reloading the buffer and writing bitmap bytes.
	// In general, one load can supply two bitmap byte writes.
	// This is a lot of lines of code, but it compiles into relatively few
	// machine instructions.

	var (
		// Ptrmask input.
		p     *byte   // last ptrmask byte read
		b     uintptr // ptrmask bits already loaded
		nb    uintptr // number of bits in b at next read
		endp  *byte   // final ptrmask byte to read (then repeat)
		endnb uintptr // number of valid bits in *endp
		pbits uintptr // alternate source of bits

		// Heap bitmap output.
		w     uintptr // words processed
		nw    uintptr // number of words to process
		hbitp *byte   // next heap bitmap byte to write
		hb    uintptr // bits being prepared for *hbitp
	)

	hbitp = h.bitp

	// Handle GC program. Delayed until this part of the code
	// so that we can use the same double-checking mechanism
	// as the 1-bit case. Nothing above could have encountered
	// GC programs: the cases were all too small.
	if typ.kind&kindGCProg != 0 {
		heapBitsSetTypeGCProg(h, typ.ptrdata, typ.size, dataSize, size, addb(typ.gcdata, 4))
		if doubleCheck {
			// Double-check the heap bits written by GC program
			// by running the GC program to create a 1-bit pointer mask
			// and then jumping to the double-check code below.
			// This doesn't catch bugs shared between the 1-bit and 4-bit
			// GC program execution, but it does catch mistakes specific
			// to just one of those and bugs in heapBitsSetTypeGCProg's
			// implementation of arrays.
			lock(&debugPtrmask.lock)
			if debugPtrmask.data == nil {
				debugPtrmask.data = (*byte)(persistentalloc(1<<20, 1, &memstats.other_sys))
			}
			ptrmask = debugPtrmask.data
			runGCProg(addb(typ.gcdata, 4), nil, ptrmask, 1)
			goto Phase4
		}
		return
	}

	// Note about sizes:
	//
	// typ.size is the number of bytes in the object,
	// and typ.ptrdata is the number of bytes in the prefix
	// of the object that contains pointers. That is, the final
	// typ.size - typ.ptrdata bytes contain no pointers.
	// This allows optimization of a common pattern where
	// an object has a small header followed by a large scalar
	// buffer. If we know the pointers are over, we don't have
	// to scan the buffer's heap bitmap at all.
	// The 1-bit ptrmasks are sized to contain only bits for
	// the typ.ptrdata prefix, zero padded out to a full byte
	// of bitmap. This code sets nw (below) so that heap bitmap
	// bits are only written for the typ.ptrdata prefix; if there is
	// more room in the allocated object, the next heap bitmap
	// entry is a 00, indicating that there are no more pointers
	// to scan. So only the ptrmask for the ptrdata bytes is needed.
	//
	// Replicated copies are not as nice: if there is an array of
	// objects with scalar tails, all but the last tail does have to
	// be initialized, because there is no way to say "skip forward".
	// However, because of the possibility of a repeated type with
	// size not a multiple of 4 pointers (one heap bitmap byte),
	// the code already must handle the last ptrmask byte specially
	// by treating it as containing only the bits for endnb pointers,
	// where endnb <= 4. We represent large scalar tails that must
	// be expanded in the replication by setting endnb larger than 4.
	// This will have the effect of reading many bits out of b,
	// but once the real bits are shifted out, b will supply as many
	// zero bits as we try to read, which is exactly what we need.
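	//
	// As a worked example of the sizes above (illustrative numbers):
	// with sys.PtrSize == 8, a type with typ.size = 48 (6 words) and
	// typ.ptrdata = 16 (2 words), allocated with dataSize = 144 (3
	// copies), yields nw = ((144/48-1)*48 + 16) / 8 = 14 words below:
	// two full 6-word copies plus the 2-word pointer prefix of the last.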
  1046  
  1047  	p = ptrmask
  1048  	if typ.size < dataSize {
  1049  		// Filling in bits for an array of typ.
  1050  		// Set up for repetition of ptrmask during main loop.
  1051  		// Note that ptrmask describes only a prefix of
  1052  		const maxBits = sys.PtrSize*8 - 7
  1053  		if typ.ptrdata/sys.PtrSize <= maxBits {
  1054  			// Entire ptrmask fits in uintptr with room for a byte fragment.
  1055  			// Load into pbits and never read from ptrmask again.
  1056  			// This is especially important when the ptrmask has
  1057  			// fewer than 8 bits in it; otherwise the reload in the middle
  1058  			// of the Phase 2 loop would itself need to loop to gather
  1059  			// at least 8 bits.
  1060  
  1061  			// Accumulate ptrmask into b.
  1062  			// ptrmask is sized to describe only typ.ptrdata, but we record
  1063  			// it as describing typ.size bytes, since all the high bits are zero.
  1064  			nb = typ.ptrdata / sys.PtrSize
  1065  			for i := uintptr(0); i < nb; i += 8 {
  1066  				b |= uintptr(*p) << i
  1067  				p = add1(p)
  1068  			}
  1069  			nb = typ.size / sys.PtrSize
  1070  
  1071  			// Replicate ptrmask to fill entire pbits uintptr.
  1072  			// Doubling and truncating is fewer steps than
  1073  			// iterating by nb each time. (nb could be 1.)
  1074  			// Since we loaded typ.ptrdata/sys.PtrSize bits
  1075  			// but are pretending to have typ.size/sys.PtrSize,
  1076  			// there might be no replication necessary/possible.
  1077  			pbits = b
  1078  			endnb = nb
  1079  			if nb+nb <= maxBits {
  1080  				for endnb <= sys.PtrSize*8 {
  1081  					pbits |= pbits << endnb
  1082  					endnb += endnb
  1083  				}
  1084  				// Truncate to a multiple of original ptrmask.
  1085  				endnb = maxBits / nb * nb
  1086  				pbits &= 1<<endnb - 1
  1087  				b = pbits
  1088  				nb = endnb
  1089  			}
  1090  
  1091  			// Clear p and endp as sentinel for using pbits.
  1092  			// Checked during Phase 2 loop.
  1093  			p = nil
  1094  			endp = nil
  1095  		} else {
  1096  			// Ptrmask is larger. Read it multiple times.
  1097  			n := (typ.ptrdata/sys.PtrSize+7)/8 - 1
  1098  			endp = addb(ptrmask, n)
  1099  			endnb = typ.size/sys.PtrSize - n*8
  1100  		}
  1101  	}
  1102  	if p != nil {
  1103  		b = uintptr(*p)
  1104  		p = add1(p)
  1105  		nb = 8
  1106  	}
  1107  
  1108  	if typ.size == dataSize {
  1109  		// Single entry: can stop once we reach the non-pointer data.
  1110  		nw = typ.ptrdata / sys.PtrSize
  1111  	} else {
  1112  		// Repeated instances of typ in an array.
  1113  		// Have to process first N-1 entries in full, but can stop
  1114  		// once we reach the non-pointer data in the final entry.
  1115  		nw = ((dataSize/typ.size-1)*typ.size + typ.ptrdata) / sys.PtrSize
  1116  	}
  1117  	if nw == 0 {
  1118  		// No pointers! Caller was supposed to check.
  1119  		println("runtime: invalid type ", typ.string())
  1120  		throw("heapBitsSetType: called with non-pointer type")
  1121  		return
  1122  	}
  1123  	if nw < 2 {
  1124  		// Must write at least 2 words, because the "no scan"
  1125  		// encoding doesn't take effect until the third word.
  1126  		nw = 2
  1127  	}
  1128  
  1129  	// Phase 1: Special case for leading byte (shift==0) or half-byte (shift==4).
  1130  	// The leading byte is special because it contains the bits for word 1,
  1131  	// which does not have the marked bits set.
  1132  	// The leading half-byte is special because it's a half a byte and must be
  1133  	// manipulated atomically.
  1134  	switch {
  1135  	default:
  1136  		throw("heapBitsSetType: unexpected shift")
  1137  
  1138  	case h.shift == 0:
  1139  		// Ptrmask and heap bitmap are aligned.
  1140  		// Handle first byte of bitmap specially.
  1141  		//
  1142  		// The first byte we write out covers the first four
  1143  		// words of the object. The scan/dead bit on the first
  1144  		// word must be set to scan since there are pointers
  1145  		// somewhere in the object. The scan/dead bit on the
  1146  		// second word is the checkmark, so we don't set it.
  1147  		// In all following words, we set the scan/dead
  1148  		// appropriately to indicate that the object contains
  1149  		// to the next 2-bit entry in the bitmap.
  1150  		//
  1151  		// TODO: It doesn't matter if we set the checkmark, so
  1152  		// maybe this case isn't needed any more.
  1153  		hb = b & bitPointerAll
  1154  		hb |= bitMarked | bitMarked<<(2*heapBitsShift) | bitMarked<<(3*heapBitsShift)
  1155  		if w += 4; w >= nw {
  1156  			goto Phase3
  1157  		}
  1158  		*hbitp = uint8(hb)
  1159  		hbitp = subtract1(hbitp)
  1160  		b >>= 4
  1161  		nb -= 4
  1162  
  1163  	case sys.PtrSize == 8 && h.shift == 2:
  1164  		// Ptrmask and heap bitmap are misaligned.
  1165  		// The bits for the first two words are in a byte shared with another object
  1166  		// and must be updated atomically.
  1167  		// NOTE(rsc): The atomic here may not be necessary.
  1168  		// We took care of 1-word and 2-word objects above,
  1169  		// so this is at least a 6-word object, so our start bits
  1170  		// are shared only with the type bits of another object,
  1171  		// not with its mark bit. Since there is only one allocation
  1172  		// from a given span at a time, we should be able to set
  1173  		// these bits non-atomically. Not worth the risk right now.
  1174  		hb = (b & (bitPointer | bitPointer<<heapBitsShift)) << (2 * heapBitsShift)
  1175  		// This is not noscan, so set the scan bit in the
  1176  		// first word.
  1177  		hb |= bitMarked << (2 * heapBitsShift)
  1178  		b >>= 2
  1179  		nb -= 2
  1180  		// Note: no bitMarker for second word because that's
  1181  		// the checkmark.
  1182  		if gcphase == _GCoff {
  1183  			*hbitp &^= uint8((bitPointer | bitMarked | (bitPointer << heapBitsShift)) << (2 * heapBitsShift))
  1184  			*hbitp |= uint8(hb)
  1185  		} else {
  1186  			atomic.And8(hbitp, ^(uint8(bitPointer|bitMarked|bitPointer<<heapBitsShift) << (2 * heapBitsShift)))
  1187  			atomic.Or8(hbitp, uint8(hb))
  1188  		}
  1189  		hbitp = subtract1(hbitp)
  1190  		if w += 2; w >= nw {
  1191  			// We know that there is more data, because we handled 2-word objects above.
  1192  			// This must be at least a 6-word object. If we're out of pointer words,
  1193  			// mark no scan in next bitmap byte and finish.
  1194  			hb = 0
  1195  			w += 4
  1196  			goto Phase3
  1197  		}
  1198  	}
  1199  
  1200  	// Phase 2: Full bytes in bitmap, up to but not including write to last byte (full or partial) in bitmap.
  1201  	// The loop computes the bits for that last write but does not execute the write;
  1202  	// it leaves the bits in hb for processing by phase 3.
  1203  	// To avoid repeated adjustment of nb, we subtract out the 4 bits we're going to
  1204  	// use in the first half of the loop right now, and then we only adjust nb explicitly
  1205  	// if the 8 bits used by each iteration isn't balanced by 8 bits loaded mid-loop.
  1206  	nb -= 4
  1207  	for {
  1208  		// Emit bitmap byte.
  1209  		// b has at least nb+4 bits, with one exception:
  1210  		// if w+4 >= nw, then b has only nw-w bits,
  1211  		// but we'll stop at the break and then truncate
  1212  		// appropriately in Phase 3.
  1213  		hb = b & bitPointerAll
  1214  		hb |= bitMarkedAll
  1215  		if w += 4; w >= nw {
  1216  			break
  1217  		}
  1218  		*hbitp = uint8(hb)
  1219  		hbitp = subtract1(hbitp)
  1220  		b >>= 4
  1221  
  1222  		// Load more bits. b has nb right now.
  1223  		if p != endp {
  1224  			// Fast path: keep reading from ptrmask.
  1225  			// nb unmodified: we just loaded 8 bits,
  1226  			// and the next iteration will consume 8 bits,
  1227  			// leaving us with the same nb the next time we're here.
  1228  			if nb < 8 {
  1229  				b |= uintptr(*p) << nb
  1230  				p = add1(p)
  1231  			} else {
  1232  				// Reduce the number of bits in b.
  1233  				// This is important if we skipped
  1234  				// over a scalar tail, since nb could
  1235  				// be larger than the bit width of b.
  1236  				nb -= 8
  1237  			}
  1238  		} else if p == nil {
  1239  			// Almost as fast path: track bit count and refill from pbits.
  1240  			// For short repetitions.
  1241  			if nb < 8 {
  1242  				b |= pbits << nb
  1243  				nb += endnb
  1244  			}
  1245  			nb -= 8 // for next iteration
  1246  		} else {
  1247  			// Slow path: reached end of ptrmask.
  1248  			// Process final partial byte and rewind to start.
  1249  			b |= uintptr(*p) << nb
  1250  			nb += endnb
  1251  			if nb < 8 {
  1252  				b |= uintptr(*ptrmask) << nb
  1253  				p = add1(ptrmask)
  1254  			} else {
  1255  				nb -= 8
  1256  				p = ptrmask
  1257  			}
  1258  		}
  1259  
  1260  		// Emit bitmap byte.
  1261  		hb = b & bitPointerAll
  1262  		hb |= bitMarkedAll
  1263  		if w += 4; w >= nw {
  1264  			break
  1265  		}
  1266  		*hbitp = uint8(hb)
  1267  		hbitp = subtract1(hbitp)
  1268  		b >>= 4
  1269  	}
  1270  
  1271  Phase3:
  1272  	// Phase 3: Write last byte or partial byte and zero the rest of the bitmap entries.
  1273  	if w > nw {
  1274  		// Counting the 4 entries in hb not yet written to memory,
  1275  		// there are more entries than possible pointer slots.
  1276  		// Discard the excess entries (can't be more than 3).
  1277  		mask := uintptr(1)<<(4-(w-nw)) - 1
  1278  		hb &= mask | mask<<4 // apply mask to both pointer bits and mark bits
  1279  	}
  1280  
  1281  	// Change nw from counting possibly-pointer words to total words in allocation.
  1282  	nw = size / sys.PtrSize
  1283  
  1284  	// Write whole bitmap bytes.
  1285  	// The first is hb, the rest are zero.
  1286  	if w <= nw {
  1287  		*hbitp = uint8(hb)
  1288  		hbitp = subtract1(hbitp)
  1289  		hb = 0 // for possible final half-byte below
  1290  		for w += 4; w <= nw; w += 4 {
  1291  			*hbitp = 0
  1292  			hbitp = subtract1(hbitp)
  1293  		}
  1294  	}
  1295  
  1296  	// Write final partial bitmap byte if any.
  1297  	// We know w > nw, or else we'd still be in the loop above.
  1298  	// It can be bigger only due to the 4 entries in hb that it counts.
  1299  	// If w == nw+4 then there's nothing left to do: we wrote all nw entries
  1300  	// and can discard the 4 sitting in hb.
  1301  	// But if w == nw+2, we need to write the first two entries in hb.
  1302  	// The byte is shared with the next object so we may need an atomic.
  1303  	if w == nw+2 {
  1304  		if gcphase == _GCoff {
  1305  			*hbitp = *hbitp&^(bitPointer|bitMarked|(bitPointer|bitMarked)<<heapBitsShift) | uint8(hb)
  1306  		} else {
  1307  			atomic.And8(hbitp, ^uint8(bitPointer|bitMarked|(bitPointer|bitMarked)<<heapBitsShift))
  1308  			atomic.Or8(hbitp, uint8(hb))
  1309  		}
  1310  	}
  1311  
  1312  Phase4:
  1313  	// Phase 4: all done, but perhaps double check.
  1314  	if doubleCheck {
  1315  		end := heapBitsForAddr(x + size)
  1316  		if typ.kind&kindGCProg == 0 && (hbitp != end.bitp || (w == nw+2) != (end.shift == 2)) {
  1317  			println("ended at wrong bitmap byte for", typ.string(), "x", dataSize/typ.size)
  1318  			print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
  1319  			print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
  1320  			h0 := heapBitsForAddr(x)
  1321  			print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
  1322  			print("ended at hbitp=", hbitp, " but next starts at bitp=", end.bitp, " shift=", end.shift, "\n")
  1323  			throw("bad heapBitsSetType")
  1324  		}
  1325  
  1326  		// Double-check that bits to be written were written correctly.
  1327  		// Does not check that other bits were not written, unfortunately.
  1328  		h := heapBitsForAddr(x)
  1329  		nptr := typ.ptrdata / sys.PtrSize
  1330  		ndata := typ.size / sys.PtrSize
  1331  		count := dataSize / typ.size
  1332  		totalptr := ((count-1)*typ.size + typ.ptrdata) / sys.PtrSize
  1333  		for i := uintptr(0); i < size/sys.PtrSize; i++ {
  1334  			j := i % ndata
  1335  			var have, want uint8
  1336  			have = (*h.bitp >> h.shift) & (bitPointer | bitMarked)
  1337  			if i >= totalptr {
  1338  				want = 0 // deadmarker
  1339  				if typ.kind&kindGCProg != 0 && i < (totalptr+3)/4*4 {
  1340  					want = bitMarked
  1341  				}
  1342  			} else {
  1343  				if j < nptr && (*addb(ptrmask, j/8)>>(j%8))&1 != 0 {
  1344  					want |= bitPointer
  1345  				}
  1346  				if i != 1 {
  1347  					want |= bitMarked
  1348  				} else {
  1349  					have &^= bitMarked
  1350  				}
  1351  			}
  1352  			if have != want {
  1353  				println("mismatch writing bits for", typ.string(), "x", dataSize/typ.size)
  1354  				print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
  1355  				print("kindGCProg=", typ.kind&kindGCProg != 0, "\n")
  1356  				print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
  1357  				h0 := heapBitsForAddr(x)
  1358  				print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
  1359  				print("current bits h.bitp=", h.bitp, " h.shift=", h.shift, " *h.bitp=", hex(*h.bitp), "\n")
  1360  				print("ptrmask=", ptrmask, " p=", p, " endp=", endp, " endnb=", endnb, " pbits=", hex(pbits), " b=", hex(b), " nb=", nb, "\n")
  1361  				println("at word", i, "offset", i*sys.PtrSize, "have", have, "want", want)
  1362  				if typ.kind&kindGCProg != 0 {
  1363  					println("GC program:")
  1364  					dumpGCProg(addb(typ.gcdata, 4))
  1365  				}
  1366  				throw("bad heapBitsSetType")
  1367  			}
  1368  			h = h.next()
  1369  		}
  1370  		if ptrmask == debugPtrmask.data {
  1371  			unlock(&debugPtrmask.lock)
  1372  		}
  1373  	}
  1374  }
  1375  
  1376  // heapBitsSetTypeNoScan marks x as noscan by setting the first word
  1377  // of x in the heap bitmap to scalar/dead.
  1378  func heapBitsSetTypeNoScan(x uintptr) {
  1379  	h := heapBitsForAddr(x)
  1380  	*h.bitp &^= (bitPointer | bitMarked) << h.shift
  1381  }
  1382  
  1383  var debugPtrmask struct {
  1384  	lock mutex
  1385  	data *byte
  1386  }
  1387  
  1388  // heapBitsSetTypeGCProg implements heapBitsSetType using a GC program.
  1389  // progSize is the size of the memory described by the program.
  1390  // elemSize is the size of the element; the GC program describes a prefix of it.
  1391  // dataSize is the total size of the intended data, a multiple of elemSize.
  1392  // allocSize is the total size of the allocated memory.
  1393  //
  1394  // GC programs are only used for large allocations.
  1395  // heapBitsSetType requires that allocSize is a multiple of 4 words,
  1396  // so that the relevant bitmap bytes are not shared with surrounding
  1397  // objects and need not be accessed with atomic instructions.
  1398  func heapBitsSetTypeGCProg(h heapBits, progSize, elemSize, dataSize, allocSize uintptr, prog *byte) {
  1399  	if sys.PtrSize == 8 && allocSize%(4*sys.PtrSize) != 0 {
  1400  		// Alignment will be wrong.
  1401  		throw("heapBitsSetTypeGCProg: small allocation")
  1402  	}
  1403  	var totalBits uintptr
  1404  	if elemSize == dataSize {
  1405  		totalBits = runGCProg(prog, nil, h.bitp, 2)
  1406  		if totalBits*sys.PtrSize != progSize {
  1407  			println("runtime: heapBitsSetTypeGCProg: total bits", totalBits, "but progSize", progSize)
  1408  			throw("heapBitsSetTypeGCProg: unexpected bit count")
  1409  		}
  1410  	} else {
  1411  		count := dataSize / elemSize
  1412  
  1413  		// Piece together program trailer to run after prog that does:
  1414  		//	literal(0)
  1415  		//	repeat(1, elemSize-progSize-1) // zeros to fill element size
  1416  		//	repeat(elemSize, count-1) // repeat that element for count
  1417  		// This zero-pads the data remaining in the first element and then
  1418  		// repeats that first element to fill the array.
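        		// For example (an illustrative hand assembly, assuming 8-byte words):
        		// with elemSize = 48, progSize = 16, and count = 4, the padding is
        		// 6-2 = 4 words, and the trailer bytes would be:
        		//	01 00     literal(0): one zero bit
        		//	81 03     repeat(1, 3): three more zero bits
        		//	80 06 03  repeat(6, 3): repeat the 6-word element 3 more times
        		//	00        stop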
  1419  		var trailer [40]byte // 3 varints (max 10 each) + some bytes
  1420  		i := 0
  1421  		if n := elemSize/sys.PtrSize - progSize/sys.PtrSize; n > 0 {
  1422  			// literal(0)
  1423  			trailer[i] = 0x01
  1424  			i++
  1425  			trailer[i] = 0
  1426  			i++
  1427  			if n > 1 {
  1428  				// repeat(1, n-1)
  1429  				trailer[i] = 0x81
  1430  				i++
  1431  				n--
  1432  				for ; n >= 0x80; n >>= 7 {
  1433  					trailer[i] = byte(n | 0x80)
  1434  					i++
  1435  				}
  1436  				trailer[i] = byte(n)
  1437  				i++
  1438  			}
  1439  		}
  1440  		// repeat(elemSize/ptrSize, count-1)
  1441  		trailer[i] = 0x80
  1442  		i++
  1443  		n := elemSize / sys.PtrSize
  1444  		for ; n >= 0x80; n >>= 7 {
  1445  			trailer[i] = byte(n | 0x80)
  1446  			i++
  1447  		}
  1448  		trailer[i] = byte(n)
  1449  		i++
  1450  		n = count - 1
  1451  		for ; n >= 0x80; n >>= 7 {
  1452  			trailer[i] = byte(n | 0x80)
  1453  			i++
  1454  		}
  1455  		trailer[i] = byte(n)
  1456  		i++
  1457  		trailer[i] = 0
  1458  		i++
  1459  
  1460  		runGCProg(prog, &trailer[0], h.bitp, 2)
  1461  
  1462  		// Even though we filled in the full array just now,
  1463  		// record that we only filled in up to the ptrdata of the
  1464  		// last element. This will cause the code below to
  1465  		// memclr the dead section of the final array element,
  1466  		// so that scanobject can stop early in the final element.
  1467  		totalBits = (elemSize*(count-1) + progSize) / sys.PtrSize
  1468  	}
  1469  	endProg := unsafe.Pointer(subtractb(h.bitp, (totalBits+3)/4))
  1470  	endAlloc := unsafe.Pointer(subtractb(h.bitp, allocSize/heapBitmapScale))
  1471  	memclr(add(endAlloc, 1), uintptr(endProg)-uintptr(endAlloc))
  1472  }
  1473  
  1474  // progToPointerMask returns the 1-bit pointer mask output by the GC program prog.
  1475  // size is the size of the region described by prog, in bytes.
  1476  // The resulting bitvector will have no more than size/sys.PtrSize bits.
  1477  func progToPointerMask(prog *byte, size uintptr) bitvector {
  1478  	n := (size/sys.PtrSize + 7) / 8
  1479  	x := (*[1 << 30]byte)(persistentalloc(n+1, 1, &memstats.buckhash_sys))[:n+1]
  1480  	x[len(x)-1] = 0xa1 // overflow check sentinel
  1481  	n = runGCProg(prog, nil, &x[0], 1)
  1482  	if x[len(x)-1] != 0xa1 {
  1483  		throw("progToPointerMask: overflow")
  1484  	}
  1485  	return bitvector{int32(n), &x[0]}
  1486  }
  1487  
  1488  // Packed GC pointer bitmaps, aka GC programs.
  1489  //
  1490  // For large types containing arrays, the type information has a
  1491  // natural repetition that can be encoded to save space in the
  1492  // binary and in the memory representation of the type information.
  1493  //
  1494  // The encoding is a simple Lempel-Ziv style bytecode machine
  1495  // with the following instructions:
  1496  //
  1497  //	00000000: stop
  1498  //	0nnnnnnn: emit n bits copied from the next (n+7)/8 bytes
  1499  //	10000000 n c: repeat the previous n bits c times; n, c are varints
  1500  //	1nnnnnnn c: repeat the previous n bits c times; c is a varint
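        //
        // For example (an illustrative, hand-assembled program): 100 consecutive
        // pointer words could be encoded as
        //
        //	01 01  emit 1 bit from the next byte: a single 1 (pointer) bit
        //	81 63  repeat the previous 1 bit 99 (0x63) times
        //	00     stop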
  1501  
  1502  // runGCProg executes the GC program prog, and then trailer if non-nil,
  1503  // writing to dst with entries of the given size.
  1504  // If size == 1, dst is a 1-bit pointer mask laid out moving forward from dst.
  1505  // If size == 2, dst is the 2-bit heap bitmap, and writes move backward
  1506  // starting at dst (because the heap bitmap does). In this case, the caller guarantees
  1507  // that only whole bytes in dst need to be written.
  1508  //
  1509  // runGCProg returns the number of 1- or 2-bit entries written to memory.
  1510  func runGCProg(prog, trailer, dst *byte, size int) uintptr {
  1511  	dstStart := dst
  1512  
  1513  	// Bits waiting to be written to memory.
  1514  	var bits uintptr
  1515  	var nbits uintptr
  1516  
  1517  	p := prog
  1518  Run:
  1519  	for {
  1520  		// Flush accumulated full bytes.
  1521  		// The rest of the loop assumes that nbits <= 7.
  1522  		for ; nbits >= 8; nbits -= 8 {
  1523  			if size == 1 {
  1524  				*dst = uint8(bits)
  1525  				dst = add1(dst)
  1526  				bits >>= 8
  1527  			} else {
  1528  				v := bits&bitPointerAll | bitMarkedAll
  1529  				*dst = uint8(v)
  1530  				dst = subtract1(dst)
  1531  				bits >>= 4
  1532  				v = bits&bitPointerAll | bitMarkedAll
  1533  				*dst = uint8(v)
  1534  				dst = subtract1(dst)
  1535  				bits >>= 4
  1536  			}
  1537  		}
  1538  
  1539  		// Process one instruction.
  1540  		inst := uintptr(*p)
  1541  		p = add1(p)
  1542  		n := inst & 0x7F
  1543  		if inst&0x80 == 0 {
  1544  			// Literal bits; n == 0 means end of program.
  1545  			if n == 0 {
  1546  				// Program is over; continue in trailer if present.
  1547  				if trailer != nil {
  1548  					//println("trailer")
  1549  					p = trailer
  1550  					trailer = nil
  1551  					continue
  1552  				}
  1553  				//println("done")
  1554  				break Run
  1555  			}
  1556  			//println("lit", n, dst)
  1557  			nbyte := n / 8
  1558  			for i := uintptr(0); i < nbyte; i++ {
  1559  				bits |= uintptr(*p) << nbits
  1560  				p = add1(p)
  1561  				if size == 1 {
  1562  					*dst = uint8(bits)
  1563  					dst = add1(dst)
  1564  					bits >>= 8
  1565  				} else {
  1566  					v := bits&0xf | bitMarkedAll
  1567  					*dst = uint8(v)
  1568  					dst = subtract1(dst)
  1569  					bits >>= 4
  1570  					v = bits&0xf | bitMarkedAll
  1571  					*dst = uint8(v)
  1572  					dst = subtract1(dst)
  1573  					bits >>= 4
  1574  				}
  1575  			}
  1576  			if n %= 8; n > 0 {
  1577  				bits |= uintptr(*p) << nbits
  1578  				p = add1(p)
  1579  				nbits += n
  1580  			}
  1581  			continue Run
  1582  		}
  1583  
  1584  		// Repeat. If n == 0, it is encoded in a varint in the next bytes.
  1585  		if n == 0 {
  1586  			for off := uint(0); ; off += 7 {
  1587  				x := uintptr(*p)
  1588  				p = add1(p)
  1589  				n |= (x & 0x7F) << off
  1590  				if x&0x80 == 0 {
  1591  					break
  1592  				}
  1593  			}
  1594  		}
  1595  
  1596  		// Count is encoded in a varint in the next bytes.
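        		// For example (illustrative), the varint bytes 0x83 0x01
        		// decode to (0x83&0x7F) | 0x01<<7 = 3 + 128 = 131.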
  1597  		c := uintptr(0)
  1598  		for off := uint(0); ; off += 7 {
  1599  			x := uintptr(*p)
  1600  			p = add1(p)
  1601  			c |= (x & 0x7F) << off
  1602  			if x&0x80 == 0 {
  1603  				break
  1604  			}
  1605  		}
  1606  		c *= n // now total number of bits to copy
  1607  
  1608  		// If the number of bits being repeated is small, load them
  1609  		// into a register and use that register for the entire loop
  1610  		// instead of repeatedly reading from memory.
  1611  		// Handling fewer than 8 bits here makes the general loop simpler.
  1612  		// The cutoff is sys.PtrSize*8 - 7 to guarantee that when we add
  1613  		// the pattern to a bit buffer holding at most 7 bits (a partial byte)
  1614  		// it will not overflow.
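        		// (For example, on a 64-bit system maxBits is 8*8-7 = 57.)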
  1615  		src := dst
  1616  		const maxBits = sys.PtrSize*8 - 7
  1617  		if n <= maxBits {
  1618  			// Start with bits in output buffer.
  1619  			pattern := bits
  1620  			npattern := nbits
  1621  
  1622  			// If we need more bits, fetch them from memory.
  1623  			if size == 1 {
  1624  				src = subtract1(src)
  1625  				for npattern < n {
  1626  					pattern <<= 8
  1627  					pattern |= uintptr(*src)
  1628  					src = subtract1(src)
  1629  					npattern += 8
  1630  				}
  1631  			} else {
  1632  				src = add1(src)
  1633  				for npattern < n {
  1634  					pattern <<= 4
  1635  					pattern |= uintptr(*src) & 0xf
  1636  					src = add1(src)
  1637  					npattern += 4
  1638  				}
  1639  			}
  1640  
  1641  			// We started with the whole bit output buffer,
  1642  			// and then we loaded bits from whole bytes.
  1643  			// Either way, we might now have too many instead of too few.
  1644  			// Discard the extra.
  1645  			if npattern > n {
  1646  				pattern >>= npattern - n
  1647  				npattern = n
  1648  			}
  1649  
  1650  			// Replicate pattern to at most maxBits.
  1651  			if npattern == 1 {
  1652  				// One bit being repeated.
  1653  				// If the bit is 1, make the pattern all 1s.
  1654  				// If the bit is 0, the pattern is already all 0s,
  1655  				// but we can claim that the number of bits
  1656  				// in the word is equal to the number we need (c),
  1657  				// because right shift of bits will zero fill.
  1658  				if pattern == 1 {
  1659  					pattern = 1<<maxBits - 1
  1660  					npattern = maxBits
  1661  				} else {
  1662  					npattern = c
  1663  				}
  1664  			} else {
  1665  				b := pattern
  1666  				nb := npattern
  1667  				if nb+nb <= maxBits {
  1668  					// Double pattern until the whole uintptr is filled.
  1669  					for nb <= sys.PtrSize*8 {
  1670  						b |= b << nb
  1671  						nb += nb
  1672  					}
  1673  					// Trim away incomplete copy of original pattern in high bits.
  1674  					// TODO(rsc): Replace with table lookup or loop on systems without divide?
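        					// For example (illustrative, 64-bit): npattern == 3
        					// replicates to nb = 57/3*3 = 57 bits, i.e. 19 whole
        					// copies of the 3-bit pattern.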
  1675  					nb = maxBits / npattern * npattern
  1676  					b &= 1<<nb - 1
  1677  					pattern = b
  1678  					npattern = nb
  1679  				}
  1680  			}
  1681  
  1682  			// Add pattern to bit buffer and flush bit buffer, c/npattern times.
  1683  			// Since pattern contains >8 bits, there will be full bytes to flush
  1684  			// on each iteration.
  1685  			for ; c >= npattern; c -= npattern {
  1686  				bits |= pattern << nbits
  1687  				nbits += npattern
  1688  				if size == 1 {
  1689  					for nbits >= 8 {
  1690  						*dst = uint8(bits)
  1691  						dst = add1(dst)
  1692  						bits >>= 8
  1693  						nbits -= 8
  1694  					}
  1695  				} else {
  1696  					for nbits >= 4 {
  1697  						*dst = uint8(bits&0xf | bitMarkedAll)
  1698  						dst = subtract1(dst)
  1699  						bits >>= 4
  1700  						nbits -= 4
  1701  					}
  1702  				}
  1703  			}
  1704  
  1705  			// Add final fragment to bit buffer.
  1706  			if c > 0 {
  1707  				pattern &= 1<<c - 1
  1708  				bits |= pattern << nbits
  1709  				nbits += c
  1710  			}
  1711  			continue Run
  1712  		}
  1713  
  1714  		// Repeat; n too large to fit in a register.
  1715  		// Since nbits <= 7, we know the first few bytes of repeated data
  1716  		// are already written to memory.
  1717  		off := n - nbits // n > nbits because n > maxBits and nbits <= 7
  1718  		if size == 1 {
  1719  			// Leading src fragment.
  1720  			src = subtractb(src, (off+7)/8)
  1721  			if frag := off & 7; frag != 0 {
  1722  				bits |= uintptr(*src) >> (8 - frag) << nbits
  1723  				src = add1(src)
  1724  				nbits += frag
  1725  				c -= frag
  1726  			}
  1727  			// Main loop: load one byte, write another.
  1728  			// The bits are rotating through the bit buffer.
  1729  			for i := c / 8; i > 0; i-- {
  1730  				bits |= uintptr(*src) << nbits
  1731  				src = add1(src)
  1732  				*dst = uint8(bits)
  1733  				dst = add1(dst)
  1734  				bits >>= 8
  1735  			}
  1736  			// Final src fragment.
  1737  			if c %= 8; c > 0 {
  1738  				bits |= (uintptr(*src) & (1<<c - 1)) << nbits
  1739  				nbits += c
  1740  			}
  1741  		} else {
  1742  			// Leading src fragment.
  1743  			src = addb(src, (off+3)/4)
  1744  			if frag := off & 3; frag != 0 {
  1745  				bits |= (uintptr(*src) & 0xf) >> (4 - frag) << nbits
  1746  				src = subtract1(src)
  1747  				nbits += frag
  1748  				c -= frag
  1749  			}
  1750  			// Main loop: load one byte, write another.
  1751  			// The bits are rotating through the bit buffer.
  1752  			for i := c / 4; i > 0; i-- {
  1753  				bits |= (uintptr(*src) & 0xf) << nbits
  1754  				src = subtract1(src)
  1755  				*dst = uint8(bits&0xf | bitMarkedAll)
  1756  				dst = subtract1(dst)
  1757  				bits >>= 4
  1758  			}
  1759  			// Final src fragment.
  1760  			if c %= 4; c > 0 {
  1761  				bits |= (uintptr(*src) & (1<<c - 1)) << nbits
  1762  				nbits += c
  1763  			}
  1764  		}
  1765  	}
  1766  
  1767  	// Write any final bits out, using full-byte writes, even for the final byte.
  1768  	var totalBits uintptr
  1769  	if size == 1 {
  1770  		totalBits = (uintptr(unsafe.Pointer(dst))-uintptr(unsafe.Pointer(dstStart)))*8 + nbits
  1771  		nbits += -nbits & 7
  1772  		for ; nbits > 0; nbits -= 8 {
  1773  			*dst = uint8(bits)
  1774  			dst = add1(dst)
  1775  			bits >>= 8
  1776  		}
  1777  	} else {
  1778  		totalBits = (uintptr(unsafe.Pointer(dstStart))-uintptr(unsafe.Pointer(dst)))*4 + nbits
  1779  		nbits += -nbits & 3
  1780  		for ; nbits > 0; nbits -= 4 {
  1781  			v := bits&0xf | bitMarkedAll
  1782  			*dst = uint8(v)
  1783  			dst = subtract1(dst)
  1784  			bits >>= 4
  1785  		}
  1786  	}
  1787  	return totalBits
  1788  }
  1789  
  1790  func dumpGCProg(p *byte) {
  1791  	nptr := 0
  1792  	for {
  1793  		x := *p
  1794  		p = add1(p)
  1795  		if x == 0 {
  1796  			print("\t", nptr, " end\n")
  1797  			break
  1798  		}
  1799  		if x&0x80 == 0 {
  1800  			print("\t", nptr, " lit ", x, ":")
  1801  			n := int(x+7) / 8
  1802  			for i := 0; i < n; i++ {
  1803  				print(" ", hex(*p))
  1804  				p = add1(p)
  1805  			}
  1806  			print("\n")
  1807  			nptr += int(x)
  1808  		} else {
  1809  			nbit := int(x &^ 0x80)
  1810  			if nbit == 0 {
  1811  				for nb := uint(0); ; nb += 7 {
  1812  					x := *p
  1813  					p = add1(p)
  1814  					nbit |= int(x&0x7f) << nb
  1815  					if x&0x80 == 0 {
  1816  						break
  1817  					}
  1818  				}
  1819  			}
  1820  			count := 0
  1821  			for nb := uint(0); ; nb += 7 {
  1822  				x := *p
  1823  				p = add1(p)
  1824  				count |= int(x&0x7f) << nb
  1825  				if x&0x80 == 0 {
  1826  					break
  1827  				}
  1828  			}
  1829  			print("\t", nptr, " repeat ", nbit, " × ", count, "\n")
  1830  			nptr += nbit * count
  1831  		}
  1832  	}
  1833  }
  1834  
  1835  // Testing.
  1836  
  1837  func getgcmaskcb(frame *stkframe, ctxt unsafe.Pointer) bool {
  1838  	target := (*stkframe)(ctxt)
  1839  	if frame.sp <= target.sp && target.sp < frame.varp {
  1840  		*target = *frame
  1841  		return false
  1842  	}
  1843  	return true
  1844  }
  1845  
  1846  // gcbits returns the GC type info for x, for testing.
  1847  // The result is the bitmap entries (0 or 1), one entry per byte.
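        // For example (illustrative, assuming a 64-bit system), for an x of type
        //	*struct { p *int; n uintptr; q *int }
        // the result would be []byte{1, 0, 1}.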
  1848  //go:linkname reflect_gcbits reflect.gcbits
  1849  func reflect_gcbits(x interface{}) []byte {
  1850  	ret := getgcmask(x)
  1851  	typ := (*ptrtype)(unsafe.Pointer(efaceOf(&x)._type)).elem
  1852  	nptr := typ.ptrdata / sys.PtrSize
  1853  	for uintptr(len(ret)) > nptr && ret[len(ret)-1] == 0 {
  1854  		ret = ret[:len(ret)-1]
  1855  	}
  1856  	return ret
  1857  }
  1858  
  1859  // getgcmask returns the GC type info for the object ep, for testing.
  1860  func getgcmask(ep interface{}) (mask []byte) {
  1861  	e := *efaceOf(&ep)
  1862  	p := e.data
  1863  	t := e._type
  1864  	// data or bss
  1865  	for datap := &firstmoduledata; datap != nil; datap = datap.next {
  1866  		// data
  1867  		if datap.data <= uintptr(p) && uintptr(p) < datap.edata {
  1868  			bitmap := datap.gcdatamask.bytedata
  1869  			n := (*ptrtype)(unsafe.Pointer(t)).elem.size
  1870  			mask = make([]byte, n/sys.PtrSize)
  1871  			for i := uintptr(0); i < n; i += sys.PtrSize {
  1872  				off := (uintptr(p) + i - datap.data) / sys.PtrSize
  1873  				mask[i/sys.PtrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
  1874  			}
  1875  			return
  1876  		}
  1877  
  1878  		// bss
  1879  		if datap.bss <= uintptr(p) && uintptr(p) < datap.ebss {
  1880  			bitmap := datap.gcbssmask.bytedata
  1881  			n := (*ptrtype)(unsafe.Pointer(t)).elem.size
  1882  			mask = make([]byte, n/sys.PtrSize)
  1883  			for i := uintptr(0); i < n; i += sys.PtrSize {
  1884  				off := (uintptr(p) + i - datap.bss) / sys.PtrSize
  1885  				mask[i/sys.PtrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
  1886  			}
  1887  			return
  1888  		}
  1889  	}
  1890  
  1891  	// heap
  1892  	var n uintptr
  1893  	var base uintptr
  1894  	if mlookup(uintptr(p), &base, &n, nil) != 0 {
  1895  		mask = make([]byte, n/sys.PtrSize)
  1896  		for i := uintptr(0); i < n; i += sys.PtrSize {
  1897  			hbits := heapBitsForAddr(base + i)
  1898  			if hbits.isPointer() {
  1899  				mask[i/sys.PtrSize] = 1
  1900  			}
  1901  			if i != 1*sys.PtrSize && !hbits.morePointers() {
  1902  				mask = mask[:i/sys.PtrSize]
  1903  				break
  1904  			}
  1905  		}
  1906  		return
  1907  	}
  1908  
  1909  	// stack
  1910  	if _g_ := getg(); _g_.m.curg.stack.lo <= uintptr(p) && uintptr(p) < _g_.m.curg.stack.hi {
  1911  		var frame stkframe
  1912  		frame.sp = uintptr(p)
  1914  		gentraceback(_g_.m.curg.sched.pc, _g_.m.curg.sched.sp, 0, _g_.m.curg, 0, nil, 1000, getgcmaskcb, noescape(unsafe.Pointer(&frame)), 0)
  1915  		if frame.fn != nil {
  1916  			f := frame.fn
  1917  			targetpc := frame.continpc
  1918  			if targetpc == 0 {
  1919  				return
  1920  			}
  1921  			// SPARC64's PC holds the address of the *current* instruction.
  1922  			if targetpc != f.entry && sys.GoarchSparc64 == 0 {
  1923  				targetpc--
  1924  			}
  1925  			pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc, nil)
  1926  			if pcdata == -1 {
  1927  				return
  1928  			}
  1929  			stkmap := (*stackmap)(funcdata(f, _FUNCDATA_LocalsPointerMaps))
  1930  			if stkmap == nil || stkmap.n <= 0 {
  1931  				return
  1932  			}
  1933  			bv := stackmapdata(stkmap, pcdata)
  1934  			size := uintptr(bv.n) * sys.PtrSize
  1935  			n := (*ptrtype)(unsafe.Pointer(t)).elem.size
  1936  			mask = make([]byte, n/sys.PtrSize)
  1937  			for i := uintptr(0); i < n; i += sys.PtrSize {
  1938  				bitmap := bv.bytedata
  1939  				off := (uintptr(p) + i - frame.varp + size) / sys.PtrSize
  1940  				mask[i/sys.PtrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
  1941  			}
  1942  		}
  1943  		return
  1944  	}
  1945  
  1946  	// Otherwise p is not something the GC knows about,
  1947  	// possibly read-only data, like malloc(0).
  1948  	// It must not contain pointers.
  1949  	return
  1950  }