github.com/fenixara/go@v0.0.0-20170127160404-96ea0918e670/src/runtime/mbitmap.go

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Garbage collector: type and heap bitmaps.
     6  //
     7  // Stack, data, and bss bitmaps
     8  //
     9  // Stack frames and global variables in the data and bss sections are described
    10  // by 1-bit bitmaps in which 0 means uninteresting and 1 means live pointer
    11  // to be visited during GC. The bits in each byte are consumed starting with
    12  // the low bit: 1<<0, 1<<1, and so on.
    13  //
    14  // Heap bitmap
    15  //
    16  // The allocated heap comes from a subset of the memory in the range [start, used),
    17  // where start == mheap_.arena_start and used == mheap_.arena_used.
    18  // The heap bitmap comprises 2 bits for each pointer-sized word in that range,
    19  // stored in bytes indexed backward in memory from start.
    20  // That is, the byte at address start-1 holds the 2-bit entries for the four words
    21  // start through start+3*ptrSize, the byte at start-2 holds the entries for
    22  // start+4*ptrSize through start+7*ptrSize, and so on.
    23  //
    24  // In each 2-bit entry, the lower bit holds the same information as in the 1-bit
    25  // bitmaps: 0 means uninteresting and 1 means live pointer to be visited during GC.
    26  // The meaning of the high bit depends on the position of the word being described
    27  // in its allocated object. In all words *except* the second word, the
    28  // high bit indicates that the object is still being described. In
    29  // these words, if a bit pair with a high bit 0 is encountered, the
    30  // low bit can also be assumed to be 0, and the object description is
    31  // over. This 00 is called the ``dead'' encoding: it signals that the
    32  // rest of the words in the object are uninteresting to the garbage
    33  // collector.
    34  //
    35  // In the second word, the high bit is the GC ``checkmarked'' bit (see below).
    36  //
    37  // The 2-bit entries are split when written into the byte, so that the top half
    38  // of the byte contains 4 high bits and the bottom half contains 4 low (pointer)
    39  // bits.
    40  // This form allows a copy from the 1-bit to the 4-bit form to keep the
    41  // pointer bits contiguous, instead of having to space them out.
    42  //
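         // A worked example of this packing (editor's sketch, using the
         // constants defined in this file): suppose word 0 of an object is a
         // live pointer with more of the object still to be described, word 1
         // is a scalar (its high bit is the checkmark, normally 1), and words
         // 2 and 3 are dead. The bitmap byte for these four words is then
         //
         //	bitPointer | bitScan | bitScan<<heapBitsShift == 0x31
         //
         // with the 4 pointer bits in the low nibble (0001) and the 4
         // scan/checkmark bits in the high nibble (0011).
         //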
    43  // The code makes use of the fact that the zero value for a heap bitmap
     44  // has no live pointer bit set and is (depending on position) not used,
    45  // not checkmarked, and is the dead encoding.
    46  // These properties must be preserved when modifying the encoding.
    47  //
    48  // Checkmarks
    49  //
    50  // In a concurrent garbage collector, one worries about failing to mark
    51  // a live object due to mutations without write barriers or bugs in the
    52  // collector implementation. As a sanity check, the GC has a 'checkmark'
    53  // mode that retraverses the object graph with the world stopped, to make
    54  // sure that everything that should be marked is marked.
    55  // In checkmark mode, in the heap bitmap, the high bit of the 2-bit entry
    56  // for the second word of the object holds the checkmark bit.
    57  // When not in checkmark mode, this bit is set to 1.
    58  //
    59  // The smallest possible allocation is 8 bytes. On a 32-bit machine, that
    60  // means every allocated object has two words, so there is room for the
    61  // checkmark bit. On a 64-bit machine, however, the 8-byte allocation is
    62  // just one word, so the second bit pair is not available for encoding the
    63  // checkmark. However, because non-pointer allocations are combined
    64  // into larger 16-byte (maxTinySize) allocations, a plain 8-byte allocation
    65  // must be a pointer, so the type bit in the first word is not actually needed.
     66  // It is still used in general, except that in checkmark mode the type
     67  // bit is repurposed as the checkmark bit and then reinitialized (to 1)
     68  // as the type bit when finished.
    69  //
    70  
    71  package runtime
    72  
    73  import (
    74  	"runtime/internal/atomic"
    75  	"runtime/internal/sys"
    76  	"unsafe"
    77  )
    78  
    79  const (
    80  	bitPointer = 1 << 0
    81  	bitScan    = 1 << 4
    82  
    83  	heapBitsShift   = 1                     // shift offset between successive bitPointer or bitScan entries
    84  	heapBitmapScale = sys.PtrSize * (8 / 2) // number of data bytes described by one heap bitmap byte
    85  
    86  	// all scan/pointer bits in a byte
    87  	bitScanAll    = bitScan | bitScan<<heapBitsShift | bitScan<<(2*heapBitsShift) | bitScan<<(3*heapBitsShift)
    88  	bitPointerAll = bitPointer | bitPointer<<heapBitsShift | bitPointer<<(2*heapBitsShift) | bitPointer<<(3*heapBitsShift)
    89  )
    90  
    91  // addb returns the byte pointer p+n.
    92  //go:nowritebarrier
    93  //go:nosplit
    94  func addb(p *byte, n uintptr) *byte {
    95  	// Note: wrote out full expression instead of calling add(p, n)
    96  	// to reduce the number of temporaries generated by the
    97  	// compiler for this trivial expression during inlining.
    98  	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + n))
    99  }
   100  
   101  // subtractb returns the byte pointer p-n.
   102  // subtractb is typically used when traversing the pointer tables referred to by hbits
   103  // which are arranged in reverse order.
   104  //go:nowritebarrier
   105  //go:nosplit
   106  func subtractb(p *byte, n uintptr) *byte {
   107  	// Note: wrote out full expression instead of calling add(p, -n)
   108  	// to reduce the number of temporaries generated by the
   109  	// compiler for this trivial expression during inlining.
   110  	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - n))
   111  }
   112  
   113  // add1 returns the byte pointer p+1.
   114  //go:nowritebarrier
   115  //go:nosplit
   116  func add1(p *byte) *byte {
   117  	// Note: wrote out full expression instead of calling addb(p, 1)
   118  	// to reduce the number of temporaries generated by the
   119  	// compiler for this trivial expression during inlining.
   120  	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + 1))
   121  }
   122  
   123  // subtract1 returns the byte pointer p-1.
   124  // subtract1 is typically used when traversing the pointer tables referred to by hbits
   125  // which are arranged in reverse order.
   126  //go:nowritebarrier
   127  //
   128  // nosplit because it is used during write barriers and must not be preempted.
   129  //go:nosplit
   130  func subtract1(p *byte) *byte {
   131  	// Note: wrote out full expression instead of calling subtractb(p, 1)
   132  	// to reduce the number of temporaries generated by the
   133  	// compiler for this trivial expression during inlining.
   134  	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - 1))
   135  }
   136  
    137  // mapBits is called each time arena_used is extended.
   138  // It maps any additional bitmap memory needed for the new arena memory.
   139  // It must be called with the expected new value of arena_used,
   140  // *before* h.arena_used has been updated.
   141  // Waiting to update arena_used until after the memory has been mapped
    142  // avoids faults when other threads try to access the bitmap immediately
   143  // after observing the change to arena_used.
   144  //
   145  //go:nowritebarrier
   146  func (h *mheap) mapBits(arena_used uintptr) {
   147  	// Caller has added extra mappings to the arena.
   148  	// Add extra mappings of bitmap words as needed.
   149  	// We allocate extra bitmap pieces in chunks of bitmapChunk.
   150  	const bitmapChunk = 8192
   151  
   152  	n := (arena_used - mheap_.arena_start) / heapBitmapScale
   153  	n = round(n, bitmapChunk)
   154  	n = round(n, physPageSize)
   155  	if h.bitmap_mapped >= n {
   156  		return
   157  	}
   158  
   159  	sysMap(unsafe.Pointer(h.bitmap-n), n-h.bitmap_mapped, h.arena_reserved, &memstats.gc_sys)
   160  	h.bitmap_mapped = n
   161  }
   162  
   163  // heapBits provides access to the bitmap bits for a single heap word.
   164  // The methods on heapBits take value receivers so that the compiler
   165  // can more easily inline calls to those methods and registerize the
   166  // struct fields independently.
   167  type heapBits struct {
   168  	bitp  *uint8
   169  	shift uint32
   170  }
   171  
   172  // markBits provides access to the mark bit for an object in the heap.
   173  // bytep points to the byte holding the mark bit.
   174  // mask is a byte with a single bit set that can be &ed with *bytep
   175  // to see if the bit has been set.
    176  // *m.bytep&m.mask != 0 indicates the mark bit is set.
   177  // index can be used along with span information to generate
   178  // the address of the object in the heap.
   179  // We maintain one set of mark bits for allocation and one for
   180  // marking purposes.
   181  type markBits struct {
   182  	bytep *uint8
   183  	mask  uint8
   184  	index uintptr
   185  }
   186  
   187  //go:nosplit
   188  func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits {
   189  	whichByte := allocBitIndex / 8
   190  	whichBit := allocBitIndex % 8
   191  	bytePtr := addb(s.allocBits, whichByte)
   192  	return markBits{bytePtr, uint8(1 << whichBit), allocBitIndex}
   193  }
   194  
    195  // refillAllocCache takes 8 bytes of s.allocBits starting at whichByte
    196  // and negates them so that ctz (count trailing zeros) instructions
    197  // can be used. It then places these 8 bytes into the cached 64-bit
    198  // s.allocCache.
   199  func (s *mspan) refillAllocCache(whichByte uintptr) {
   200  	bytes := (*[8]uint8)(unsafe.Pointer(addb(s.allocBits, whichByte)))
   201  	aCache := uint64(0)
   202  	aCache |= uint64(bytes[0])
   203  	aCache |= uint64(bytes[1]) << (1 * 8)
   204  	aCache |= uint64(bytes[2]) << (2 * 8)
   205  	aCache |= uint64(bytes[3]) << (3 * 8)
   206  	aCache |= uint64(bytes[4]) << (4 * 8)
   207  	aCache |= uint64(bytes[5]) << (5 * 8)
   208  	aCache |= uint64(bytes[6]) << (6 * 8)
   209  	aCache |= uint64(bytes[7]) << (7 * 8)
   210  	s.allocCache = ^aCache
   211  }
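
         // A worked example of the negation above (editor's sketch): if the
         // 8 bytes at s.allocBits are 0x07, 0, 0, ..., objects 0 through 2
         // are allocated and everything after them is free. Negation gives
         //
         //	s.allocCache == ^uint64(0x07) == 0xfffffffffffffff8
         //
         // so sys.Ctz64(s.allocCache) == 3, the index of the first free
         // object, which is how nextFreeIndex below consumes the cache.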
   212  
   213  // nextFreeIndex returns the index of the next free object in s at
   214  // or after s.freeindex.
   215  // There are hardware instructions that can be used to make this
   216  // faster if profiling warrants it.
   217  func (s *mspan) nextFreeIndex() uintptr {
   218  	sfreeindex := s.freeindex
   219  	snelems := s.nelems
   220  	if sfreeindex == snelems {
   221  		return sfreeindex
   222  	}
   223  	if sfreeindex > snelems {
   224  		throw("s.freeindex > s.nelems")
   225  	}
   226  
   227  	aCache := s.allocCache
   228  
   229  	bitIndex := sys.Ctz64(aCache)
   230  	for bitIndex == 64 {
   231  		// Move index to start of next cached bits.
   232  		sfreeindex = (sfreeindex + 64) &^ (64 - 1)
   233  		if sfreeindex >= snelems {
   234  			s.freeindex = snelems
   235  			return snelems
   236  		}
   237  		whichByte := sfreeindex / 8
   238  		// Refill s.allocCache with the next 64 alloc bits.
   239  		s.refillAllocCache(whichByte)
   240  		aCache = s.allocCache
   241  		bitIndex = sys.Ctz64(aCache)
   242  		// nothing available in cached bits
   243  		// grab the next 8 bytes and try again.
   244  	}
   245  	result := sfreeindex + uintptr(bitIndex)
   246  	if result >= snelems {
   247  		s.freeindex = snelems
   248  		return snelems
   249  	}
   250  
   251  	s.allocCache >>= (bitIndex + 1)
   252  	sfreeindex = result + 1
   253  
   254  	if sfreeindex%64 == 0 && sfreeindex != snelems {
   255  		// We just incremented s.freeindex so it isn't 0.
   256  		// As each 1 in s.allocCache was encountered and used for allocation
   257  		// it was shifted away. At this point s.allocCache contains all 0s.
   258  		// Refill s.allocCache so that it corresponds
   259  		// to the bits at s.allocBits starting at s.freeindex.
   260  		whichByte := sfreeindex / 8
   261  		s.refillAllocCache(whichByte)
   262  	}
   263  	s.freeindex = sfreeindex
   264  	return result
   265  }
   266  
   267  // isFree returns whether the index'th object in s is unallocated.
   268  func (s *mspan) isFree(index uintptr) bool {
   269  	if index < s.freeindex {
   270  		return false
   271  	}
   272  	whichByte := index / 8
   273  	whichBit := index % 8
   274  	byteVal := *addb(s.allocBits, whichByte)
   275  	return byteVal&uint8(1<<whichBit) == 0
   276  }
   277  
   278  func (s *mspan) objIndex(p uintptr) uintptr {
   279  	byteOffset := p - s.base()
   280  	if byteOffset == 0 {
   281  		return 0
   282  	}
   283  	if s.baseMask != 0 {
    284  		// s.baseMask is non-0, elemsize is a power of two, so shift by s.divShift
   285  		return byteOffset >> s.divShift
   286  	}
   287  	return uintptr(((uint64(byteOffset) >> s.divShift) * uint64(s.divMul)) >> s.divShift2)
   288  }
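
         // A worked example of the divide-by-multiply above, with hypothetical
         // but valid magic values (not necessarily the ones in the runtime's
         // tables): for elemsize == 48, take divShift == 4, divMul == 43691,
         // divShift2 == 17, since 48 == 16*3 and 43691/2^17 is just over 1/3.
         // Then for byteOffset == 480 (object 10):
         //
         //	(480>>4)*43691 >> 17 == 30*43691 >> 17 == 1310730 >> 17 == 10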
   289  
   290  func markBitsForAddr(p uintptr) markBits {
   291  	s := spanOf(p)
   292  	objIndex := s.objIndex(p)
   293  	return s.markBitsForIndex(objIndex)
   294  }
   295  
   296  func (s *mspan) markBitsForIndex(objIndex uintptr) markBits {
   297  	whichByte := objIndex / 8
   298  	bitMask := uint8(1 << (objIndex % 8)) // low 3 bits hold the bit index
   299  	bytePtr := addb(s.gcmarkBits, whichByte)
   300  	return markBits{bytePtr, bitMask, objIndex}
   301  }
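
         // For example (editor's sketch): objIndex == 13 selects byte
         // 13/8 == 1 of s.gcmarkBits and mask 1<<(13%8) == 1<<5 == 0x20
         // within that byte.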
   302  
   303  func (s *mspan) markBitsForBase() markBits {
   304  	return markBits{s.gcmarkBits, uint8(1), 0}
   305  }
   306  
   307  // isMarked reports whether mark bit m is set.
   308  func (m markBits) isMarked() bool {
   309  	return *m.bytep&m.mask != 0
   310  }
   311  
   312  // setMarked sets the marked bit in the markbits, atomically. Some compilers
    313  // are not able to inline the atomic.Or8 function, so if it appears as a hot spot consider
   314  // inlining it manually.
   315  func (m markBits) setMarked() {
   316  	// Might be racing with other updates, so use atomic update always.
   317  	// We used to be clever here and use a non-atomic update in certain
   318  	// cases, but it's not worth the risk.
   319  	atomic.Or8(m.bytep, m.mask)
   320  }
   321  
   322  // setMarkedNonAtomic sets the marked bit in the markbits, non-atomically.
   323  func (m markBits) setMarkedNonAtomic() {
   324  	*m.bytep |= m.mask
   325  }
   326  
   327  // clearMarked clears the marked bit in the markbits, atomically.
   328  func (m markBits) clearMarked() {
   329  	// Might be racing with other updates, so use atomic update always.
   330  	// We used to be clever here and use a non-atomic update in certain
   331  	// cases, but it's not worth the risk.
   332  	atomic.And8(m.bytep, ^m.mask)
   333  }
   334  
   335  // clearMarkedNonAtomic clears the marked bit non-atomically.
   336  func (m markBits) clearMarkedNonAtomic() {
    337  	*m.bytep ^= m.mask // assumes the bit is currently set: xor then clears it
   338  }
   339  
   340  // markBitsForSpan returns the markBits for the span base address base.
   341  func markBitsForSpan(base uintptr) (mbits markBits) {
   342  	if base < mheap_.arena_start || base >= mheap_.arena_used {
   343  		throw("markBitsForSpan: base out of range")
   344  	}
   345  	mbits = markBitsForAddr(base)
   346  	if mbits.mask != 1 {
   347  		throw("markBitsForSpan: unaligned start")
   348  	}
   349  	return mbits
   350  }
   351  
   352  // advance advances the markBits to the next object in the span.
   353  func (m *markBits) advance() {
   354  	if m.mask == 1<<7 {
   355  		m.bytep = (*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(m.bytep)) + 1))
   356  		m.mask = 1
   357  	} else {
   358  		m.mask = m.mask << 1
   359  	}
   360  	m.index++
   361  }
   362  
   363  // heapBitsForAddr returns the heapBits for the address addr.
   364  // The caller must have already checked that addr is in the range [mheap_.arena_start, mheap_.arena_used).
   365  //
   366  // nosplit because it is used during write barriers and must not be preempted.
   367  //go:nosplit
   368  func heapBitsForAddr(addr uintptr) heapBits {
    369  	// 2 bits per word, 4 pairs per byte, and the mask is hard coded.
   370  	off := (addr - mheap_.arena_start) / sys.PtrSize
   371  	return heapBits{(*uint8)(unsafe.Pointer(mheap_.bitmap - off/4 - 1)), uint32(off & 3)}
   372  }
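
         // A worked example of the lookup above (editor's sketch, 64-bit):
         // for addr == mheap_.arena_start + 100*sys.PtrSize, off == 100, so
         // the entry lives 100/4 + 1 == 26 bytes below the bitmap base, i.e.
         // at mheap_.bitmap - 26, with shift == 100&3 == 0: the word's
         // pointer bit is bit 0 of that byte and its scan bit is bit 4.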
   373  
   374  // heapBitsForSpan returns the heapBits for the span base address base.
   375  func heapBitsForSpan(base uintptr) (hbits heapBits) {
   376  	if base < mheap_.arena_start || base >= mheap_.arena_used {
   377  		throw("heapBitsForSpan: base out of range")
   378  	}
   379  	return heapBitsForAddr(base)
   380  }
   381  
   382  // heapBitsForObject returns the base address for the heap object
   383  // containing the address p, the heapBits for base,
    384  // the object's span, and the index of the object in s.
    385  // If p does not point into a heap object,
    386  // it returns base == 0;
    387  // otherwise it returns the base of the object.
   388  //
   389  // refBase and refOff optionally give the base address of the object
   390  // in which the pointer p was found and the byte offset at which it
   391  // was found. These are used for error reporting.
   392  func heapBitsForObject(p, refBase, refOff uintptr) (base uintptr, hbits heapBits, s *mspan, objIndex uintptr) {
   393  	arenaStart := mheap_.arena_start
   394  	if p < arenaStart || p >= mheap_.arena_used {
   395  		return
   396  	}
   397  	off := p - arenaStart
   398  	idx := off >> _PageShift
   399  	// p points into the heap, but possibly to the middle of an object.
   400  	// Consult the span table to find the block beginning.
   401  	s = mheap_.spans[idx]
   402  	if s == nil || p < s.base() || p >= s.limit || s.state != mSpanInUse {
   403  		if s == nil || s.state == _MSpanStack {
   404  			// If s is nil, the virtual address has never been part of the heap.
   405  			// This pointer may be to some mmap'd region, so we allow it.
   406  			// Pointers into stacks are also ok, the runtime manages these explicitly.
   407  			return
   408  		}
   409  
   410  		// The following ensures that we are rigorous about what data
   411  		// structures hold valid pointers.
   412  		if debug.invalidptr != 0 {
   413  			// Typically this indicates an incorrect use
   414  			// of unsafe or cgo to store a bad pointer in
   415  			// the Go heap. It may also indicate a runtime
   416  			// bug.
   417  			//
   418  			// TODO(austin): We could be more aggressive
   419  			// and detect pointers to unallocated objects
   420  			// in allocated spans.
   421  			printlock()
   422  			print("runtime: pointer ", hex(p))
   423  			if s.state != mSpanInUse {
   424  				print(" to unallocated span")
   425  			} else {
   426  				print(" to unused region of span")
   427  			}
   428  			print(" idx=", hex(idx), " span.base()=", hex(s.base()), " span.limit=", hex(s.limit), " span.state=", s.state, "\n")
   429  			if refBase != 0 {
   430  				print("runtime: found in object at *(", hex(refBase), "+", hex(refOff), ")\n")
   431  				gcDumpObject("object", refBase, refOff)
   432  			}
   433  			throw("found bad pointer in Go heap (incorrect use of unsafe or cgo?)")
   434  		}
   435  		return
   436  	}
   437  	// If this span holds object of a power of 2 size, just mask off the bits to
   438  	// the interior of the object. Otherwise use the size to get the base.
   439  	if s.baseMask != 0 {
   440  		// optimize for power of 2 sized objects.
   441  		base = s.base()
   442  		base = base + (p-base)&uintptr(s.baseMask)
   443  		objIndex = (base - s.base()) >> s.divShift
   444  		// base = p & s.baseMask is faster for small spans,
   445  		// but doesn't work for large spans.
   446  		// Overall, it's faster to use the more general computation above.
   447  	} else {
   448  		base = s.base()
   449  		if p-base >= s.elemsize {
   450  			// n := (p - base) / s.elemsize, using division by multiplication
   451  			objIndex = uintptr(p-base) >> s.divShift * uintptr(s.divMul) >> s.divShift2
   452  			base += objIndex * s.elemsize
   453  		}
   454  	}
   455  	// Now that we know the actual base, compute heapBits to return to caller.
   456  	hbits = heapBitsForAddr(base)
   457  	return
   458  }
   459  
   460  // prefetch the bits.
   461  func (h heapBits) prefetch() {
   462  	prefetchnta(uintptr(unsafe.Pointer((h.bitp))))
   463  }
   464  
   465  // next returns the heapBits describing the next pointer-sized word in memory.
   466  // That is, if h describes address p, h.next() describes p+ptrSize.
   467  // Note that next does not modify h. The caller must record the result.
   468  //
   469  // nosplit because it is used during write barriers and must not be preempted.
   470  //go:nosplit
   471  func (h heapBits) next() heapBits {
   472  	if h.shift < 3*heapBitsShift {
   473  		return heapBits{h.bitp, h.shift + heapBitsShift}
   474  	}
   475  	return heapBits{subtract1(h.bitp), 0}
   476  }
   477  
   478  // forward returns the heapBits describing n pointer-sized words ahead of h in memory.
   479  // That is, if h describes address p, h.forward(n) describes p+n*ptrSize.
   480  // h.forward(1) is equivalent to h.next(), just slower.
    481  // Note that forward does not modify h.
    482  // The caller must record the result.
   483  func (h heapBits) forward(n uintptr) heapBits {
   484  	n += uintptr(h.shift) / heapBitsShift
   485  	return heapBits{subtractb(h.bitp, n/4), uint32(n%4) * heapBitsShift}
   486  }
   487  
    488  // bits returns the heap bits for the current word.
    489  // The caller can test morePointers and isPointer by &-ing with bitScan and bitPointer.
    490  // The result includes in its higher bits the bits for subsequent words described by the same bitmap byte.
   491  func (h heapBits) bits() uint32 {
   492  	// The (shift & 31) eliminates a test and conditional branch
   493  	// from the generated code.
   494  	return uint32(*h.bitp) >> (h.shift & 31)
   495  }
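
         // For example (editor's sketch), a scan loop typically tests the
         // result against both masks:
         //
         //	bits := h.bits()
         //	if bits&bitScan == 0 {
         //		break // no more pointers in this object
         //	}
         //	if bits&bitPointer == 0 {
         //		continue // word is not a pointer
         //	}
         //
         // morePointers and isPointer below are convenience wrappers for
         // exactly these two tests.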
   496  
    497  // morePointers returns true unless the ``dead'' encoding has been reached,
    498  // in which case this word and all remaining words in this object are scalars.
   499  // h must not describe the second word of the object.
   500  func (h heapBits) morePointers() bool {
   501  	return h.bits()&bitScan != 0
   502  }
   503  
   504  // isPointer reports whether the heap bits describe a pointer word.
   505  //
   506  // nosplit because it is used during write barriers and must not be preempted.
   507  //go:nosplit
   508  func (h heapBits) isPointer() bool {
   509  	return h.bits()&bitPointer != 0
   510  }
   511  
   512  // hasPointers reports whether the given object has any pointers.
   513  // It must be told how large the object at h is for efficiency.
   514  // h must describe the initial word of the object.
   515  func (h heapBits) hasPointers(size uintptr) bool {
   516  	if size == sys.PtrSize { // 1-word objects are always pointers
   517  		return true
   518  	}
   519  	return (*h.bitp>>h.shift)&bitScan != 0
   520  }
   521  
   522  // isCheckmarked reports whether the heap bits have the checkmarked bit set.
   523  // It must be told how large the object at h is, because the encoding of the
   524  // checkmark bit varies by size.
   525  // h must describe the initial word of the object.
   526  func (h heapBits) isCheckmarked(size uintptr) bool {
   527  	if size == sys.PtrSize {
   528  		return (*h.bitp>>h.shift)&bitPointer != 0
   529  	}
   530  	// All multiword objects are 2-word aligned,
   531  	// so we know that the initial word's 2-bit pair
   532  	// and the second word's 2-bit pair are in the
   533  	// same heap bitmap byte, *h.bitp.
   534  	return (*h.bitp>>(heapBitsShift+h.shift))&bitScan != 0
   535  }
   536  
   537  // setCheckmarked sets the checkmarked bit.
   538  // It must be told how large the object at h is, because the encoding of the
   539  // checkmark bit varies by size.
   540  // h must describe the initial word of the object.
   541  func (h heapBits) setCheckmarked(size uintptr) {
   542  	if size == sys.PtrSize {
   543  		atomic.Or8(h.bitp, bitPointer<<h.shift)
   544  		return
   545  	}
   546  	atomic.Or8(h.bitp, bitScan<<(heapBitsShift+h.shift))
   547  }
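
         // Summary of the two checkmark encodings above (editor's note):
         //
         //	size == sys.PtrSize (64-bit only): the checkmark is the
         //	    pointer/type bit of the sole word, bitPointer << h.shift.
         //	size >= 2*sys.PtrSize: the checkmark is the scan bit of the
         //	    second word, bitScan << (heapBitsShift + h.shift).
         //
         // initCheckmarkSpan below clears these bits before a checkmark pass;
         // clearCheckmarkSpan restores the pointer bits of one-word spans
         // afterwards (the other checkmark bits are simply ignored again).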
   548  
   549  // bulkBarrierPreWrite executes writebarrierptr_prewrite1
   550  // for every pointer slot in the memory range [src, src+size),
   551  // using pointer/scalar information from [dst, dst+size).
   552  // This executes the write barriers necessary before a memmove.
   553  // src, dst, and size must be pointer-aligned.
   554  // The range [dst, dst+size) must lie within a single object.
   555  //
   556  // As a special case, src == 0 indicates that this is being used for a
   557  // memclr. bulkBarrierPreWrite will pass 0 for the src of each write
   558  // barrier.
   559  //
   560  // Callers should call bulkBarrierPreWrite immediately before
   561  // calling memmove(dst, src, size). This function is marked nosplit
   562  // to avoid being preempted; the GC must not stop the goroutine
   563  // between the memmove and the execution of the barriers.
   564  // The caller is also responsible for cgo pointer checks if this
   565  // may be writing Go pointers into non-Go memory.
   566  //
   567  // The pointer bitmap is not maintained for allocations containing
   568  // no pointers at all; any caller of bulkBarrierPreWrite must first
   569  // make sure the underlying allocation contains pointers, usually
   570  // by checking typ.kind&kindNoPointers.
   571  //
   572  //go:nosplit
   573  func bulkBarrierPreWrite(dst, src, size uintptr) {
   574  	if (dst|src|size)&(sys.PtrSize-1) != 0 {
   575  		throw("bulkBarrierPreWrite: unaligned arguments")
   576  	}
   577  	if !writeBarrier.needed {
   578  		return
   579  	}
   580  	if !inheap(dst) {
   581  		// If dst is on the stack and in a higher frame than the
   582  		// caller, we either need to execute write barriers on
   583  		// it (which is what happens for normal stack writes
   584  		// through pointers to higher frames), or we need to
   585  		// force the mark termination stack scan to scan the
   586  		// frame containing dst.
   587  		//
   588  		// Executing write barriers on dst is complicated in the
   589  		// general case because we either need to unwind the
   590  		// stack to get the stack map, or we need the type's
   591  		// bitmap, which may be a GC program.
   592  		//
   593  		// Hence, we opt for forcing the re-scan to scan the
   594  		// frame containing dst, which we can do by simply
   595  		// unwinding the stack barriers between the current SP
   596  		// and dst's frame.
   597  		gp := getg().m.curg
   598  		if gp != nil && gp.stack.lo <= dst && dst < gp.stack.hi {
   599  			// Run on the system stack to give it more
   600  			// stack space.
   601  			systemstack(func() {
   602  				gcUnwindBarriers(gp, dst)
   603  			})
   604  			return
   605  		}
   606  
   607  		// If dst is a global, use the data or BSS bitmaps to
   608  		// execute write barriers.
   609  		for _, datap := range activeModules() {
   610  			if datap.data <= dst && dst < datap.edata {
   611  				bulkBarrierBitmap(dst, src, size, dst-datap.data, datap.gcdatamask.bytedata)
   612  				return
   613  			}
   614  		}
   615  		for _, datap := range activeModules() {
   616  			if datap.bss <= dst && dst < datap.ebss {
   617  				bulkBarrierBitmap(dst, src, size, dst-datap.bss, datap.gcbssmask.bytedata)
   618  				return
   619  			}
   620  		}
   621  		return
   622  	}
   623  
   624  	h := heapBitsForAddr(dst)
   625  	if src == 0 {
   626  		for i := uintptr(0); i < size; i += sys.PtrSize {
   627  			if h.isPointer() {
   628  				dstx := (*uintptr)(unsafe.Pointer(dst + i))
   629  				writebarrierptr_prewrite1(dstx, 0)
   630  			}
   631  			h = h.next()
   632  		}
   633  	} else {
   634  		for i := uintptr(0); i < size; i += sys.PtrSize {
   635  			if h.isPointer() {
   636  				dstx := (*uintptr)(unsafe.Pointer(dst + i))
   637  				srcx := (*uintptr)(unsafe.Pointer(src + i))
   638  				writebarrierptr_prewrite1(dstx, *srcx)
   639  			}
   640  			h = h.next()
   641  		}
   642  	}
   643  }
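
         // A minimal sketch of the calling convention described above, in the
         // spirit of typedmemmove (illustrative, not the exact implementation):
         //
         //	if typ.kind&kindNoPointers == 0 {
         //		bulkBarrierPreWrite(uintptr(dst), uintptr(src), typ.size)
         //	}
         //	memmove(dst, src, typ.size)
         //
         // The barriers run immediately before the memmove with no preemption
         // point in between, which is why bulkBarrierPreWrite is nosplit.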
   644  
   645  // bulkBarrierBitmap executes write barriers for copying from [src,
   646  // src+size) to [dst, dst+size) using a 1-bit pointer bitmap. src is
   647  // assumed to start maskOffset bytes into the data covered by the
   648  // bitmap in bits (which may not be a multiple of 8).
   649  //
   650  // This is used by bulkBarrierPreWrite for writes to data and BSS.
   651  //
   652  //go:nosplit
   653  func bulkBarrierBitmap(dst, src, size, maskOffset uintptr, bits *uint8) {
   654  	word := maskOffset / sys.PtrSize
   655  	bits = addb(bits, word/8)
   656  	mask := uint8(1) << (word % 8)
   657  
   658  	for i := uintptr(0); i < size; i += sys.PtrSize {
   659  		if mask == 0 {
   660  			bits = addb(bits, 1)
   661  			if *bits == 0 {
   662  				// Skip 8 words.
   663  				i += 7 * sys.PtrSize
   664  				continue
   665  			}
   666  			mask = 1
   667  		}
   668  		if *bits&mask != 0 {
   669  			dstx := (*uintptr)(unsafe.Pointer(dst + i))
   670  			if src == 0 {
   671  				writebarrierptr_prewrite1(dstx, 0)
   672  			} else {
   673  				srcx := (*uintptr)(unsafe.Pointer(src + i))
   674  				writebarrierptr_prewrite1(dstx, *srcx)
   675  			}
   676  		}
   677  		mask <<= 1
   678  	}
   679  }
   680  
   681  // typeBitsBulkBarrier executes writebarrierptr_prewrite for every
   682  // pointer that would be copied from [src, src+size) to [dst,
   683  // dst+size) by a memmove using the type bitmap to locate those
   684  // pointer slots.
   685  //
   686  // The type typ must correspond exactly to [src, src+size) and [dst, dst+size).
   687  // dst, src, and size must be pointer-aligned.
   688  // The type typ must have a plain bitmap, not a GC program.
   689  // The only use of this function is in channel sends, and the
   690  // 64 kB channel element limit takes care of this for us.
   691  //
   692  // Must not be preempted because it typically runs right before memmove,
    693  // and the GC must observe the barriers and the memmove as one atomic action.
   694  //
   695  //go:nosplit
   696  func typeBitsBulkBarrier(typ *_type, dst, src, size uintptr) {
   697  	if typ == nil {
   698  		throw("runtime: typeBitsBulkBarrier without type")
   699  	}
   700  	if typ.size != size {
   701  		println("runtime: typeBitsBulkBarrier with type ", typ.string(), " of size ", typ.size, " but memory size", size)
   702  		throw("runtime: invalid typeBitsBulkBarrier")
   703  	}
   704  	if typ.kind&kindGCProg != 0 {
   705  		println("runtime: typeBitsBulkBarrier with type ", typ.string(), " with GC prog")
   706  		throw("runtime: invalid typeBitsBulkBarrier")
   707  	}
   708  	if !writeBarrier.needed {
   709  		return
   710  	}
   711  	ptrmask := typ.gcdata
   712  	var bits uint32
   713  	for i := uintptr(0); i < typ.ptrdata; i += sys.PtrSize {
   714  		if i&(sys.PtrSize*8-1) == 0 {
   715  			bits = uint32(*ptrmask)
   716  			ptrmask = addb(ptrmask, 1)
   717  		} else {
   718  			bits = bits >> 1
   719  		}
   720  		if bits&1 != 0 {
   721  			dstx := (*uintptr)(unsafe.Pointer(dst + i))
   722  			srcx := (*uintptr)(unsafe.Pointer(src + i))
   723  			writebarrierptr_prewrite(dstx, *srcx)
   724  		}
   725  	}
   726  }
   727  
   728  // The methods operating on spans all require that h has been returned
   729  // by heapBitsForSpan and that size, n, total are the span layout description
   730  // returned by the mspan's layout method.
   731  // If total > size*n, it means that there is extra leftover memory in the span,
   732  // usually due to rounding.
   733  //
   734  // TODO(rsc): Perhaps introduce a different heapBitsSpan type.
   735  
   736  // initSpan initializes the heap bitmap for a span.
   737  // It clears all checkmark bits.
   738  // If this is a span of pointer-sized objects, it initializes all
   739  // words to pointer/scan.
   740  // Otherwise, it initializes all words to scalar/dead.
   741  func (h heapBits) initSpan(s *mspan) {
   742  	size, n, total := s.layout()
   743  
   744  	// Init the markbit structures
   745  	s.freeindex = 0
   746  	s.allocCache = ^uint64(0) // all 1s indicating all free.
   747  	s.nelems = n
   748  	s.allocBits = nil
   749  	s.gcmarkBits = nil
   750  	s.gcmarkBits = newMarkBits(s.nelems)
   751  	s.allocBits = newAllocBits(s.nelems)
   752  
   753  	// Clear bits corresponding to objects.
   754  	if total%heapBitmapScale != 0 {
   755  		throw("initSpan: unaligned length")
   756  	}
   757  	nbyte := total / heapBitmapScale
   758  	if sys.PtrSize == 8 && size == sys.PtrSize {
   759  		end := h.bitp
   760  		bitp := subtractb(end, nbyte-1)
   761  		for {
   762  			*bitp = bitPointerAll | bitScanAll
   763  			if bitp == end {
   764  				break
   765  			}
   766  			bitp = add1(bitp)
   767  		}
   768  		return
   769  	}
   770  	memclrNoHeapPointers(unsafe.Pointer(subtractb(h.bitp, nbyte-1)), nbyte)
   771  }
   772  
   773  // initCheckmarkSpan initializes a span for being checkmarked.
   774  // It clears the checkmark bits, which are set to 1 in normal operation.
   775  func (h heapBits) initCheckmarkSpan(size, n, total uintptr) {
    776  	// On 32-bit, sys.PtrSize == 8 is a compile-time constant false, so the compiler eliminates this code entirely.
   777  	if sys.PtrSize == 8 && size == sys.PtrSize {
   778  		// Checkmark bit is type bit, bottom bit of every 2-bit entry.
   779  		// Only possible on 64-bit system, since minimum size is 8.
   780  		// Must clear type bit (checkmark bit) of every word.
   781  		// The type bit is the lower of every two-bit pair.
   782  		bitp := h.bitp
   783  		for i := uintptr(0); i < n; i += 4 {
   784  			*bitp &^= bitPointerAll
   785  			bitp = subtract1(bitp)
   786  		}
   787  		return
   788  	}
   789  	for i := uintptr(0); i < n; i++ {
   790  		*h.bitp &^= bitScan << (heapBitsShift + h.shift)
   791  		h = h.forward(size / sys.PtrSize)
   792  	}
   793  }
   794  
   795  // clearCheckmarkSpan undoes all the checkmarking in a span.
   796  // The actual checkmark bits are ignored, so the only work to do
   797  // is to fix the pointer bits. (Pointer bits are ignored by scanobject
   798  // but consulted by typedmemmove.)
   799  func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) {
    800  	// On 32-bit, sys.PtrSize == 8 is a compile-time constant false, so the compiler eliminates this code entirely.
   801  	if sys.PtrSize == 8 && size == sys.PtrSize {
   802  		// Checkmark bit is type bit, bottom bit of every 2-bit entry.
   803  		// Only possible on 64-bit system, since minimum size is 8.
    804  		// Must restore the type bit (checkmark bit) of every word to 1.
   805  		// The type bit is the lower of every two-bit pair.
   806  		bitp := h.bitp
   807  		for i := uintptr(0); i < n; i += 4 {
   808  			*bitp |= bitPointerAll
   809  			bitp = subtract1(bitp)
   810  		}
   811  	}
   812  }
   813  
   814  // oneBitCount is indexed by byte and produces the
   815  // number of 1 bits in that byte. For example 128 has 1 bit set
    816  // and oneBitCount[128] holds 1.
   817  var oneBitCount = [256]uint8{
   818  	0, 1, 1, 2, 1, 2, 2, 3,
   819  	1, 2, 2, 3, 2, 3, 3, 4,
   820  	1, 2, 2, 3, 2, 3, 3, 4,
   821  	2, 3, 3, 4, 3, 4, 4, 5,
   822  	1, 2, 2, 3, 2, 3, 3, 4,
   823  	2, 3, 3, 4, 3, 4, 4, 5,
   824  	2, 3, 3, 4, 3, 4, 4, 5,
   825  	3, 4, 4, 5, 4, 5, 5, 6,
   826  	1, 2, 2, 3, 2, 3, 3, 4,
   827  	2, 3, 3, 4, 3, 4, 4, 5,
   828  	2, 3, 3, 4, 3, 4, 4, 5,
   829  	3, 4, 4, 5, 4, 5, 5, 6,
   830  	2, 3, 3, 4, 3, 4, 4, 5,
   831  	3, 4, 4, 5, 4, 5, 5, 6,
   832  	3, 4, 4, 5, 4, 5, 5, 6,
   833  	4, 5, 5, 6, 5, 6, 6, 7,
   834  	1, 2, 2, 3, 2, 3, 3, 4,
   835  	2, 3, 3, 4, 3, 4, 4, 5,
   836  	2, 3, 3, 4, 3, 4, 4, 5,
   837  	3, 4, 4, 5, 4, 5, 5, 6,
   838  	2, 3, 3, 4, 3, 4, 4, 5,
   839  	3, 4, 4, 5, 4, 5, 5, 6,
   840  	3, 4, 4, 5, 4, 5, 5, 6,
   841  	4, 5, 5, 6, 5, 6, 6, 7,
   842  	2, 3, 3, 4, 3, 4, 4, 5,
   843  	3, 4, 4, 5, 4, 5, 5, 6,
   844  	3, 4, 4, 5, 4, 5, 5, 6,
   845  	4, 5, 5, 6, 5, 6, 6, 7,
   846  	3, 4, 4, 5, 4, 5, 5, 6,
   847  	4, 5, 5, 6, 5, 6, 6, 7,
   848  	4, 5, 5, 6, 5, 6, 6, 7,
   849  	5, 6, 6, 7, 6, 7, 7, 8}
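
         // For example (editor's sketch): oneBitCount[0xa5] == 4, since
         // 0xa5 == 10100101 has four bits set. The table is a byte-at-a-time
         // population count; the popcount intrinsic mentioned in the TODO on
         // countFree below would compute the same value in one instruction.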
   850  
   851  // countFree runs through the mark bits in a span and counts the number of free objects
   852  // in the span.
   853  // TODO:(rlh) Use popcount intrinsic.
   854  func (s *mspan) countFree() int {
   855  	count := 0
   856  	maxIndex := s.nelems / 8
   857  	for i := uintptr(0); i < maxIndex; i++ {
   858  		mrkBits := *addb(s.gcmarkBits, i)
   859  		count += int(oneBitCount[mrkBits])
   860  	}
   861  	if bitsInLastByte := s.nelems % 8; bitsInLastByte != 0 {
   862  		mrkBits := *addb(s.gcmarkBits, maxIndex)
   863  		mask := uint8((1 << bitsInLastByte) - 1)
   864  		bits := mrkBits & mask
   865  		count += int(oneBitCount[bits])
   866  	}
   867  	return int(s.nelems) - count
   868  }
   869  
   870  // heapBitsSetType records that the new allocation [x, x+size)
   871  // holds in [x, x+dataSize) one or more values of type typ.
   872  // (The number of values is given by dataSize / typ.size.)
   873  // If dataSize < size, the fragment [x+dataSize, x+size) is
   874  // recorded as non-pointer data.
   875  // It is known that the type has pointers somewhere;
   876  // malloc does not call heapBitsSetType when there are no pointers,
   877  // because all free objects are marked as noscan during
   878  // heapBitsSweepSpan.
   879  //
   880  // There can only be one allocation from a given span active at a time,
   881  // and the bitmap for a span always falls on byte boundaries,
   882  // so there are no write-write races for access to the heap bitmap.
   883  // Hence, heapBitsSetType can access the bitmap without atomics.
   884  //
   885  // There can be read-write races between heapBitsSetType and things
   886  // that read the heap bitmap like scanobject. However, since
   887  // heapBitsSetType is only used for objects that have not yet been
   888  // made reachable, readers will ignore bits being modified by this
   889  // function. This does mean this function cannot transiently modify
   890  // bits that belong to neighboring objects. Also, on weakly-ordered
   891  // machines, callers must execute a store/store (publication) barrier
   892  // between calling this function and making the object reachable.
   893  func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
   894  	const doubleCheck = false // slow but helpful; enable to test modifications to this code
   895  
    896  	// size is always dataSize rounded up to the next malloc size class,
    897  	// except in the case of allocating a defer block, in which case
    898  	// typ.size is sizeof(_defer{}) (at least 6 words) and dataSize may be
   899  	// arbitrarily larger.
   900  	//
   901  	// The checks for size == sys.PtrSize and size == 2*sys.PtrSize can therefore
   902  	// assume that dataSize == size without checking it explicitly.
   903  
   904  	if sys.PtrSize == 8 && size == sys.PtrSize {
   905  		// It's one word and it has pointers, it must be a pointer.
   906  		// Since all allocated one-word objects are pointers
   907  		// (non-pointers are aggregated into tinySize allocations),
   908  		// initSpan sets the pointer bits for us. Nothing to do here.
   909  		if doubleCheck {
   910  			h := heapBitsForAddr(x)
   911  			if !h.isPointer() {
   912  				throw("heapBitsSetType: pointer bit missing")
   913  			}
   914  			if !h.morePointers() {
   915  				throw("heapBitsSetType: scan bit missing")
   916  			}
   917  		}
   918  		return
   919  	}
   920  
   921  	h := heapBitsForAddr(x)
   922  	ptrmask := typ.gcdata // start of 1-bit pointer mask (or GC program, handled below)
   923  
    924  	// The heap bitmap bits for a 2-word object are only 4 bits,
    925  	// so the byte that holds them is shared with neighboring objects.
   926  	// This is called out as a special case primarily for 32-bit systems,
   927  	// so that on 32-bit systems the code below can assume all objects
   928  	// are 4-word aligned (because they're all 16-byte aligned).
   929  	if size == 2*sys.PtrSize {
   930  		if typ.size == sys.PtrSize {
   931  			// We're allocating a block big enough to hold two pointers.
   932  			// On 64-bit, that means the actual object must be two pointers,
   933  			// or else we'd have used the one-pointer-sized block.
   934  			// On 32-bit, however, this is the 8-byte block, the smallest one.
   935  			// So it could be that we're allocating one pointer and this was
   936  			// just the smallest block available. Distinguish by checking dataSize.
   937  			// (In general the number of instances of typ being allocated is
   938  			// dataSize/typ.size.)
   939  			if sys.PtrSize == 4 && dataSize == sys.PtrSize {
   940  				// 1 pointer object. On 32-bit machines clear the bit for the
   941  				// unused second word.
   942  				*h.bitp &^= (bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << h.shift
   943  				*h.bitp |= (bitPointer | bitScan) << h.shift
   944  			} else {
   945  				// 2-element slice of pointer.
   946  				*h.bitp |= (bitPointer | bitScan | bitPointer<<heapBitsShift) << h.shift
   947  			}
   948  			return
   949  		}
   950  		// Otherwise typ.size must be 2*sys.PtrSize,
   951  		// and typ.kind&kindGCProg == 0.
   952  		if doubleCheck {
   953  			if typ.size != 2*sys.PtrSize || typ.kind&kindGCProg != 0 {
   954  				print("runtime: heapBitsSetType size=", size, " but typ.size=", typ.size, " gcprog=", typ.kind&kindGCProg != 0, "\n")
   955  				throw("heapBitsSetType")
   956  			}
   957  		}
   958  		b := uint32(*ptrmask)
   959  		hb := (b & 3) | bitScan
   960  		// bitPointer == 1, bitScan is 1 << 4, heapBitsShift is 1.
    961  		// The mask 110011 is shifted left by h.shift and complemented.
   962  		// This clears out the bits that are about to be
    963  		// ored into *h.bitp in the next instructions.
   964  		*h.bitp &^= (bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << h.shift
   965  		*h.bitp |= uint8(hb << h.shift)
   966  		return
   967  	}
   968  
   969  	// Copy from 1-bit ptrmask into 2-bit bitmap.
   970  	// The basic approach is to use a single uintptr as a bit buffer,
   971  	// alternating between reloading the buffer and writing bitmap bytes.
   972  	// In general, one load can supply two bitmap byte writes.
   973  	// This is a lot of lines of code, but it compiles into relatively few
   974  	// machine instructions.
   975  
   976  	var (
   977  		// Ptrmask input.
   978  		p     *byte   // last ptrmask byte read
   979  		b     uintptr // ptrmask bits already loaded
   980  		nb    uintptr // number of bits in b at next read
   981  		endp  *byte   // final ptrmask byte to read (then repeat)
   982  		endnb uintptr // number of valid bits in *endp
   983  		pbits uintptr // alternate source of bits
   984  
   985  		// Heap bitmap output.
   986  		w     uintptr // words processed
   987  		nw    uintptr // number of words to process
   988  		hbitp *byte   // next heap bitmap byte to write
   989  		hb    uintptr // bits being prepared for *hbitp
   990  	)
   991  
   992  	hbitp = h.bitp
   993  
   994  	// Handle GC program. Delayed until this part of the code
   995  	// so that we can use the same double-checking mechanism
   996  	// as the 1-bit case. Nothing above could have encountered
   997  	// GC programs: the cases were all too small.
   998  	if typ.kind&kindGCProg != 0 {
   999  		heapBitsSetTypeGCProg(h, typ.ptrdata, typ.size, dataSize, size, addb(typ.gcdata, 4))
  1000  		if doubleCheck {
  1001  			// Double-check the heap bits written by GC program
  1002  			// by running the GC program to create a 1-bit pointer mask
  1003  			// and then jumping to the double-check code below.
  1004  			// This doesn't catch bugs shared between the 1-bit and 4-bit
  1005  			// GC program execution, but it does catch mistakes specific
  1006  			// to just one of those and bugs in heapBitsSetTypeGCProg's
  1007  			// implementation of arrays.
  1008  			lock(&debugPtrmask.lock)
  1009  			if debugPtrmask.data == nil {
  1010  				debugPtrmask.data = (*byte)(persistentalloc(1<<20, 1, &memstats.other_sys))
  1011  			}
  1012  			ptrmask = debugPtrmask.data
  1013  			runGCProg(addb(typ.gcdata, 4), nil, ptrmask, 1)
  1014  			goto Phase4
  1015  		}
  1016  		return
  1017  	}
  1018  
  1019  	// Note about sizes:
  1020  	//
   1021  	// typ.size is the number of bytes in the object,
   1022  	// and typ.ptrdata is the number of bytes in the prefix
   1023  	// of the object that contains pointers. That is, the final
   1024  	// typ.size - typ.ptrdata bytes contain no pointers.
  1025  	// This allows optimization of a common pattern where
  1026  	// an object has a small header followed by a large scalar
  1027  	// buffer. If we know the pointers are over, we don't have
  1028  	// to scan the buffer's heap bitmap at all.
  1029  	// The 1-bit ptrmasks are sized to contain only bits for
  1030  	// the typ.ptrdata prefix, zero padded out to a full byte
  1031  	// of bitmap. This code sets nw (below) so that heap bitmap
  1032  	// bits are only written for the typ.ptrdata prefix; if there is
  1033  	// more room in the allocated object, the next heap bitmap
  1034  	// entry is a 00, indicating that there are no more pointers
  1035  	// to scan. So only the ptrmask for the ptrdata bytes is needed.
  1036  	//
  1037  	// Replicated copies are not as nice: if there is an array of
   1038  	// objects with scalar tails, all but the last tail do have to
  1039  	// be initialized, because there is no way to say "skip forward".
  1040  	// However, because of the possibility of a repeated type with
  1041  	// size not a multiple of 4 pointers (one heap bitmap byte),
  1042  	// the code already must handle the last ptrmask byte specially
  1043  	// by treating it as containing only the bits for endnb pointers,
  1044  	// where endnb <= 4. We represent large scalar tails that must
  1045  	// be expanded in the replication by setting endnb larger than 4.
  1046  	// This will have the effect of reading many bits out of b,
  1047  	// but once the real bits are shifted out, b will supply as many
  1048  	// zero bits as we try to read, which is exactly what we need.
  1049  
  1050  	p = ptrmask
  1051  	if typ.size < dataSize {
  1052  		// Filling in bits for an array of typ.
  1053  		// Set up for repetition of ptrmask during main loop.
   1054  		// Note that ptrmask describes only the pointer-bearing typ.ptrdata prefix of typ.
  1055  		const maxBits = sys.PtrSize*8 - 7
  1056  		if typ.ptrdata/sys.PtrSize <= maxBits {
  1057  			// Entire ptrmask fits in uintptr with room for a byte fragment.
  1058  			// Load into pbits and never read from ptrmask again.
  1059  			// This is especially important when the ptrmask has
  1060  			// fewer than 8 bits in it; otherwise the reload in the middle
  1061  			// of the Phase 2 loop would itself need to loop to gather
  1062  			// at least 8 bits.
  1063  
  1064  			// Accumulate ptrmask into b.
  1065  			// ptrmask is sized to describe only typ.ptrdata, but we record
  1066  			// it as describing typ.size bytes, since all the high bits are zero.
  1067  			nb = typ.ptrdata / sys.PtrSize
  1068  			for i := uintptr(0); i < nb; i += 8 {
  1069  				b |= uintptr(*p) << i
  1070  				p = add1(p)
  1071  			}
  1072  			nb = typ.size / sys.PtrSize
  1073  
  1074  			// Replicate ptrmask to fill entire pbits uintptr.
  1075  			// Doubling and truncating is fewer steps than
  1076  			// iterating by nb each time. (nb could be 1.)
  1077  			// Since we loaded typ.ptrdata/sys.PtrSize bits
  1078  			// but are pretending to have typ.size/sys.PtrSize,
  1079  			// there might be no replication necessary/possible.
  1080  			pbits = b
  1081  			endnb = nb
  1082  			if nb+nb <= maxBits {
  1083  				for endnb <= sys.PtrSize*8 {
  1084  					pbits |= pbits << endnb
  1085  					endnb += endnb
  1086  				}
  1087  				// Truncate to a multiple of original ptrmask.
  1088  				endnb = maxBits / nb * nb
  1089  				pbits &= 1<<endnb - 1
  1090  				b = pbits
  1091  				nb = endnb
  1092  			}
  1093  
  1094  			// Clear p and endp as sentinel for using pbits.
  1095  			// Checked during Phase 2 loop.
  1096  			p = nil
  1097  			endp = nil
  1098  		} else {
  1099  			// Ptrmask is larger. Read it multiple times.
  1100  			n := (typ.ptrdata/sys.PtrSize+7)/8 - 1
  1101  			endp = addb(ptrmask, n)
  1102  			endnb = typ.size/sys.PtrSize - n*8
  1103  		}
  1104  	}
  1105  	if p != nil {
  1106  		b = uintptr(*p)
  1107  		p = add1(p)
  1108  		nb = 8
  1109  	}
  1110  
  1111  	if typ.size == dataSize {
  1112  		// Single entry: can stop once we reach the non-pointer data.
  1113  		nw = typ.ptrdata / sys.PtrSize
  1114  	} else {
  1115  		// Repeated instances of typ in an array.
  1116  		// Have to process first N-1 entries in full, but can stop
  1117  		// once we reach the non-pointer data in the final entry.
  1118  		nw = ((dataSize/typ.size-1)*typ.size + typ.ptrdata) / sys.PtrSize
  1119  	}
  1120  	if nw == 0 {
  1121  		// No pointers! Caller was supposed to check.
  1122  		println("runtime: invalid type ", typ.string())
  1123  		throw("heapBitsSetType: called with non-pointer type")
  1124  		return
  1125  	}
  1126  	if nw < 2 {
  1127  		// Must write at least 2 words, because the "no scan"
  1128  		// encoding doesn't take effect until the third word.
  1129  		nw = 2
  1130  	}
  1131  
  1132  	// Phase 1: Special case for leading byte (shift==0) or half-byte (shift==4).
  1133  	// The leading byte is special because it contains the bits for word 1,
  1134  	// which does not have the scan bit set.
  1135  	// The leading half-byte is special because it's a half a byte,
  1136  	// so we have to be careful with the bits already there.
  1137  	switch {
  1138  	default:
  1139  		throw("heapBitsSetType: unexpected shift")
  1140  
  1141  	case h.shift == 0:
  1142  		// Ptrmask and heap bitmap are aligned.
  1143  		// Handle first byte of bitmap specially.
  1144  		//
  1145  		// The first byte we write out covers the first four
  1146  		// words of the object. The scan/dead bit on the first
  1147  		// word must be set to scan since there are pointers
  1148  		// somewhere in the object. The scan/dead bit on the
  1149  		// second word is the checkmark, so we don't set it.
  1150  		// In all following words, we set the scan/dead
   1151  		// appropriately to indicate that the object continues
  1152  		// to the next 2-bit entry in the bitmap.
  1153  		//
  1154  		// TODO: It doesn't matter if we set the checkmark, so
  1155  		// maybe this case isn't needed any more.
  1156  		hb = b & bitPointerAll
  1157  		hb |= bitScan | bitScan<<(2*heapBitsShift) | bitScan<<(3*heapBitsShift)
  1158  		if w += 4; w >= nw {
  1159  			goto Phase3
  1160  		}
  1161  		*hbitp = uint8(hb)
  1162  		hbitp = subtract1(hbitp)
  1163  		b >>= 4
  1164  		nb -= 4
  1165  
  1166  	case sys.PtrSize == 8 && h.shift == 2:
  1167  		// Ptrmask and heap bitmap are misaligned.
  1168  		// The bits for the first two words are in a byte shared
  1169  		// with another object, so we must be careful with the bits
  1170  		// already there.
  1171  		// We took care of 1-word and 2-word objects above,
  1172  		// so this is at least a 6-word object.
  1173  		hb = (b & (bitPointer | bitPointer<<heapBitsShift)) << (2 * heapBitsShift)
  1174  		// This is not noscan, so set the scan bit in the
  1175  		// first word.
  1176  		hb |= bitScan << (2 * heapBitsShift)
  1177  		b >>= 2
  1178  		nb -= 2
  1179  		// Note: no bitScan for second word because that's
  1180  		// the checkmark.
  1181  		*hbitp &^= uint8((bitPointer | bitScan | (bitPointer << heapBitsShift)) << (2 * heapBitsShift))
  1182  		*hbitp |= uint8(hb)
  1183  		hbitp = subtract1(hbitp)
  1184  		if w += 2; w >= nw {
  1185  			// We know that there is more data, because we handled 2-word objects above.
  1186  			// This must be at least a 6-word object. If we're out of pointer words,
  1187  			// mark no scan in next bitmap byte and finish.
  1188  			hb = 0
  1189  			w += 4
  1190  			goto Phase3
  1191  		}
  1192  	}
  1193  
  1194  	// Phase 2: Full bytes in bitmap, up to but not including write to last byte (full or partial) in bitmap.
  1195  	// The loop computes the bits for that last write but does not execute the write;
  1196  	// it leaves the bits in hb for processing by phase 3.
  1197  	// To avoid repeated adjustment of nb, we subtract out the 4 bits we're going to
  1198  	// use in the first half of the loop right now, and then we only adjust nb explicitly
  1199  	// if the 8 bits used by each iteration isn't balanced by 8 bits loaded mid-loop.
  1200  	nb -= 4
  1201  	for {
  1202  		// Emit bitmap byte.
  1203  		// b has at least nb+4 bits, with one exception:
  1204  		// if w+4 >= nw, then b has only nw-w bits,
  1205  		// but we'll stop at the break and then truncate
  1206  		// appropriately in Phase 3.
  1207  		hb = b & bitPointerAll
  1208  		hb |= bitScanAll
  1209  		if w += 4; w >= nw {
  1210  			break
  1211  		}
  1212  		*hbitp = uint8(hb)
  1213  		hbitp = subtract1(hbitp)
  1214  		b >>= 4
  1215  
  1216  		// Load more bits. b has nb right now.
  1217  		if p != endp {
  1218  			// Fast path: keep reading from ptrmask.
  1219  			// nb unmodified: we just loaded 8 bits,
  1220  			// and the next iteration will consume 8 bits,
  1221  			// leaving us with the same nb the next time we're here.
  1222  			if nb < 8 {
  1223  				b |= uintptr(*p) << nb
  1224  				p = add1(p)
  1225  			} else {
  1226  				// Reduce the number of bits in b.
  1227  				// This is important if we skipped
  1228  				// over a scalar tail, since nb could
  1229  				// be larger than the bit width of b.
  1230  				nb -= 8
  1231  			}
  1232  		} else if p == nil {
  1233  			// Almost as fast path: track bit count and refill from pbits.
  1234  			// For short repetitions.
  1235  			if nb < 8 {
  1236  				b |= pbits << nb
  1237  				nb += endnb
  1238  			}
  1239  			nb -= 8 // for next iteration
  1240  		} else {
  1241  			// Slow path: reached end of ptrmask.
  1242  			// Process final partial byte and rewind to start.
  1243  			b |= uintptr(*p) << nb
  1244  			nb += endnb
  1245  			if nb < 8 {
  1246  				b |= uintptr(*ptrmask) << nb
  1247  				p = add1(ptrmask)
  1248  			} else {
  1249  				nb -= 8
  1250  				p = ptrmask
  1251  			}
  1252  		}
  1253  
  1254  		// Emit bitmap byte.
  1255  		hb = b & bitPointerAll
  1256  		hb |= bitScanAll
  1257  		if w += 4; w >= nw {
  1258  			break
  1259  		}
  1260  		*hbitp = uint8(hb)
  1261  		hbitp = subtract1(hbitp)
  1262  		b >>= 4
  1263  	}
  1264  
  1265  Phase3:
  1266  	// Phase 3: Write last byte or partial byte and zero the rest of the bitmap entries.
  1267  	if w > nw {
  1268  		// Counting the 4 entries in hb not yet written to memory,
  1269  		// there are more entries than possible pointer slots.
  1270  		// Discard the excess entries (can't be more than 3).
  1271  		mask := uintptr(1)<<(4-(w-nw)) - 1
  1272  		hb &= mask | mask<<4 // apply mask to both pointer bits and scan bits
  1273  	}
  1274  
  1275  	// Change nw from counting possibly-pointer words to total words in allocation.
  1276  	nw = size / sys.PtrSize
  1277  
  1278  	// Write whole bitmap bytes.
  1279  	// The first is hb, the rest are zero.
  1280  	if w <= nw {
  1281  		*hbitp = uint8(hb)
  1282  		hbitp = subtract1(hbitp)
  1283  		hb = 0 // for possible final half-byte below
  1284  		for w += 4; w <= nw; w += 4 {
  1285  			*hbitp = 0
  1286  			hbitp = subtract1(hbitp)
  1287  		}
  1288  	}
  1289  
  1290  	// Write final partial bitmap byte if any.
  1291  	// We know w > nw, or else we'd still be in the loop above.
  1292  	// It can be bigger only due to the 4 entries in hb that it counts.
  1293  	// If w == nw+4 then there's nothing left to do: we wrote all nw entries
  1294  	// and can discard the 4 sitting in hb.
  1295  	// But if w == nw+2, we need to write first two in hb.
  1296  	// The byte is shared with the next object, so be careful with
  1297  	// existing bits.
  1298  	if w == nw+2 {
  1299  		*hbitp = *hbitp&^(bitPointer|bitScan|(bitPointer|bitScan)<<heapBitsShift) | uint8(hb)
  1300  	}
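        	// For illustration (sizes assumed, not from the original source):
        	// a 6-word allocation fills the byte for words 0-3 in the loop
        	// above and ends with w == nw+2 == 8, so the entries for words 4
        	// and 5 are merged into a bitmap byte whose other half describes
        	// the start of the next object.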
  1301  
  1302  Phase4:
  1303  	// Phase 4: all done, but perhaps double check.
  1304  	if doubleCheck {
  1305  		end := heapBitsForAddr(x + size)
  1306  		if typ.kind&kindGCProg == 0 && (hbitp != end.bitp || (w == nw+2) != (end.shift == 2)) {
  1307  			println("ended at wrong bitmap byte for", typ.string(), "x", dataSize/typ.size)
  1308  			print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
  1309  			print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
  1310  			h0 := heapBitsForAddr(x)
  1311  			print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
  1312  			print("ended at hbitp=", hbitp, " but next starts at bitp=", end.bitp, " shift=", end.shift, "\n")
  1313  			throw("bad heapBitsSetType")
  1314  		}
  1315  
  1316  		// Double-check that bits to be written were written correctly.
  1317  		// Does not check that other bits were not written, unfortunately.
  1318  		h := heapBitsForAddr(x)
  1319  		nptr := typ.ptrdata / sys.PtrSize
  1320  		ndata := typ.size / sys.PtrSize
  1321  		count := dataSize / typ.size
  1322  		totalptr := ((count-1)*typ.size + typ.ptrdata) / sys.PtrSize
  1323  		for i := uintptr(0); i < size/sys.PtrSize; i++ {
  1324  			j := i % ndata
  1325  			var have, want uint8
  1326  			have = (*h.bitp >> h.shift) & (bitPointer | bitScan)
  1327  			if i >= totalptr {
  1328  				want = 0 // dead encoding
  1329  				if typ.kind&kindGCProg != 0 && i < (totalptr+3)/4*4 {
  1330  					want = bitScan
  1331  				}
  1332  			} else {
  1333  				if j < nptr && (*addb(ptrmask, j/8)>>(j%8))&1 != 0 {
  1334  					want |= bitPointer
  1335  				}
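        				// Word 1's high bit is the checkmark bit, not
        				// a scan bit, so it is excluded from the check.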
  1336  				if i != 1 {
  1337  					want |= bitScan
  1338  				} else {
  1339  					have &^= bitScan
  1340  				}
  1341  			}
  1342  			if have != want {
  1343  				println("mismatch writing bits for", typ.string(), "x", dataSize/typ.size)
  1344  				print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
  1345  				print("kindGCProg=", typ.kind&kindGCProg != 0, "\n")
  1346  				print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
  1347  				h0 := heapBitsForAddr(x)
  1348  				print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
  1349  				print("current bits h.bitp=", h.bitp, " h.shift=", h.shift, " *h.bitp=", hex(*h.bitp), "\n")
  1350  				print("ptrmask=", ptrmask, " p=", p, " endp=", endp, " endnb=", endnb, " pbits=", hex(pbits), " b=", hex(b), " nb=", nb, "\n")
  1351  				println("at word", i, "offset", i*sys.PtrSize, "have", have, "want", want)
  1352  				if typ.kind&kindGCProg != 0 {
  1353  					println("GC program:")
  1354  					dumpGCProg(addb(typ.gcdata, 4))
  1355  				}
  1356  				throw("bad heapBitsSetType")
  1357  			}
  1358  			h = h.next()
  1359  		}
  1360  		if ptrmask == debugPtrmask.data {
  1361  			unlock(&debugPtrmask.lock)
  1362  		}
  1363  	}
  1364  }
  1365  
  1366  // heapBitsSetTypeNoScan marks x as noscan by setting the first word
  1367  // of x in the heap bitmap to scalar/dead.
  1368  func heapBitsSetTypeNoScan(x uintptr) {
  1369  	h := heapBitsForAddr(x)
  1370  	*h.bitp &^= (bitPointer | bitScan) << h.shift
  1371  }
  1372  
  1373  var debugPtrmask struct {
  1374  	lock mutex
  1375  	data *byte
  1376  }
  1377  
  1378  // heapBitsSetTypeGCProg implements heapBitsSetType using a GC program.
  1379  // progSize is the size of the memory described by the program.
  1380  // elemSize is the size of the element that the GC program describes (the program may describe only a prefix of the element).
  1381  // dataSize is the total size of the intended data, a multiple of elemSize.
  1382  // allocSize is the total size of the allocated memory.
  1383  //
  1384  // GC programs are only used for large allocations.
  1385  // heapBitsSetType requires that allocSize is a multiple of 4 words,
  1386  // so that the relevant bitmap bytes are not shared with surrounding
  1387  // objects.
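        // With 8-byte pointers, 4 words is 32 bytes; the sys.PtrSize == 8
        // check below enforces this requirement on 64-bit systems.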
  1388  func heapBitsSetTypeGCProg(h heapBits, progSize, elemSize, dataSize, allocSize uintptr, prog *byte) {
  1389  	if sys.PtrSize == 8 && allocSize%(4*sys.PtrSize) != 0 {
  1390  		// Alignment will be wrong.
  1391  		throw("heapBitsSetTypeGCProg: small allocation")
  1392  	}
  1393  	var totalBits uintptr
  1394  	if elemSize == dataSize {
  1395  		totalBits = runGCProg(prog, nil, h.bitp, 2)
  1396  		if totalBits*sys.PtrSize != progSize {
  1397  			println("runtime: heapBitsSetTypeGCProg: total bits", totalBits, "but progSize", progSize)
  1398  			throw("heapBitsSetTypeGCProg: unexpected bit count")
  1399  		}
  1400  	} else {
  1401  		count := dataSize / elemSize
  1402  
  1403  		// Piece together program trailer to run after prog that does:
  1404  		//	literal(0)
  1405  		//	repeat(1, elemSize-progSize-1) // zeros to fill element size
  1406  		//	repeat(elemSize, count-1) // repeat that element for count
  1407  		// This zero-pads the data remaining in the first element and then
  1408  		// repeats that first element to fill the array.
  1409  		var trailer [40]byte // 3 varints (max 10 each) + some bytes
  1410  		i := 0
  1411  		if n := elemSize/sys.PtrSize - progSize/sys.PtrSize; n > 0 {
  1412  			// literal(0)
  1413  			trailer[i] = 0x01
  1414  			i++
  1415  			trailer[i] = 0
  1416  			i++
  1417  			if n > 1 {
  1418  				// repeat(1, n-1)
  1419  				trailer[i] = 0x81
  1420  				i++
  1421  				n--
  1422  				for ; n >= 0x80; n >>= 7 {
  1423  					trailer[i] = byte(n | 0x80)
  1424  					i++
  1425  				}
  1426  				trailer[i] = byte(n)
  1427  				i++
  1428  			}
  1429  		}
  1430  		// repeat(elemSize/ptrSize, count-1)
  1431  		trailer[i] = 0x80
  1432  		i++
  1433  		n := elemSize / sys.PtrSize
  1434  		for ; n >= 0x80; n >>= 7 {
  1435  			trailer[i] = byte(n | 0x80)
  1436  			i++
  1437  		}
  1438  		trailer[i] = byte(n)
  1439  		i++
  1440  		n = count - 1
  1441  		for ; n >= 0x80; n >>= 7 {
  1442  			trailer[i] = byte(n | 0x80)
  1443  			i++
  1444  		}
  1445  		trailer[i] = byte(n)
  1446  		i++
  1447  		trailer[i] = 0
  1448  		i++
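        		// For illustration (numbers assumed, not from any real type):
        		// with elemSize of 8 words, progSize of 3 words, and count 4,
        		// the trailer assembled above would be
        		//	0x01 0x00        literal(0)
        		//	0x81 0x04        repeat(1, 4): pad element to 8 words
        		//	0x80 0x08 0x03   repeat(8, 3): 3 more copies of the element
        		//	0x00             stop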
  1449  
  1450  		runGCProg(prog, &trailer[0], h.bitp, 2)
  1451  
  1452  		// Even though we filled in the full array just now,
  1453  		// record that we only filled in up to the ptrdata of the
  1454  		// last element. This will cause the code below to
  1455  		// memclr the dead section of the final array element,
  1456  		// so that scanobject can stop early in the final element.
  1457  		totalBits = (elemSize*(count-1) + progSize) / sys.PtrSize
  1458  	}
  1459  	endProg := unsafe.Pointer(subtractb(h.bitp, (totalBits+3)/4))
  1460  	endAlloc := unsafe.Pointer(subtractb(h.bitp, allocSize/heapBitmapScale))
  1461  	memclrNoHeapPointers(add(endAlloc, 1), uintptr(endProg)-uintptr(endAlloc))
  1462  }
  1463  
  1464  // progToPointerMask returns the 1-bit pointer mask output by the GC program prog.
  1465  // size is the size of the region described by prog, in bytes.
  1466  // The resulting bitvector will have no more than size/sys.PtrSize bits.
  1467  func progToPointerMask(prog *byte, size uintptr) bitvector {
  1468  	n := (size/sys.PtrSize + 7) / 8
  1469  	x := (*[1 << 30]byte)(persistentalloc(n+1, 1, &memstats.buckhash_sys))[:n+1]
  1470  	x[len(x)-1] = 0xa1 // overflow check sentinel
  1471  	n = runGCProg(prog, nil, &x[0], 1)
  1472  	if x[len(x)-1] != 0xa1 {
  1473  		throw("progToPointerMask: overflow")
  1474  	}
  1475  	return bitvector{int32(n), &x[0]}
  1476  }
  1477  
  1478  // Packed GC pointer bitmaps, aka GC programs.
  1479  //
  1480  // For large types containing arrays, the type information has a
  1481  // natural repetition that can be encoded to save space in the
  1482  // binary and in the memory representation of the type information.
  1483  //
  1484  // The encoding is a simple Lempel-Ziv style bytecode machine
  1485  // with the following instructions:
  1486  //
  1487  //	00000000: stop
  1488  //	0nnnnnnn: emit n bits copied from the next (n+7)/8 bytes
  1489  //	10000000 n c: repeat the previous n bits c times; n, c are varints
  1490  //	1nnnnnnn c: repeat the previous n bits c times; c is a varint
  1491  
  1492  // runGCProg executes the GC program prog, and then trailer if non-nil,
  1493  // writing to dst with entries of the given size.
  1494  // If size == 1, dst is a 1-bit pointer mask laid out moving forward from dst.
  1495  // If size == 2, dst is the 2-bit heap bitmap, and writes move backward
  1496  // starting at dst (because the heap bitmap is laid out backward in memory). In this case, the caller guarantees
  1497  // that only whole bytes in dst need to be written.
  1498  //
  1499  // runGCProg returns the number of 1- or 2-bit entries written to memory.
  1500  func runGCProg(prog, trailer, dst *byte, size int) uintptr {
  1501  	dstStart := dst
  1502  
  1503  	// Bits waiting to be written to memory.
  1504  	var bits uintptr
  1505  	var nbits uintptr
  1506  
  1507  	p := prog
  1508  Run:
  1509  	for {
  1510  		// Flush accumulated full bytes.
  1511  		// The rest of the loop assumes that nbits <= 7.
  1512  		for ; nbits >= 8; nbits -= 8 {
  1513  			if size == 1 {
  1514  				*dst = uint8(bits)
  1515  				dst = add1(dst)
  1516  				bits >>= 8
  1517  			} else {
  1518  				v := bits&bitPointerAll | bitScanAll
  1519  				*dst = uint8(v)
  1520  				dst = subtract1(dst)
  1521  				bits >>= 4
  1522  				v = bits&bitPointerAll | bitScanAll
  1523  				*dst = uint8(v)
  1524  				dst = subtract1(dst)
  1525  				bits >>= 4
  1526  			}
  1527  		}
  1528  
  1529  		// Process one instruction.
  1530  		inst := uintptr(*p)
  1531  		p = add1(p)
  1532  		n := inst & 0x7F
  1533  		if inst&0x80 == 0 {
  1534  			// Literal bits; n == 0 means end of program.
  1535  			if n == 0 {
  1536  				// Program is over; continue in trailer if present.
  1537  				if trailer != nil {
  1538  					//println("trailer")
  1539  					p = trailer
  1540  					trailer = nil
  1541  					continue
  1542  				}
  1543  				//println("done")
  1544  				break Run
  1545  			}
  1546  			//println("lit", n, dst)
  1547  			nbyte := n / 8
  1548  			for i := uintptr(0); i < nbyte; i++ {
  1549  				bits |= uintptr(*p) << nbits
  1550  				p = add1(p)
  1551  				if size == 1 {
  1552  					*dst = uint8(bits)
  1553  					dst = add1(dst)
  1554  					bits >>= 8
  1555  				} else {
  1556  					v := bits&0xf | bitScanAll
  1557  					*dst = uint8(v)
  1558  					dst = subtract1(dst)
  1559  					bits >>= 4
  1560  					v = bits&0xf | bitScanAll
  1561  					*dst = uint8(v)
  1562  					dst = subtract1(dst)
  1563  					bits >>= 4
  1564  				}
  1565  			}
  1566  			if n %= 8; n > 0 {
  1567  				bits |= uintptr(*p) << nbits
  1568  				p = add1(p)
  1569  				nbits += n
  1570  			}
  1571  			continue Run
  1572  		}
  1573  
  1574  		// Repeat. If n == 0, it is encoded in a varint in the next bytes.
  1575  		if n == 0 {
  1576  			for off := uint(0); ; off += 7 {
  1577  				x := uintptr(*p)
  1578  				p = add1(p)
  1579  				n |= (x & 0x7F) << off
  1580  				if x&0x80 == 0 {
  1581  					break
  1582  				}
  1583  			}
  1584  		}
  1585  
  1586  		// Count is encoded in a varint in the next bytes.
  1587  		c := uintptr(0)
  1588  		for off := uint(0); ; off += 7 {
  1589  			x := uintptr(*p)
  1590  			p = add1(p)
  1591  			c |= (x & 0x7F) << off
  1592  			if x&0x80 == 0 {
  1593  				break
  1594  			}
  1595  		}
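        		// (Illustrative: a count of 300 arrives as the bytes 0xAC 0x02,
        		// low 7 bits first, with the high bit marking continuation.)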
  1596  		c *= n // now total number of bits to copy
  1597  
  1598  		// If the number of bits being repeated is small, load them
  1599  		// into a register and use that register for the entire loop
  1600  		// instead of repeatedly reading from memory.
  1601  		// Handling fewer than 8 bits here makes the general loop simpler.
  1602  		// The cutoff is sys.PtrSize*8 - 7 to guarantee that when we add
  1603  		// the pattern to a bit buffer holding at most 7 bits (a partial byte)
  1604  		// it will not overflow.
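        		// (maxBits works out to 57 with 8-byte pointers
        		// and 25 with 4-byte pointers.)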
  1605  		src := dst
  1606  		const maxBits = sys.PtrSize*8 - 7
  1607  		if n <= maxBits {
  1608  			// Start with bits in output buffer.
  1609  			pattern := bits
  1610  			npattern := nbits
  1611  
  1612  			// If we need more bits, fetch them from memory.
  1613  			if size == 1 {
  1614  				src = subtract1(src)
  1615  				for npattern < n {
  1616  					pattern <<= 8
  1617  					pattern |= uintptr(*src)
  1618  					src = subtract1(src)
  1619  					npattern += 8
  1620  				}
  1621  			} else {
  1622  				src = add1(src)
  1623  				for npattern < n {
  1624  					pattern <<= 4
  1625  					pattern |= uintptr(*src) & 0xf
  1626  					src = add1(src)
  1627  					npattern += 4
  1628  				}
  1629  			}
  1630  
  1631  			// We started with the whole bit output buffer,
  1632  			// and then we loaded bits from whole bytes.
  1633  			// Either way, we might now have too many instead of too few.
  1634  			// Discard the extra.
  1635  			if npattern > n {
  1636  				pattern >>= npattern - n
  1637  				npattern = n
  1638  			}
  1639  
  1640  			// Replicate pattern to at most maxBits.
  1641  			if npattern == 1 {
  1642  				// One bit being repeated.
  1643  				// If the bit is 1, make the pattern all 1s.
  1644  				// If the bit is 0, the pattern is already all 0s,
  1645  				// but we can claim that the number of bits
  1646  				// in the word is equal to the number we need (c),
  1647  				// because right shift of bits will zero fill.
  1648  				if pattern == 1 {
  1649  					pattern = 1<<maxBits - 1
  1650  					npattern = maxBits
  1651  				} else {
  1652  					npattern = c
  1653  				}
  1654  			} else {
  1655  				b := pattern
  1656  				nb := npattern
  1657  				if nb+nb <= maxBits {
  1658  					// Double pattern until the whole uintptr is filled.
  1659  					for nb <= sys.PtrSize*8 {
  1660  						b |= b << nb
  1661  						nb += nb
  1662  					}
  1663  					// Trim away incomplete copy of original pattern in high bits.
  1664  					// TODO(rsc): Replace with table lookup or loop on systems without divide?
  1665  					nb = maxBits / npattern * npattern
  1666  					b &= 1<<nb - 1
  1667  					pattern = b
  1668  					npattern = nb
  1669  				}
  1670  			}
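        			// For illustration: a 3-bit pattern 101 doubles until the
        			// word is full and is then trimmed to 57/3*3 = 57 bits on
        			// a 64-bit system, i.e. 19 whole copies of the pattern.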
  1671  
  1672  			// Add pattern to bit buffer and flush bit buffer, c/npattern times.
  1673  			// Since pattern contains >8 bits, there will be full bytes to flush
  1674  			// on each iteration.
  1675  			for ; c >= npattern; c -= npattern {
  1676  				bits |= pattern << nbits
  1677  				nbits += npattern
  1678  				if size == 1 {
  1679  					for nbits >= 8 {
  1680  						*dst = uint8(bits)
  1681  						dst = add1(dst)
  1682  						bits >>= 8
  1683  						nbits -= 8
  1684  					}
  1685  				} else {
  1686  					for nbits >= 4 {
  1687  						*dst = uint8(bits&0xf | bitScanAll)
  1688  						dst = subtract1(dst)
  1689  						bits >>= 4
  1690  						nbits -= 4
  1691  					}
  1692  				}
  1693  			}
  1694  
  1695  			// Add final fragment to bit buffer.
  1696  			if c > 0 {
  1697  				pattern &= 1<<c - 1
  1698  				bits |= pattern << nbits
  1699  				nbits += c
  1700  			}
  1701  			continue Run
  1702  		}
  1703  
  1704  		// Repeat; n too large to fit in a register.
  1705  		// Since nbits <= 7, we know the first few bytes of repeated data
  1706  		// are already written to memory.
  1707  		off := n - nbits // n > nbits because n > maxBits and nbits <= 7
  1708  		if size == 1 {
  1709  			// Leading src fragment.
  1710  			src = subtractb(src, (off+7)/8)
  1711  			if frag := off & 7; frag != 0 {
  1712  				bits |= uintptr(*src) >> (8 - frag) << nbits
  1713  				src = add1(src)
  1714  				nbits += frag
  1715  				c -= frag
  1716  			}
  1717  			// Main loop: load one byte, write another.
  1718  			// The bits are rotating through the bit buffer.
  1719  			for i := c / 8; i > 0; i-- {
  1720  				bits |= uintptr(*src) << nbits
  1721  				src = add1(src)
  1722  				*dst = uint8(bits)
  1723  				dst = add1(dst)
  1724  				bits >>= 8
  1725  			}
  1726  			// Final src fragment.
  1727  			if c %= 8; c > 0 {
  1728  				bits |= (uintptr(*src) & (1<<c - 1)) << nbits
  1729  				nbits += c
  1730  			}
  1731  		} else {
  1732  			// Leading src fragment.
  1733  			src = addb(src, (off+3)/4)
  1734  			if frag := off & 3; frag != 0 {
  1735  				bits |= (uintptr(*src) & 0xf) >> (4 - frag) << nbits
  1736  				src = subtract1(src)
  1737  				nbits += frag
  1738  				c -= frag
  1739  			}
  1740  			// Main loop: load one byte, write another.
  1741  			// The bits are rotating through the bit buffer.
  1742  			for i := c / 4; i > 0; i-- {
  1743  				bits |= (uintptr(*src) & 0xf) << nbits
  1744  				src = subtract1(src)
  1745  				*dst = uint8(bits&0xf | bitScanAll)
  1746  				dst = subtract1(dst)
  1747  				bits >>= 4
  1748  			}
  1749  			// Final src fragment.
  1750  			if c %= 4; c > 0 {
  1751  				bits |= (uintptr(*src) & (1<<c - 1)) << nbits
  1752  				nbits += c
  1753  			}
  1754  		}
  1755  	}
  1756  
  1757  	// Write any final bits out, using full-byte writes, even for the final byte.
  1758  	var totalBits uintptr
  1759  	if size == 1 {
  1760  		totalBits = (uintptr(unsafe.Pointer(dst))-uintptr(unsafe.Pointer(dstStart)))*8 + nbits
  1761  		nbits += -nbits & 7
  1762  		for ; nbits > 0; nbits -= 8 {
  1763  			*dst = uint8(bits)
  1764  			dst = add1(dst)
  1765  			bits >>= 8
  1766  		}
  1767  	} else {
  1768  		totalBits = (uintptr(unsafe.Pointer(dstStart))-uintptr(unsafe.Pointer(dst)))*4 + nbits
  1769  		nbits += -nbits & 3
  1770  		for ; nbits > 0; nbits -= 4 {
  1771  			v := bits&0xf | bitScanAll
  1772  			*dst = uint8(v)
  1773  			dst = subtract1(dst)
  1774  			bits >>= 4
  1775  		}
  1776  	}
  1777  	return totalBits
  1778  }
  1779  
  1780  func dumpGCProg(p *byte) {
  1781  	nptr := 0
  1782  	for {
  1783  		x := *p
  1784  		p = add1(p)
  1785  		if x == 0 {
  1786  			print("\t", nptr, " end\n")
  1787  			break
  1788  		}
  1789  		if x&0x80 == 0 {
  1790  			print("\t", nptr, " lit ", x, ":")
  1791  			n := int(x+7) / 8
  1792  			for i := 0; i < n; i++ {
  1793  				print(" ", hex(*p))
  1794  				p = add1(p)
  1795  			}
  1796  			print("\n")
  1797  			nptr += int(x)
  1798  		} else {
  1799  			nbit := int(x &^ 0x80)
  1800  			if nbit == 0 {
  1801  				for nb := uint(0); ; nb += 7 {
  1802  					x := *p
  1803  					p = add1(p)
  1804  					nbit |= int(x&0x7f) << nb
  1805  					if x&0x80 == 0 {
  1806  						break
  1807  					}
  1808  				}
  1809  			}
  1810  			count := 0
  1811  			for nb := uint(0); ; nb += 7 {
  1812  				x := *p
  1813  				p = add1(p)
  1814  				count |= int(x&0x7f) << nb
  1815  				if x&0x80 == 0 {
  1816  					break
  1817  				}
  1818  			}
  1819  			print("\t", nptr, " repeat ", nbit, " × ", count, "\n")
  1820  			nptr += nbit * count
  1821  		}
  1822  	}
  1823  }
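
        // For the illustrative program 0x01 0x01 0x81 0x63 0x00 (100 pointer
        // words, as in the encoding example above), dumpGCProg would print:
        //
        //	0 lit 1: 0x1
        //	1 repeat 1 × 99
        //	100 end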
  1824  
  1825  // Testing.
  1826  
  1827  func getgcmaskcb(frame *stkframe, ctxt unsafe.Pointer) bool {
  1828  	target := (*stkframe)(ctxt)
  1829  	if frame.sp <= target.sp && target.sp < frame.varp {
  1830  		*target = *frame
  1831  		return false
  1832  	}
  1833  	return true
  1834  }
  1835  
  1836  // gcbits returns the GC type info for x, for testing.
  1837  // The result is the bitmap entries (0 or 1), one entry per byte.
  1838  //go:linkname reflect_gcbits reflect.gcbits
  1839  func reflect_gcbits(x interface{}) []byte {
  1840  	ret := getgcmask(x)
  1841  	typ := (*ptrtype)(unsafe.Pointer(efaceOf(&x)._type)).elem
  1842  	nptr := typ.ptrdata / sys.PtrSize
  1843  	for uintptr(len(ret)) > nptr && ret[len(ret)-1] == 0 {
  1844  		ret = ret[:len(ret)-1]
  1845  	}
  1846  	return ret
  1847  }
  1848  
  1849  // getgcmask returns the GC type info for object ep, for testing.
  1850  func getgcmask(ep interface{}) (mask []byte) {
  1851  	e := *efaceOf(&ep)
  1852  	p := e.data
  1853  	t := e._type
  1854  	// data or bss
  1855  	for _, datap := range activeModules() {
  1856  		// data
  1857  		if datap.data <= uintptr(p) && uintptr(p) < datap.edata {
  1858  			bitmap := datap.gcdatamask.bytedata
  1859  			n := (*ptrtype)(unsafe.Pointer(t)).elem.size
  1860  			mask = make([]byte, n/sys.PtrSize)
  1861  			for i := uintptr(0); i < n; i += sys.PtrSize {
  1862  				off := (uintptr(p) + i - datap.data) / sys.PtrSize
  1863  				mask[i/sys.PtrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
  1864  			}
  1865  			return
  1866  		}
  1867  
  1868  		// bss
  1869  		if datap.bss <= uintptr(p) && uintptr(p) < datap.ebss {
  1870  			bitmap := datap.gcbssmask.bytedata
  1871  			n := (*ptrtype)(unsafe.Pointer(t)).elem.size
  1872  			mask = make([]byte, n/sys.PtrSize)
  1873  			for i := uintptr(0); i < n; i += sys.PtrSize {
  1874  				off := (uintptr(p) + i - datap.bss) / sys.PtrSize
  1875  				mask[i/sys.PtrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
  1876  			}
  1877  			return
  1878  		}
  1879  	}
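        	// (In both loops above, off/8 selects the bitmap byte and off%8 the
        	// bit within it; e.g. word index 10 of a section is bit 2 of byte 1.)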
  1880  
  1881  	// heap
  1882  	var n uintptr
  1883  	var base uintptr
  1884  	if mlookup(uintptr(p), &base, &n, nil) != 0 {
  1885  		mask = make([]byte, n/sys.PtrSize)
  1886  		for i := uintptr(0); i < n; i += sys.PtrSize {
  1887  			hbits := heapBitsForAddr(base + i)
  1888  			if hbits.isPointer() {
  1889  				mask[i/sys.PtrSize] = 1
  1890  			}
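        			// Skip the end-of-object check at the second word: its
        			// high bit is the checkmark bit, not a morePointers bit.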
  1891  			if i != 1*sys.PtrSize && !hbits.morePointers() {
  1892  				mask = mask[:i/sys.PtrSize]
  1893  				break
  1894  			}
  1895  		}
  1896  		return
  1897  	}
  1898  
  1899  	// stack
  1900  	if _g_ := getg(); _g_.m.curg.stack.lo <= uintptr(p) && uintptr(p) < _g_.m.curg.stack.hi {
  1901  		var frame stkframe
  1902  		frame.sp = uintptr(p)
  1904  		gentraceback(_g_.m.curg.sched.pc, _g_.m.curg.sched.sp, 0, _g_.m.curg, 0, nil, 1000, getgcmaskcb, noescape(unsafe.Pointer(&frame)), 0)
  1905  		if frame.fn != nil {
  1906  			f := frame.fn
  1907  			targetpc := frame.continpc
  1908  			if targetpc == 0 {
  1909  				return
  1910  			}
  1911  			if targetpc != f.entry {
  1912  				targetpc--
  1913  			}
  1914  			pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc, nil)
  1915  			if pcdata == -1 {
  1916  				return
  1917  			}
  1918  			stkmap := (*stackmap)(funcdata(f, _FUNCDATA_LocalsPointerMaps))
  1919  			if stkmap == nil || stkmap.n <= 0 {
  1920  				return
  1921  			}
  1922  			bv := stackmapdata(stkmap, pcdata)
  1923  			size := uintptr(bv.n) * sys.PtrSize
  1924  			n := (*ptrtype)(unsafe.Pointer(t)).elem.size
  1925  			mask = make([]byte, n/sys.PtrSize)
  1926  			for i := uintptr(0); i < n; i += sys.PtrSize {
  1927  				bitmap := bv.bytedata
  1928  				off := (uintptr(p) + i - frame.varp + size) / sys.PtrSize
  1929  				mask[i/sys.PtrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
  1930  			}
  1931  		}
  1932  		return
  1933  	}
  1934  
  1935  	// Otherwise, this is not something the GC knows about,
  1936  	// possibly read-only data, like malloc(0).
  1937  	// It must not have pointers, so returning a nil mask is correct.
  1938  	return
  1939  }
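
        // For illustration only (a sketch, not part of the runtime): through
        // the reflect.gcbits linkname above, a test can inspect a type's
        // pointer bitmap. Assuming 8-byte pointers:
        //
        //	type T struct {
        //		x uintptr
        //		p *int
        //	}
        //	// gcbits(new(T)) == []byte{0, 1}: word 0 scalar, word 1 pointer.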