github.com/zxy12/golang_with_comment@v0.0.0-20190701084843-0e6b2aff5ef3/runtime/mbitmap.go

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Garbage collector: type and heap bitmaps.
     6  //
     7  // Stack, data, and bss bitmaps
     8  //
     9  // Stack frames and global variables in the data and bss sections are described
    10  // by 1-bit bitmaps in which 0 means uninteresting and 1 means live pointer
    11  // to be visited during GC. The bits in each byte are consumed starting with
    12  // the low bit: 1<<0, 1<<1, and so on.
    13  //
    14  // Heap bitmap
    15  //
    16  // The allocated heap comes from a subset of the memory in the range [start, used),
    17  // where start == mheap_.arena_start and used == mheap_.arena_used.
    18  // The heap bitmap comprises 2 bits for each pointer-sized word in that range,
    19  // stored in bytes indexed backward in memory from start.
    20  // That is, the byte at address start-1 holds the 2-bit entries for the four words
    21  // start through start+3*ptrSize, the byte at start-2 holds the entries for
    22  // start+4*ptrSize through start+7*ptrSize, and so on.
    23  //
    24  // In each 2-bit entry, the lower bit holds the same information as in the 1-bit
    25  // bitmaps: 0 means uninteresting and 1 means live pointer to be visited during GC.
    26  // The meaning of the high bit depends on the position of the word being described
    27  // in its allocated object. In all words *except* the second word, the
    28  // high bit indicates that the object is still being described. In
    29  // these words, if a bit pair with a high bit 0 is encountered, the
    30  // low bit can also be assumed to be 0, and the object description is
    31  // over. This 00 is called the ``dead'' encoding: it signals that the
    32  // rest of the words in the object are uninteresting to the garbage
    33  // collector.
    34  //
    35  // In the second word, the high bit is the GC ``checkmarked'' bit (see below).
    36  //
    37  // The 2-bit entries are split when written into the byte, so that the top half
    38  // of the byte contains 4 high bits and the bottom half contains 4 low (pointer)
    39  // bits.
    40  // This form allows a copy from the 1-bit to the 4-bit form to keep the
    41  // pointer bits contiguous, instead of having to space them out.
    42  //
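// For illustration (an editorial example, not part of the original text):
// consider a bitmap byte describing heap words w0..w3 in which w0 and w2
// hold pointers and all four words are still being scanned:
//
//	scan half   pointer half
//	  1111         0101         -> byte 0xF5
//
// The 2-bit entry for word i is then read at shift i, using the bitPointer
// and bitScan constants defined below:
//
//	(0xF5 >> i) & bitPointer    // pointer bit of word i
//	(0xF5 >> i) & bitScan       // scan bit of word i
//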
    43  // The code makes use of the fact that the zero value for a heap bitmap
     44  // has no live pointer bit set and is (depending on position) not used,
    45  // not checkmarked, and is the dead encoding.
    46  // These properties must be preserved when modifying the encoding.
    47  //
    48  // The bitmap for noscan spans is not maintained. Code must ensure
    49  // that an object is scannable before consulting its bitmap by
    50  // checking either the noscan bit in the span or by consulting its
    51  // type's information.
    52  //
    53  // Checkmarks
    54  //
    55  // In a concurrent garbage collector, one worries about failing to mark
    56  // a live object due to mutations without write barriers or bugs in the
    57  // collector implementation. As a sanity check, the GC has a 'checkmark'
    58  // mode that retraverses the object graph with the world stopped, to make
    59  // sure that everything that should be marked is marked.
    60  // In checkmark mode, in the heap bitmap, the high bit of the 2-bit entry
    61  // for the second word of the object holds the checkmark bit.
    62  // When not in checkmark mode, this bit is set to 1.
    63  //
    64  // The smallest possible allocation is 8 bytes. On a 32-bit machine, that
    65  // means every allocated object has two words, so there is room for the
    66  // checkmark bit. On a 64-bit machine, however, the 8-byte allocation is
    67  // just one word, so the second bit pair is not available for encoding the
    68  // checkmark. However, because non-pointer allocations are combined
    69  // into larger 16-byte (maxTinySize) allocations, a plain 8-byte allocation
    70  // must be a pointer, so the type bit in the first word is not actually needed.
    71  // It is still used in general, except in checkmark the type bit is repurposed
    72  // as the checkmark bit and then reinitialized (to 1) as the type bit when
    73  // finished.
    74  //
    75  
    76  package runtime
    77  
    78  import (
    79  	"runtime/internal/atomic"
    80  	"runtime/internal/sys"
    81  	"unsafe"
    82  )
    83  
    84  const (
    85  	bitPointer = 1 << 0
    86  	bitScan    = 1 << 4
    87  
    88  	heapBitsShift   = 1                     // shift offset between successive bitPointer or bitScan entries
    89  	heapBitmapScale = sys.PtrSize * (8 / 2) // number of data bytes described by one heap bitmap byte
    90  
    91  	// all scan/pointer bits in a byte
    92  	bitScanAll    = bitScan | bitScan<<heapBitsShift | bitScan<<(2*heapBitsShift) | bitScan<<(3*heapBitsShift)
    93  	bitPointerAll = bitPointer | bitPointer<<heapBitsShift | bitPointer<<(2*heapBitsShift) | bitPointer<<(3*heapBitsShift)
    94  )
    95  
    96  // addb returns the byte pointer p+n.
    97  //go:nowritebarrier
    98  //go:nosplit
    99  func addb(p *byte, n uintptr) *byte {
   100  	// Note: wrote out full expression instead of calling add(p, n)
   101  	// to reduce the number of temporaries generated by the
   102  	// compiler for this trivial expression during inlining.
   103  	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + n))
   104  }
   105  
   106  // subtractb returns the byte pointer p-n.
   107  // subtractb is typically used when traversing the pointer tables referred to by hbits
   108  // which are arranged in reverse order.
   109  //go:nowritebarrier
   110  //go:nosplit
   111  func subtractb(p *byte, n uintptr) *byte {
   112  	// Note: wrote out full expression instead of calling add(p, -n)
   113  	// to reduce the number of temporaries generated by the
   114  	// compiler for this trivial expression during inlining.
   115  	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - n))
   116  }
   117  
   118  // add1 returns the byte pointer p+1.
   119  //go:nowritebarrier
   120  //go:nosplit
   121  func add1(p *byte) *byte {
   122  	// Note: wrote out full expression instead of calling addb(p, 1)
   123  	// to reduce the number of temporaries generated by the
   124  	// compiler for this trivial expression during inlining.
   125  	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + 1))
   126  }
   127  
   128  // subtract1 returns the byte pointer p-1.
   129  // subtract1 is typically used when traversing the pointer tables referred to by hbits
   130  // which are arranged in reverse order.
   131  //go:nowritebarrier
   132  //
   133  // nosplit because it is used during write barriers and must not be preempted.
   134  //go:nosplit
   135  func subtract1(p *byte) *byte {
   136  	// Note: wrote out full expression instead of calling subtractb(p, 1)
   137  	// to reduce the number of temporaries generated by the
   138  	// compiler for this trivial expression during inlining.
   139  	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - 1))
   140  }
   141  
   142  // mapBits maps any additional bitmap memory needed for the new arena memory.
   143  //
   144  // Don't call this directly. Call mheap.setArenaUsed.
   145  //
   146  //go:nowritebarrier
   147  func (h *mheap) mapBits(arena_used uintptr) {
   148  	// Caller has added extra mappings to the arena.
   149  	// Add extra mappings of bitmap words as needed.
   150  	// We allocate extra bitmap pieces in chunks of bitmapChunk.
   151  	const bitmapChunk = 8192
   152  	//println("mapBits - start:", "arena_used=", arena_used, "mheap_.arena_start=", mheap_.arena_start)
   153  	//println("heapBitmapScale=", heapBitmapScale, "sys.PtrSize=", sys.PtrSize)
   154  	n := (arena_used - mheap_.arena_start) / heapBitmapScale
   155  
   156  	//s := uintptr(1)
   157  	//println("round 1=", round((s), bitmapChunk))
   158  	//println("round 1=", round((s), physPageSize))
   159  	n = round(n, bitmapChunk)
   160  	n = round(n, physPageSize)
   161  	//println("h.bitmap_mapped=", h.bitmap_mapped, "n = ", n)
   162  
    163  	// bitmap_mapped records how much of the bitmap's virtual memory is actually in use
   164  	//                                          n-bitmap_mapped
   165  	//                                            |    |
   166  	// |-------spans-------| ------- bitmap ------|====|====| ----------arena------------|
   167  	//                                        bitmap-n
   168  
   169  	if h.bitmap_mapped >= n {
   170  		return
   171  	}
   172  
   173  	sysMap(unsafe.Pointer(h.bitmap-n), n-h.bitmap_mapped, h.arena_reserved, &memstats.gc_sys)
   174  	h.bitmap_mapped = n
   175  }
   176  
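// For illustration (editorial, with assumed numbers): on a 64-bit system
// heapBitmapScale is 8*(8/2) = 32, so growing the arena by 64MB requires
// 64MB/32 = 2MB of bitmap:
//
//	n := (arena_used - mheap_.arena_start) / heapBitmapScale // 2MB here
//	n = round(n, bitmapChunk)  // already a multiple of 8192
//	n = round(n, physPageSize) // and of any sane page size
//
// which is then mapped (minus what is already mapped) with sysMap.
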
   177  // heapBits provides access to the bitmap bits for a single heap word.
   178  // The methods on heapBits take value receivers so that the compiler
   179  // can more easily inline calls to those methods and registerize the
   180  // struct fields independently.
   181  type heapBits struct {
   182  	bitp  *uint8
   183  	shift uint32
   184  }
   185  
   186  // markBits provides access to the mark bit for an object in the heap.
   187  // bytep points to the byte holding the mark bit.
   188  // mask is a byte with a single bit set that can be &ed with *bytep
   189  // to see if the bit has been set.
    190  // *m.bytep&m.mask != 0 indicates the mark bit is set.
   191  // index can be used along with span information to generate
   192  // the address of the object in the heap.
   193  // We maintain one set of mark bits for allocation and one for
   194  // marking purposes.
   195  type markBits struct {
   196  	bytep *uint8
   197  	mask  uint8
   198  	index uintptr
   199  }
   200  
   201  //go:nosplit
   202  func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits {
   203  	bytep, mask := s.allocBits.bitp(allocBitIndex)
   204  	return markBits{bytep, mask, allocBitIndex}
   205  }
   206  
    207  // refillAllocCache takes 8 bytes of s.allocBits starting at whichByte
    208  // and negates them so that ctz (count trailing zeros) instructions
    209  // can be used. It then places these 8 bytes into the cached 64-bit
   210  // s.allocCache.
   211  func (s *mspan) refillAllocCache(whichByte uintptr) {
   212  	bytes := (*[8]uint8)(unsafe.Pointer(s.allocBits.bytep(whichByte)))
   213  	aCache := uint64(0)
   214  	aCache |= uint64(bytes[0])
   215  	aCache |= uint64(bytes[1]) << (1 * 8)
   216  	aCache |= uint64(bytes[2]) << (2 * 8)
   217  	aCache |= uint64(bytes[3]) << (3 * 8)
   218  	aCache |= uint64(bytes[4]) << (4 * 8)
   219  	aCache |= uint64(bytes[5]) << (5 * 8)
   220  	aCache |= uint64(bytes[6]) << (6 * 8)
   221  	aCache |= uint64(bytes[7]) << (7 * 8)
   222  	s.allocCache = ^aCache
   223  }
   224  
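// For illustration (editorial): if the first byte of s.allocBits is
// 0b00000111 (objects 0..2 allocated, objects 3..7 free), negation makes
// the low byte of s.allocCache 0b11111000, so that
//
//	sys.Ctz64(s.allocCache) // == 3
//
// yields the index of the first free object directly.
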
   225  // nextFreeIndex returns the index of the next free object in s at
   226  // or after s.freeindex.
   227  // There are hardware instructions that can be used to make this
   228  // faster if profiling warrants it.
   229  func (s *mspan) nextFreeIndex() uintptr {
   230  	sfreeindex := s.freeindex
   231  	snelems := s.nelems
   232  	if sfreeindex == snelems {
   233  		return sfreeindex
   234  	}
   235  	if sfreeindex > snelems {
   236  		throw("s.freeindex > s.nelems")
   237  	}
   238  
   239  	aCache := s.allocCache
   240  
   241  	bitIndex := sys.Ctz64(aCache)
   242  	for bitIndex == 64 {
   243  		// Move index to start of next cached bits.
   244  		sfreeindex = (sfreeindex + 64) &^ (64 - 1)
   245  		if sfreeindex >= snelems {
   246  			s.freeindex = snelems
   247  			return snelems
   248  		}
   249  		whichByte := sfreeindex / 8
   250  		// Refill s.allocCache with the next 64 alloc bits.
   251  		s.refillAllocCache(whichByte)
   252  		aCache = s.allocCache
   253  		bitIndex = sys.Ctz64(aCache)
   254  		// nothing available in cached bits
   255  		// grab the next 8 bytes and try again.
   256  	}
   257  	result := sfreeindex + uintptr(bitIndex)
   258  	if result >= snelems {
   259  		s.freeindex = snelems
   260  		return snelems
   261  	}
   262  
   263  	s.allocCache >>= uint(bitIndex + 1)
   264  	sfreeindex = result + 1
   265  
   266  	if sfreeindex%64 == 0 && sfreeindex != snelems {
   267  		// We just incremented s.freeindex so it isn't 0.
   268  		// As each 1 in s.allocCache was encountered and used for allocation
   269  		// it was shifted away. At this point s.allocCache contains all 0s.
   270  		// Refill s.allocCache so that it corresponds
   271  		// to the bits at s.allocBits starting at s.freeindex.
   272  		whichByte := sfreeindex / 8
   273  		s.refillAllocCache(whichByte)
   274  	}
   275  	s.freeindex = sfreeindex
   276  	return result
   277  }
   278  
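// For illustration (editorial trace, continuing the example above): with
// freeindex 0 and allocCache ...11111000, nextFreeIndex returns 3, shifts
// allocCache right by bitIndex+1 = 4, and sets freeindex to 4. Whenever
// freeindex reaches a multiple of 64 the cache is refilled from the next
// 8 bytes of allocBits.
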
   279  // isFree returns whether the index'th object in s is unallocated.
   280  func (s *mspan) isFree(index uintptr) bool {
   281  	if index < s.freeindex {
   282  		return false
   283  	}
   284  	bytep, mask := s.allocBits.bitp(index)
   285  	return *bytep&mask == 0
   286  }
   287  
   288  func (s *mspan) objIndex(p uintptr) uintptr {
   289  	byteOffset := p - s.base()
   290  	if byteOffset == 0 {
   291  		return 0
   292  	}
   293  	if s.baseMask != 0 {
    294  		// s.baseMask is non-0, elemsize is a power of two, so shift by s.divShift
   295  		return byteOffset >> s.divShift
   296  	}
   297  	return uintptr(((uint64(byteOffset) >> s.divShift) * uint64(s.divMul)) >> s.divShift2)
   298  }
   299  
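// For illustration (editorial, with hypothetical magic values; the runtime
// derives the real ones from the span's size class): for elemsize 48 = 16*3
// one can take divShift = 4, divMul = 43691 and divShift2 = 17, since
// 3*43691 == 1<<17 + 1. Then for p at byte offset 144:
//
//	((144 >> 4) * 43691) >> 17 // == (9*43691)>>17 == 3 == 144/48
//
// so the multiply-and-shift reproduces division by elemsize.
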
   300  func markBitsForAddr(p uintptr) markBits {
   301  	s := spanOf(p)
   302  	objIndex := s.objIndex(p)
   303  	return s.markBitsForIndex(objIndex)
   304  }
   305  
   306  func (s *mspan) markBitsForIndex(objIndex uintptr) markBits {
   307  	bytep, mask := s.gcmarkBits.bitp(objIndex)
   308  	return markBits{bytep, mask, objIndex}
   309  }
   310  
   311  func (s *mspan) markBitsForBase() markBits {
   312  	return markBits{(*uint8)(s.gcmarkBits), uint8(1), 0}
   313  }
   314  
   315  // isMarked reports whether mark bit m is set.
   316  func (m markBits) isMarked() bool {
   317  	return *m.bytep&m.mask != 0
   318  }
   319  
   320  // setMarked sets the marked bit in the markbits, atomically. Some compilers
   321  // are not able to inline atomic.Or8 function so if it appears as a hot spot consider
   322  // inlining it manually.
   323  func (m markBits) setMarked() {
   324  	// Might be racing with other updates, so use atomic update always.
   325  	// We used to be clever here and use a non-atomic update in certain
   326  	// cases, but it's not worth the risk.
   327  	atomic.Or8(m.bytep, m.mask)
   328  }
   329  
   330  // setMarkedNonAtomic sets the marked bit in the markbits, non-atomically.
   331  func (m markBits) setMarkedNonAtomic() {
   332  	*m.bytep |= m.mask
   333  }
   334  
   335  // clearMarked clears the marked bit in the markbits, atomically.
   336  func (m markBits) clearMarked() {
   337  	// Might be racing with other updates, so use atomic update always.
   338  	// We used to be clever here and use a non-atomic update in certain
   339  	// cases, but it's not worth the risk.
   340  	atomic.And8(m.bytep, ^m.mask)
   341  }
   342  
   343  // markBitsForSpan returns the markBits for the span base address base.
   344  func markBitsForSpan(base uintptr) (mbits markBits) {
   345  	if base < mheap_.arena_start || base >= mheap_.arena_used {
   346  		throw("markBitsForSpan: base out of range")
   347  	}
   348  	mbits = markBitsForAddr(base)
   349  	if mbits.mask != 1 {
   350  		throw("markBitsForSpan: unaligned start")
   351  	}
   352  	return mbits
   353  }
   354  
   355  // advance advances the markBits to the next object in the span.
   356  func (m *markBits) advance() {
   357  	if m.mask == 1<<7 {
   358  		m.bytep = (*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(m.bytep)) + 1))
   359  		m.mask = 1
   360  	} else {
   361  		m.mask = m.mask << 1
   362  	}
   363  	m.index++
   364  }
   365  
   366  // heapBitsForAddr returns the heapBits for the address addr.
   367  // The caller must have already checked that addr is in the range [mheap_.arena_start, mheap_.arena_used).
   368  //
   369  // nosplit because it is used during write barriers and must not be preempted.
   370  //go:nosplit
   371  func heapBitsForAddr(addr uintptr) heapBits {
    372  	// 2 bits per word, 4 pairs per byte, and a mask is hard coded.
   373  	off := (addr - mheap_.arena_start) / sys.PtrSize
   374  	return heapBits{(*uint8)(unsafe.Pointer(mheap_.bitmap - off/4 - 1)), uint32(off & 3)}
   375  }
   376  
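// For illustration (editorial): the bitmap is indexed backward from
// mheap_.bitmap. With off = (addr - arena_start) / sys.PtrSize:
//
//	off = 7 -> bitp = bitmap - 7/4 - 1 = bitmap - 2, shift = 3
//	off = 8 -> bitp = bitmap - 8/4 - 1 = bitmap - 3, shift = 0
//
// so advancing one word past shift 3 moves to the byte at the next lower
// address, which is exactly what next and forward implement below.
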
   377  // heapBitsForSpan returns the heapBits for the span base address base.
   378  func heapBitsForSpan(base uintptr) (hbits heapBits) {
   379  	if base < mheap_.arena_start || base >= mheap_.arena_used {
   380  		print("runtime: base ", hex(base), " not in range [", hex(mheap_.arena_start), ",", hex(mheap_.arena_used), ")\n")
   381  		throw("heapBitsForSpan: base out of range")
   382  	}
   383  	return heapBitsForAddr(base)
   384  }
   385  
   386  // heapBitsForObject returns the base address for the heap object
   387  // containing the address p, the heapBits for base,
    388  // the object's span, and the index of the object in s.
    389  // If p does not point into a heap object,
    390  // heapBitsForObject returns base == 0;
    391  // otherwise it returns the base address of the object.
   392  //
   393  // refBase and refOff optionally give the base address of the object
   394  // in which the pointer p was found and the byte offset at which it
   395  // was found. These are used for error reporting.
   396  func heapBitsForObject(p, refBase, refOff uintptr) (base uintptr, hbits heapBits, s *mspan, objIndex uintptr) {
   397  	arenaStart := mheap_.arena_start
   398  	if p < arenaStart || p >= mheap_.arena_used {
   399  		return
   400  	}
   401  	off := p - arenaStart
   402  	idx := off >> _PageShift
   403  	// p points into the heap, but possibly to the middle of an object.
   404  	// Consult the span table to find the block beginning.
   405  	s = mheap_.spans[idx]
   406  	if s == nil || p < s.base() || p >= s.limit || s.state != mSpanInUse {
   407  		if s == nil || s.state == _MSpanManual {
   408  			// If s is nil, the virtual address has never been part of the heap.
   409  			// This pointer may be to some mmap'd region, so we allow it.
   410  			// Pointers into stacks are also ok, the runtime manages these explicitly.
   411  			return
   412  		}
   413  
   414  		// The following ensures that we are rigorous about what data
   415  		// structures hold valid pointers.
   416  		if debug.invalidptr != 0 {
   417  			// Typically this indicates an incorrect use
   418  			// of unsafe or cgo to store a bad pointer in
   419  			// the Go heap. It may also indicate a runtime
   420  			// bug.
   421  			//
   422  			// TODO(austin): We could be more aggressive
   423  			// and detect pointers to unallocated objects
   424  			// in allocated spans.
   425  			printlock()
   426  			print("runtime: pointer ", hex(p))
   427  			if s.state != mSpanInUse {
   428  				print(" to unallocated span")
   429  			} else {
   430  				print(" to unused region of span")
   431  			}
   432  			print(" idx=", hex(idx), " span.base()=", hex(s.base()), " span.limit=", hex(s.limit), " span.state=", s.state, "\n")
   433  			if refBase != 0 {
   434  				print("runtime: found in object at *(", hex(refBase), "+", hex(refOff), ")\n")
   435  				gcDumpObject("object", refBase, refOff)
   436  			}
   437  			getg().m.traceback = 2
   438  			throw("found bad pointer in Go heap (incorrect use of unsafe or cgo?)")
   439  		}
   440  		return
   441  	}
   442  	// If this span holds object of a power of 2 size, just mask off the bits to
   443  	// the interior of the object. Otherwise use the size to get the base.
   444  	if s.baseMask != 0 {
   445  		// optimize for power of 2 sized objects.
   446  		base = s.base()
   447  		base = base + (p-base)&uintptr(s.baseMask)
   448  		objIndex = (base - s.base()) >> s.divShift
   449  		// base = p & s.baseMask is faster for small spans,
   450  		// but doesn't work for large spans.
   451  		// Overall, it's faster to use the more general computation above.
   452  	} else {
   453  		base = s.base()
   454  		if p-base >= s.elemsize {
   455  			// n := (p - base) / s.elemsize, using division by multiplication
   456  			objIndex = uintptr(p-base) >> s.divShift * uintptr(s.divMul) >> s.divShift2
   457  			base += objIndex * s.elemsize
   458  		}
   459  	}
   460  	// Now that we know the actual base, compute heapBits to return to caller.
   461  	hbits = heapBitsForAddr(base)
   462  	return
   463  }
   464  
   465  // prefetch the bits.
   466  func (h heapBits) prefetch() {
   467  	prefetchnta(uintptr(unsafe.Pointer((h.bitp))))
   468  }
   469  
   470  // next returns the heapBits describing the next pointer-sized word in memory.
   471  // That is, if h describes address p, h.next() describes p+ptrSize.
   472  // Note that next does not modify h. The caller must record the result.
   473  //
   474  // nosplit because it is used during write barriers and must not be preempted.
   475  //go:nosplit
   476  func (h heapBits) next() heapBits {
   477  	if h.shift < 3*heapBitsShift {
   478  		return heapBits{h.bitp, h.shift + heapBitsShift}
   479  	}
   480  	return heapBits{subtract1(h.bitp), 0}
   481  }
   482  
    483  // forward returns the heapBits describing n pointer-sized words ahead of h in memory.
    484  // That is, if h describes address p, h.forward(n) describes p+n*ptrSize.
    485  // h.forward(1) is equivalent to h.next(), just slower.
    486  // Note that forward does not modify h. The caller must record the result.
    487  func (h heapBits) forward(n uintptr) heapBits {
    488  	n += uintptr(h.shift) / heapBitsShift
    489  	return heapBits{subtractb(h.bitp, n/4), uint32(n%4) * heapBitsShift}
    490  }
    491  
    492  // bits returns the heap bits for the current word.
    493  // The caller can test morePointers and isPointer by &-ing with bitScan and bitPointer.
    494  // The result includes in its higher bits the bits for subsequent words
    495  // described by the same bitmap byte.
    496  func (h heapBits) bits() uint32 {
   497  	// The (shift & 31) eliminates a test and conditional branch
   498  	// from the generated code.
   499  	return uint32(*h.bitp) >> (h.shift & 31)
   500  }
   501  
    502  // morePointers reports whether this word or any later word in the object
    503  // may still hold a pointer; if not, the rest of the object is scalars.
    504  // h must not describe the second word of the object.
   505  func (h heapBits) morePointers() bool {
   506  	return h.bits()&bitScan != 0
   507  }
   508  
   509  // isPointer reports whether the heap bits describe a pointer word.
   510  //
   511  // nosplit because it is used during write barriers and must not be preempted.
   512  //go:nosplit
   513  func (h heapBits) isPointer() bool {
   514  	return h.bits()&bitPointer != 0
   515  }
   516  
   517  // isCheckmarked reports whether the heap bits have the checkmarked bit set.
   518  // It must be told how large the object at h is, because the encoding of the
   519  // checkmark bit varies by size.
   520  // h must describe the initial word of the object.
   521  func (h heapBits) isCheckmarked(size uintptr) bool {
   522  	if size == sys.PtrSize {
   523  		return (*h.bitp>>h.shift)&bitPointer != 0
   524  	}
   525  	// All multiword objects are 2-word aligned,
   526  	// so we know that the initial word's 2-bit pair
   527  	// and the second word's 2-bit pair are in the
   528  	// same heap bitmap byte, *h.bitp.
   529  	return (*h.bitp>>(heapBitsShift+h.shift))&bitScan != 0
   530  }
   531  
   532  // setCheckmarked sets the checkmarked bit.
   533  // It must be told how large the object at h is, because the encoding of the
   534  // checkmark bit varies by size.
   535  // h must describe the initial word of the object.
   536  func (h heapBits) setCheckmarked(size uintptr) {
   537  	if size == sys.PtrSize {
   538  		atomic.Or8(h.bitp, bitPointer<<h.shift)
   539  		return
   540  	}
   541  	atomic.Or8(h.bitp, bitScan<<(heapBitsShift+h.shift))
   542  }
   543  
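// For illustration (editorial): both methods above rely on multiword
// objects being 2-word aligned, so if the first word has shift s, the
// second word's entry lives in the same bitmap byte and its scan bit,
// the checkmark, is
//
//	bitScan << (heapBitsShift + s)
//
// A one-word object instead reuses its pointer bit, bitPointer << s, as
// the checkmark, as described in the file comment.
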
   544  // bulkBarrierPreWrite executes writebarrierptr_prewrite1
   545  // for every pointer slot in the memory range [src, src+size),
   546  // using pointer/scalar information from [dst, dst+size).
   547  // This executes the write barriers necessary before a memmove.
   548  // src, dst, and size must be pointer-aligned.
   549  // The range [dst, dst+size) must lie within a single object.
   550  //
   551  // As a special case, src == 0 indicates that this is being used for a
   552  // memclr. bulkBarrierPreWrite will pass 0 for the src of each write
   553  // barrier.
   554  //
   555  // Callers should call bulkBarrierPreWrite immediately before
   556  // calling memmove(dst, src, size). This function is marked nosplit
   557  // to avoid being preempted; the GC must not stop the goroutine
   558  // between the memmove and the execution of the barriers.
   559  // The caller is also responsible for cgo pointer checks if this
   560  // may be writing Go pointers into non-Go memory.
   561  //
   562  // The pointer bitmap is not maintained for allocations containing
   563  // no pointers at all; any caller of bulkBarrierPreWrite must first
   564  // make sure the underlying allocation contains pointers, usually
   565  // by checking typ.kind&kindNoPointers.
   566  //
   567  //go:nosplit
   568  func bulkBarrierPreWrite(dst, src, size uintptr) {
   569  	if (dst|src|size)&(sys.PtrSize-1) != 0 {
   570  		throw("bulkBarrierPreWrite: unaligned arguments")
   571  	}
   572  	if !writeBarrier.needed {
   573  		return
   574  	}
   575  	if !inheap(dst) {
   576  		gp := getg().m.curg
   577  		if gp != nil && gp.stack.lo <= dst && dst < gp.stack.hi {
   578  			// Destination is our own stack. No need for barriers.
   579  			return
   580  		}
   581  
   582  		// If dst is a global, use the data or BSS bitmaps to
   583  		// execute write barriers.
   584  		for _, datap := range activeModules() {
   585  			if datap.data <= dst && dst < datap.edata {
   586  				bulkBarrierBitmap(dst, src, size, dst-datap.data, datap.gcdatamask.bytedata)
   587  				return
   588  			}
   589  		}
   590  		for _, datap := range activeModules() {
   591  			if datap.bss <= dst && dst < datap.ebss {
   592  				bulkBarrierBitmap(dst, src, size, dst-datap.bss, datap.gcbssmask.bytedata)
   593  				return
   594  			}
   595  		}
   596  		return
   597  	}
   598  
   599  	h := heapBitsForAddr(dst)
   600  	if src == 0 {
   601  		for i := uintptr(0); i < size; i += sys.PtrSize {
   602  			if h.isPointer() {
   603  				dstx := (*uintptr)(unsafe.Pointer(dst + i))
   604  				writebarrierptr_prewrite1(dstx, 0)
   605  			}
   606  			h = h.next()
   607  		}
   608  	} else {
   609  		for i := uintptr(0); i < size; i += sys.PtrSize {
   610  			if h.isPointer() {
   611  				dstx := (*uintptr)(unsafe.Pointer(dst + i))
   612  				srcx := (*uintptr)(unsafe.Pointer(src + i))
   613  				writebarrierptr_prewrite1(dstx, *srcx)
   614  			}
   615  			h = h.next()
   616  		}
   617  	}
   618  }
   619  
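// For illustration (editorial): this is the pattern typedmemmove uses,
// performing the pointer check the comment above requires:
//
//	if typ.kind&kindNoPointers == 0 {
//		bulkBarrierPreWrite(uintptr(dst), uintptr(src), typ.size)
//	}
//	memmove(dst, src, typ.size)
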
   620  // bulkBarrierBitmap executes write barriers for copying from [src,
   621  // src+size) to [dst, dst+size) using a 1-bit pointer bitmap. src is
   622  // assumed to start maskOffset bytes into the data covered by the
   623  // bitmap in bits (which may not be a multiple of 8).
   624  //
   625  // This is used by bulkBarrierPreWrite for writes to data and BSS.
   626  //
   627  //go:nosplit
   628  func bulkBarrierBitmap(dst, src, size, maskOffset uintptr, bits *uint8) {
   629  	word := maskOffset / sys.PtrSize
   630  	bits = addb(bits, word/8)
   631  	mask := uint8(1) << (word % 8)
   632  
   633  	for i := uintptr(0); i < size; i += sys.PtrSize {
   634  		if mask == 0 {
   635  			bits = addb(bits, 1)
   636  			if *bits == 0 {
   637  				// Skip 8 words.
   638  				i += 7 * sys.PtrSize
   639  				continue
   640  			}
   641  			mask = 1
   642  		}
   643  		if *bits&mask != 0 {
   644  			dstx := (*uintptr)(unsafe.Pointer(dst + i))
   645  			if src == 0 {
   646  				writebarrierptr_prewrite1(dstx, 0)
   647  			} else {
   648  				srcx := (*uintptr)(unsafe.Pointer(src + i))
   649  				writebarrierptr_prewrite1(dstx, *srcx)
   650  			}
   651  		}
   652  		mask <<= 1
   653  	}
   654  }
   655  
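// For illustration (editorial): with maskOffset = 72 on a 64-bit system,
// word = 72/8 = 9, so the walk starts at bit 9%8 = 1 of bitmap byte
// 9/8 = 1:
//
//	bits = addb(bits, 1)  // word/8
//	mask = uint8(1) << 1  // word%8
//
// and an all-zero bitmap byte lets the loop skip 8 words at once.
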
   656  // typeBitsBulkBarrier executes writebarrierptr_prewrite for every
   657  // pointer that would be copied from [src, src+size) to [dst,
   658  // dst+size) by a memmove using the type bitmap to locate those
   659  // pointer slots.
   660  //
   661  // The type typ must correspond exactly to [src, src+size) and [dst, dst+size).
   662  // dst, src, and size must be pointer-aligned.
   663  // The type typ must have a plain bitmap, not a GC program.
   664  // The only use of this function is in channel sends, and the
   665  // 64 kB channel element limit takes care of this for us.
   666  //
   667  // Must not be preempted because it typically runs right before memmove,
   668  // and the GC must observe them as an atomic action.
   669  //
   670  //go:nosplit
   671  func typeBitsBulkBarrier(typ *_type, dst, src, size uintptr) {
   672  	if typ == nil {
   673  		throw("runtime: typeBitsBulkBarrier without type")
   674  	}
   675  	if typ.size != size {
   676  		println("runtime: typeBitsBulkBarrier with type ", typ.string(), " of size ", typ.size, " but memory size", size)
   677  		throw("runtime: invalid typeBitsBulkBarrier")
   678  	}
   679  	if typ.kind&kindGCProg != 0 {
   680  		println("runtime: typeBitsBulkBarrier with type ", typ.string(), " with GC prog")
   681  		throw("runtime: invalid typeBitsBulkBarrier")
   682  	}
   683  	if !writeBarrier.needed {
   684  		return
   685  	}
   686  	ptrmask := typ.gcdata
   687  	var bits uint32
   688  	for i := uintptr(0); i < typ.ptrdata; i += sys.PtrSize {
   689  		if i&(sys.PtrSize*8-1) == 0 {
   690  			bits = uint32(*ptrmask)
   691  			ptrmask = addb(ptrmask, 1)
   692  		} else {
   693  			bits = bits >> 1
   694  		}
   695  		if bits&1 != 0 {
   696  			dstx := (*uintptr)(unsafe.Pointer(dst + i))
   697  			srcx := (*uintptr)(unsafe.Pointer(src + i))
   698  			writebarrierptr_prewrite(dstx, *srcx)
   699  		}
   700  	}
   701  }
   702  
   703  // The methods operating on spans all require that h has been returned
   704  // by heapBitsForSpan and that size, n, total are the span layout description
   705  // returned by the mspan's layout method.
   706  // If total > size*n, it means that there is extra leftover memory in the span,
   707  // usually due to rounding.
   708  //
   709  // TODO(rsc): Perhaps introduce a different heapBitsSpan type.
   710  
   711  // initSpan initializes the heap bitmap for a span.
   712  // It clears all checkmark bits.
   713  // If this is a span of pointer-sized objects, it initializes all
   714  // words to pointer/scan.
   715  // Otherwise, it initializes all words to scalar/dead.
   716  func (h heapBits) initSpan(s *mspan) {
   717  	size, n, total := s.layout()
   718  
   719  	// Init the markbit structures
   720  	s.freeindex = 0
   721  	s.allocCache = ^uint64(0) // all 1s indicating all free.
   722  	s.nelems = n
   723  	s.allocBits = nil
   724  	s.gcmarkBits = nil
   725  	s.gcmarkBits = newMarkBits(s.nelems)
   726  	s.allocBits = newAllocBits(s.nelems)
   727  
   728  	// Clear bits corresponding to objects.
   729  	if total%heapBitmapScale != 0 {
   730  		throw("initSpan: unaligned length")
   731  	}
   732  	nbyte := total / heapBitmapScale
   733  	if sys.PtrSize == 8 && size == sys.PtrSize {
   734  		end := h.bitp
   735  		bitp := subtractb(end, nbyte-1)
   736  		for {
   737  			*bitp = bitPointerAll | bitScanAll
   738  			if bitp == end {
   739  				break
   740  			}
   741  			bitp = add1(bitp)
   742  		}
   743  		return
   744  	}
   745  	memclrNoHeapPointers(unsafe.Pointer(subtractb(h.bitp, nbyte-1)), nbyte)
   746  }
   747  
   748  // initCheckmarkSpan initializes a span for being checkmarked.
   749  // It clears the checkmark bits, which are set to 1 in normal operation.
   750  func (h heapBits) initCheckmarkSpan(size, n, total uintptr) {
   751  	// The ptrSize == 8 is a compile-time constant false on 32-bit and eliminates this code entirely.
   752  	if sys.PtrSize == 8 && size == sys.PtrSize {
   753  		// Checkmark bit is type bit, bottom bit of every 2-bit entry.
   754  		// Only possible on 64-bit system, since minimum size is 8.
   755  		// Must clear type bit (checkmark bit) of every word.
   756  		// The type bit is the lower of every two-bit pair.
   757  		bitp := h.bitp
   758  		for i := uintptr(0); i < n; i += 4 {
   759  			*bitp &^= bitPointerAll
   760  			bitp = subtract1(bitp)
   761  		}
   762  		return
   763  	}
   764  	for i := uintptr(0); i < n; i++ {
   765  		*h.bitp &^= bitScan << (heapBitsShift + h.shift)
   766  		h = h.forward(size / sys.PtrSize)
   767  	}
   768  }
   769  
   770  // clearCheckmarkSpan undoes all the checkmarking in a span.
   771  // The actual checkmark bits are ignored, so the only work to do
   772  // is to fix the pointer bits. (Pointer bits are ignored by scanobject
   773  // but consulted by typedmemmove.)
   774  func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) {
   775  	// The ptrSize == 8 is a compile-time constant false on 32-bit and eliminates this code entirely.
   776  	if sys.PtrSize == 8 && size == sys.PtrSize {
   777  		// Checkmark bit is type bit, bottom bit of every 2-bit entry.
   778  		// Only possible on 64-bit system, since minimum size is 8.
    779  		// Must restore (set) the type bit of every word, which checkmarking cleared.
   780  		// The type bit is the lower of every two-bit pair.
   781  		bitp := h.bitp
   782  		for i := uintptr(0); i < n; i += 4 {
   783  			*bitp |= bitPointerAll
   784  			bitp = subtract1(bitp)
   785  		}
   786  	}
   787  }
   788  
   789  // oneBitCount is indexed by byte and produces the
    790  // number of 1 bits in that byte. For example, 128 has 1 bit set
    791  // and oneBitCount[128] holds 1.
   792  var oneBitCount = [256]uint8{
   793  	0, 1, 1, 2, 1, 2, 2, 3,
   794  	1, 2, 2, 3, 2, 3, 3, 4,
   795  	1, 2, 2, 3, 2, 3, 3, 4,
   796  	2, 3, 3, 4, 3, 4, 4, 5,
   797  	1, 2, 2, 3, 2, 3, 3, 4,
   798  	2, 3, 3, 4, 3, 4, 4, 5,
   799  	2, 3, 3, 4, 3, 4, 4, 5,
   800  	3, 4, 4, 5, 4, 5, 5, 6,
   801  	1, 2, 2, 3, 2, 3, 3, 4,
   802  	2, 3, 3, 4, 3, 4, 4, 5,
   803  	2, 3, 3, 4, 3, 4, 4, 5,
   804  	3, 4, 4, 5, 4, 5, 5, 6,
   805  	2, 3, 3, 4, 3, 4, 4, 5,
   806  	3, 4, 4, 5, 4, 5, 5, 6,
   807  	3, 4, 4, 5, 4, 5, 5, 6,
   808  	4, 5, 5, 6, 5, 6, 6, 7,
   809  	1, 2, 2, 3, 2, 3, 3, 4,
   810  	2, 3, 3, 4, 3, 4, 4, 5,
   811  	2, 3, 3, 4, 3, 4, 4, 5,
   812  	3, 4, 4, 5, 4, 5, 5, 6,
   813  	2, 3, 3, 4, 3, 4, 4, 5,
   814  	3, 4, 4, 5, 4, 5, 5, 6,
   815  	3, 4, 4, 5, 4, 5, 5, 6,
   816  	4, 5, 5, 6, 5, 6, 6, 7,
   817  	2, 3, 3, 4, 3, 4, 4, 5,
   818  	3, 4, 4, 5, 4, 5, 5, 6,
   819  	3, 4, 4, 5, 4, 5, 5, 6,
   820  	4, 5, 5, 6, 5, 6, 6, 7,
   821  	3, 4, 4, 5, 4, 5, 5, 6,
   822  	4, 5, 5, 6, 5, 6, 6, 7,
   823  	4, 5, 5, 6, 5, 6, 6, 7,
   824  	5, 6, 6, 7, 6, 7, 7, 8}
   825  
   826  // countAlloc returns the number of objects allocated in span s by
   827  // scanning the allocation bitmap.
    828  // TODO(rlh): Use popcount intrinsic.
   829  func (s *mspan) countAlloc() int {
   830  	count := 0
   831  	maxIndex := s.nelems / 8
   832  	for i := uintptr(0); i < maxIndex; i++ {
   833  		mrkBits := *s.gcmarkBits.bytep(i)
   834  		count += int(oneBitCount[mrkBits])
   835  	}
   836  	if bitsInLastByte := s.nelems % 8; bitsInLastByte != 0 {
   837  		mrkBits := *s.gcmarkBits.bytep(maxIndex)
   838  		mask := uint8((1 << bitsInLastByte) - 1)
   839  		bits := mrkBits & mask
   840  		count += int(oneBitCount[bits])
   841  	}
   842  	return count
   843  }
   844  
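// Outside the runtime, the table lookup above could use the popcount
// intrinsic the TODO mentions; an editorial sketch in ordinary Go:
//
//	import "math/bits"
//
//	count += bits.OnesCount8(mrkBits)
//
// The table is used here to avoid such a dependency in the runtime.
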
   845  // heapBitsSetType records that the new allocation [x, x+size)
   846  // holds in [x, x+dataSize) one or more values of type typ.
   847  // (The number of values is given by dataSize / typ.size.)
   848  // If dataSize < size, the fragment [x+dataSize, x+size) is
   849  // recorded as non-pointer data.
   850  // It is known that the type has pointers somewhere;
   851  // malloc does not call heapBitsSetType when there are no pointers,
   852  // because all free objects are marked as noscan during
   853  // heapBitsSweepSpan.
   854  //
   855  // There can only be one allocation from a given span active at a time,
   856  // and the bitmap for a span always falls on byte boundaries,
   857  // so there are no write-write races for access to the heap bitmap.
   858  // Hence, heapBitsSetType can access the bitmap without atomics.
   859  //
   860  // There can be read-write races between heapBitsSetType and things
   861  // that read the heap bitmap like scanobject. However, since
   862  // heapBitsSetType is only used for objects that have not yet been
   863  // made reachable, readers will ignore bits being modified by this
   864  // function. This does mean this function cannot transiently modify
   865  // bits that belong to neighboring objects. Also, on weakly-ordered
   866  // machines, callers must execute a store/store (publication) barrier
   867  // between calling this function and making the object reachable.
   868  func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
   869  	const doubleCheck = false // slow but helpful; enable to test modifications to this code
   870  
   871  	// dataSize is always size rounded up to the next malloc size class,
   872  	// except in the case of allocating a defer block, in which case
   873  	// size is sizeof(_defer{}) (at least 6 words) and dataSize may be
   874  	// arbitrarily larger.
   875  	//
   876  	// The checks for size == sys.PtrSize and size == 2*sys.PtrSize can therefore
   877  	// assume that dataSize == size without checking it explicitly.
   878  
   879  	if sys.PtrSize == 8 && size == sys.PtrSize {
   880  		// It's one word and it has pointers, it must be a pointer.
   881  		// Since all allocated one-word objects are pointers
   882  		// (non-pointers are aggregated into tinySize allocations),
   883  		// initSpan sets the pointer bits for us. Nothing to do here.
   884  		if doubleCheck {
   885  			h := heapBitsForAddr(x)
   886  			if !h.isPointer() {
   887  				throw("heapBitsSetType: pointer bit missing")
   888  			}
   889  			if !h.morePointers() {
   890  				throw("heapBitsSetType: scan bit missing")
   891  			}
   892  		}
   893  		return
   894  	}
   895  
   896  	h := heapBitsForAddr(x)
   897  	ptrmask := typ.gcdata // start of 1-bit pointer mask (or GC program, handled below)
   898  
    899  	// Heap bitmap bits for a 2-word object are only 4 bits,
    900  	// so the bitmap byte is shared with objects next to it.
   901  	// This is called out as a special case primarily for 32-bit systems,
   902  	// so that on 32-bit systems the code below can assume all objects
   903  	// are 4-word aligned (because they're all 16-byte aligned).
   904  	if size == 2*sys.PtrSize {
   905  		if typ.size == sys.PtrSize {
   906  			// We're allocating a block big enough to hold two pointers.
   907  			// On 64-bit, that means the actual object must be two pointers,
   908  			// or else we'd have used the one-pointer-sized block.
   909  			// On 32-bit, however, this is the 8-byte block, the smallest one.
   910  			// So it could be that we're allocating one pointer and this was
   911  			// just the smallest block available. Distinguish by checking dataSize.
   912  			// (In general the number of instances of typ being allocated is
   913  			// dataSize/typ.size.)
   914  			if sys.PtrSize == 4 && dataSize == sys.PtrSize {
   915  				// 1 pointer object. On 32-bit machines clear the bit for the
   916  				// unused second word.
   917  				*h.bitp &^= (bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << h.shift
   918  				*h.bitp |= (bitPointer | bitScan) << h.shift
   919  			} else {
   920  				// 2-element slice of pointer.
   921  				*h.bitp |= (bitPointer | bitScan | bitPointer<<heapBitsShift) << h.shift
   922  			}
   923  			return
   924  		}
   925  		// Otherwise typ.size must be 2*sys.PtrSize,
   926  		// and typ.kind&kindGCProg == 0.
   927  		if doubleCheck {
   928  			if typ.size != 2*sys.PtrSize || typ.kind&kindGCProg != 0 {
   929  				print("runtime: heapBitsSetType size=", size, " but typ.size=", typ.size, " gcprog=", typ.kind&kindGCProg != 0, "\n")
   930  				throw("heapBitsSetType")
   931  			}
   932  		}
   933  		b := uint32(*ptrmask)
   934  		hb := (b & 3) | bitScan
   935  		// bitPointer == 1, bitScan is 1 << 4, heapBitsShift is 1.
   936  		// 110011 is shifted h.shift and complemented.
   937  		// This clears out the bits that are about to be
   938  		// ored into *h.hbitp in the next instructions.
   939  		*h.bitp &^= (bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << h.shift
   940  		*h.bitp |= uint8(hb << h.shift)
   941  		return
   942  	}
   943  
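	// For illustration (editorial): in the branches above, a block holding
	// two pointer words (e.g. a [2]*T) ends up with
	//
	//	hb = (3 & 3) | bitScan // == 0x13
	//
	// pointer bits for both words plus the scan bit for the first word;
	// the second word's scan bit is left alone because it is the checkmark.
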
   944  	// Copy from 1-bit ptrmask into 2-bit bitmap.
   945  	// The basic approach is to use a single uintptr as a bit buffer,
   946  	// alternating between reloading the buffer and writing bitmap bytes.
   947  	// In general, one load can supply two bitmap byte writes.
   948  	// This is a lot of lines of code, but it compiles into relatively few
   949  	// machine instructions.
   950  
   951  	var (
   952  		// Ptrmask input.
   953  		p     *byte   // last ptrmask byte read
   954  		b     uintptr // ptrmask bits already loaded
   955  		nb    uintptr // number of bits in b at next read
   956  		endp  *byte   // final ptrmask byte to read (then repeat)
   957  		endnb uintptr // number of valid bits in *endp
   958  		pbits uintptr // alternate source of bits
   959  
   960  		// Heap bitmap output.
   961  		w     uintptr // words processed
   962  		nw    uintptr // number of words to process
   963  		hbitp *byte   // next heap bitmap byte to write
   964  		hb    uintptr // bits being prepared for *hbitp
   965  	)
   966  
   967  	hbitp = h.bitp
   968  
   969  	// Handle GC program. Delayed until this part of the code
   970  	// so that we can use the same double-checking mechanism
   971  	// as the 1-bit case. Nothing above could have encountered
   972  	// GC programs: the cases were all too small.
   973  	if typ.kind&kindGCProg != 0 {
   974  		heapBitsSetTypeGCProg(h, typ.ptrdata, typ.size, dataSize, size, addb(typ.gcdata, 4))
   975  		if doubleCheck {
   976  			// Double-check the heap bits written by GC program
   977  			// by running the GC program to create a 1-bit pointer mask
   978  			// and then jumping to the double-check code below.
   979  			// This doesn't catch bugs shared between the 1-bit and 4-bit
   980  			// GC program execution, but it does catch mistakes specific
   981  			// to just one of those and bugs in heapBitsSetTypeGCProg's
   982  			// implementation of arrays.
   983  			lock(&debugPtrmask.lock)
   984  			if debugPtrmask.data == nil {
   985  				debugPtrmask.data = (*byte)(persistentalloc(1<<20, 1, &memstats.other_sys))
   986  			}
   987  			ptrmask = debugPtrmask.data
   988  			runGCProg(addb(typ.gcdata, 4), nil, ptrmask, 1)
   989  			goto Phase4
   990  		}
   991  		return
   992  	}
   993  
   994  	// Note about sizes:
   995  	//
    996  	// typ.size is the number of bytes in the object,
    997  	// and typ.ptrdata is the number of bytes in the prefix
    998  	// of the object that contains pointers. That is, the final
    999  	// typ.size - typ.ptrdata bytes contain no pointers.
  1000  	// This allows optimization of a common pattern where
  1001  	// an object has a small header followed by a large scalar
  1002  	// buffer. If we know the pointers are over, we don't have
  1003  	// to scan the buffer's heap bitmap at all.
  1004  	// The 1-bit ptrmasks are sized to contain only bits for
  1005  	// the typ.ptrdata prefix, zero padded out to a full byte
  1006  	// of bitmap. This code sets nw (below) so that heap bitmap
  1007  	// bits are only written for the typ.ptrdata prefix; if there is
  1008  	// more room in the allocated object, the next heap bitmap
  1009  	// entry is a 00, indicating that there are no more pointers
  1010  	// to scan. So only the ptrmask for the ptrdata bytes is needed.
  1011  	//
  1012  	// Replicated copies are not as nice: if there is an array of
  1013  	// objects with scalar tails, all but the last tail does have to
  1014  	// be initialized, because there is no way to say "skip forward".
  1015  	// However, because of the possibility of a repeated type with
  1016  	// size not a multiple of 4 pointers (one heap bitmap byte),
  1017  	// the code already must handle the last ptrmask byte specially
  1018  	// by treating it as containing only the bits for endnb pointers,
  1019  	// where endnb <= 4. We represent large scalar tails that must
  1020  	// be expanded in the replication by setting endnb larger than 4.
  1021  	// This will have the effect of reading many bits out of b,
  1022  	// but once the real bits are shifted out, b will supply as many
  1023  	// zero bits as we try to read, which is exactly what we need.
  1024  
  1025  	p = ptrmask
  1026  	if typ.size < dataSize {
  1027  		// Filling in bits for an array of typ.
  1028  		// Set up for repetition of ptrmask during main loop.
   1029  		// Note that ptrmask describes only a prefix of the object (the typ.ptrdata bytes that can contain pointers).
  1030  		const maxBits = sys.PtrSize*8 - 7
  1031  		if typ.ptrdata/sys.PtrSize <= maxBits {
  1032  			// Entire ptrmask fits in uintptr with room for a byte fragment.
  1033  			// Load into pbits and never read from ptrmask again.
  1034  			// This is especially important when the ptrmask has
  1035  			// fewer than 8 bits in it; otherwise the reload in the middle
  1036  			// of the Phase 2 loop would itself need to loop to gather
  1037  			// at least 8 bits.
  1038  
  1039  			// Accumulate ptrmask into b.
  1040  			// ptrmask is sized to describe only typ.ptrdata, but we record
  1041  			// it as describing typ.size bytes, since all the high bits are zero.
  1042  			nb = typ.ptrdata / sys.PtrSize
  1043  			for i := uintptr(0); i < nb; i += 8 {
  1044  				b |= uintptr(*p) << i
  1045  				p = add1(p)
  1046  			}
  1047  			nb = typ.size / sys.PtrSize
  1048  
  1049  			// Replicate ptrmask to fill entire pbits uintptr.
  1050  			// Doubling and truncating is fewer steps than
  1051  			// iterating by nb each time. (nb could be 1.)
  1052  			// Since we loaded typ.ptrdata/sys.PtrSize bits
  1053  			// but are pretending to have typ.size/sys.PtrSize,
  1054  			// there might be no replication necessary/possible.
  1055  			pbits = b
  1056  			endnb = nb
  1057  			if nb+nb <= maxBits {
  1058  				for endnb <= sys.PtrSize*8 {
  1059  					pbits |= pbits << endnb
  1060  					endnb += endnb
  1061  				}
  1062  				// Truncate to a multiple of original ptrmask.
  1063  				// Because nb+nb <= maxBits, nb fits in a byte.
  1064  				// Byte division is cheaper than uintptr division.
  1065  				endnb = uintptr(maxBits/byte(nb)) * nb
  1066  				pbits &= 1<<endnb - 1
  1067  				b = pbits
  1068  				nb = endnb
  1069  			}
  1070  
  1071  			// Clear p and endp as sentinel for using pbits.
  1072  			// Checked during Phase 2 loop.
  1073  			p = nil
  1074  			endp = nil
  1075  		} else {
  1076  			// Ptrmask is larger. Read it multiple times.
  1077  			n := (typ.ptrdata/sys.PtrSize+7)/8 - 1
  1078  			endp = addb(ptrmask, n)
  1079  			endnb = typ.size/sys.PtrSize - n*8
  1080  		}
  1081  	}
  1082  	if p != nil {
  1083  		b = uintptr(*p)
  1084  		p = add1(p)
  1085  		nb = 8
  1086  	}
  1087  
  1088  	if typ.size == dataSize {
  1089  		// Single entry: can stop once we reach the non-pointer data.
  1090  		nw = typ.ptrdata / sys.PtrSize
  1091  	} else {
  1092  		// Repeated instances of typ in an array.
  1093  		// Have to process first N-1 entries in full, but can stop
  1094  		// once we reach the non-pointer data in the final entry.
  1095  		nw = ((dataSize/typ.size-1)*typ.size + typ.ptrdata) / sys.PtrSize
  1096  	}
  1097  	if nw == 0 {
  1098  		// No pointers! Caller was supposed to check.
  1099  		println("runtime: invalid type ", typ.string())
  1100  		throw("heapBitsSetType: called with non-pointer type")
  1101  		return
  1102  	}
  1103  	if nw < 2 {
  1104  		// Must write at least 2 words, because the "no scan"
  1105  		// encoding doesn't take effect until the third word.
  1106  		nw = 2
  1107  	}
  1108  
  1109  	// Phase 1: Special case for leading byte (shift==0) or half-byte (shift==4).
  1110  	// The leading byte is special because it contains the bits for word 1,
  1111  	// which does not have the scan bit set.
  1112  	// The leading half-byte is special because it's a half a byte,
  1113  	// so we have to be careful with the bits already there.
  1114  	switch {
  1115  	default:
  1116  		throw("heapBitsSetType: unexpected shift")
  1117  
  1118  	case h.shift == 0:
  1119  		// Ptrmask and heap bitmap are aligned.
  1120  		// Handle first byte of bitmap specially.
  1121  		//
  1122  		// The first byte we write out covers the first four
  1123  		// words of the object. The scan/dead bit on the first
  1124  		// word must be set to scan since there are pointers
  1125  		// somewhere in the object. The scan/dead bit on the
  1126  		// second word is the checkmark, so we don't set it.
  1127  		// In all following words, we set the scan/dead
   1128  		// appropriately to indicate that the object continues
  1129  		// to the next 2-bit entry in the bitmap.
  1130  		//
  1131  		// TODO: It doesn't matter if we set the checkmark, so
  1132  		// maybe this case isn't needed any more.
  1133  		hb = b & bitPointerAll
  1134  		hb |= bitScan | bitScan<<(2*heapBitsShift) | bitScan<<(3*heapBitsShift)
  1135  		if w += 4; w >= nw {
  1136  			goto Phase3
  1137  		}
  1138  		*hbitp = uint8(hb)
  1139  		hbitp = subtract1(hbitp)
  1140  		b >>= 4
  1141  		nb -= 4
  1142  
  1143  	case sys.PtrSize == 8 && h.shift == 2:
  1144  		// Ptrmask and heap bitmap are misaligned.
  1145  		// The bits for the first two words are in a byte shared
  1146  		// with another object, so we must be careful with the bits
  1147  		// already there.
  1148  		// We took care of 1-word and 2-word objects above,
  1149  		// so this is at least a 6-word object.
  1150  		hb = (b & (bitPointer | bitPointer<<heapBitsShift)) << (2 * heapBitsShift)
  1151  		// This is not noscan, so set the scan bit in the
  1152  		// first word.
  1153  		hb |= bitScan << (2 * heapBitsShift)
  1154  		b >>= 2
  1155  		nb -= 2
  1156  		// Note: no bitScan for second word because that's
  1157  		// the checkmark.
  1158  		*hbitp &^= uint8((bitPointer | bitScan | (bitPointer << heapBitsShift)) << (2 * heapBitsShift))
  1159  		*hbitp |= uint8(hb)
  1160  		hbitp = subtract1(hbitp)
  1161  		if w += 2; w >= nw {
  1162  			// We know that there is more data, because we handled 2-word objects above.
  1163  			// This must be at least a 6-word object. If we're out of pointer words,
  1164  			// mark no scan in next bitmap byte and finish.
  1165  			hb = 0
  1166  			w += 4
  1167  			goto Phase3
  1168  		}
  1169  	}
  1170  
  1171  	// Phase 2: Full bytes in bitmap, up to but not including write to last byte (full or partial) in bitmap.
  1172  	// The loop computes the bits for that last write but does not execute the write;
  1173  	// it leaves the bits in hb for processing by phase 3.
  1174  	// To avoid repeated adjustment of nb, we subtract out the 4 bits we're going to
  1175  	// use in the first half of the loop right now, and then we only adjust nb explicitly
  1176  	// if the 8 bits used by each iteration isn't balanced by 8 bits loaded mid-loop.
  1177  	nb -= 4
  1178  	for {
  1179  		// Emit bitmap byte.
  1180  		// b has at least nb+4 bits, with one exception:
  1181  		// if w+4 >= nw, then b has only nw-w bits,
  1182  		// but we'll stop at the break and then truncate
  1183  		// appropriately in Phase 3.
  1184  		hb = b & bitPointerAll
  1185  		hb |= bitScanAll
  1186  		if w += 4; w >= nw {
  1187  			break
  1188  		}
  1189  		*hbitp = uint8(hb)
  1190  		hbitp = subtract1(hbitp)
  1191  		b >>= 4
  1192  
  1193  		// Load more bits. b has nb right now.
  1194  		if p != endp {
  1195  			// Fast path: keep reading from ptrmask.
  1196  			// nb unmodified: we just loaded 8 bits,
  1197  			// and the next iteration will consume 8 bits,
  1198  			// leaving us with the same nb the next time we're here.
  1199  			if nb < 8 {
  1200  				b |= uintptr(*p) << nb
  1201  				p = add1(p)
  1202  			} else {
  1203  				// Reduce the number of bits in b.
  1204  				// This is important if we skipped
  1205  				// over a scalar tail, since nb could
  1206  				// be larger than the bit width of b.
  1207  				nb -= 8
  1208  			}
  1209  		} else if p == nil {
  1210  			// Almost as fast path: track bit count and refill from pbits.
  1211  			// For short repetitions.
  1212  			if nb < 8 {
  1213  				b |= pbits << nb
  1214  				nb += endnb
  1215  			}
  1216  			nb -= 8 // for next iteration
  1217  		} else {
  1218  			// Slow path: reached end of ptrmask.
  1219  			// Process final partial byte and rewind to start.
  1220  			b |= uintptr(*p) << nb
  1221  			nb += endnb
  1222  			if nb < 8 {
  1223  				b |= uintptr(*ptrmask) << nb
  1224  				p = add1(ptrmask)
  1225  			} else {
  1226  				nb -= 8
  1227  				p = ptrmask
  1228  			}
  1229  		}
  1230  
  1231  		// Emit bitmap byte.
  1232  		hb = b & bitPointerAll
  1233  		hb |= bitScanAll
  1234  		if w += 4; w >= nw {
  1235  			break
  1236  		}
  1237  		*hbitp = uint8(hb)
  1238  		hbitp = subtract1(hbitp)
  1239  		b >>= 4
  1240  	}
  1241  
  1242  Phase3:
  1243  	// Phase 3: Write last byte or partial byte and zero the rest of the bitmap entries.
  1244  	if w > nw {
  1245  		// Counting the 4 entries in hb not yet written to memory,
  1246  		// there are more entries than possible pointer slots.
  1247  		// Discard the excess entries (can't be more than 3).
  1248  		mask := uintptr(1)<<(4-(w-nw)) - 1
  1249  		hb &= mask | mask<<4 // apply mask to both pointer bits and scan bits
  1250  	}
  1251  
  1252  	// Change nw from counting possibly-pointer words to total words in allocation.
  1253  	nw = size / sys.PtrSize
  1254  
  1255  	// Write whole bitmap bytes.
  1256  	// The first is hb, the rest are zero.
  1257  	if w <= nw {
  1258  		*hbitp = uint8(hb)
  1259  		hbitp = subtract1(hbitp)
  1260  		hb = 0 // for possible final half-byte below
  1261  		for w += 4; w <= nw; w += 4 {
  1262  			*hbitp = 0
  1263  			hbitp = subtract1(hbitp)
  1264  		}
  1265  	}
  1266  
  1267  	// Write final partial bitmap byte if any.
  1268  	// We know w > nw, or else we'd still be in the loop above.
  1269  	// It can be bigger only due to the 4 entries in hb that it counts.
  1270  	// If w == nw+4 then there's nothing left to do: we wrote all nw entries
  1271  	// and can discard the 4 sitting in hb.
  1272  	// But if w == nw+2, we need to write first two in hb.
  1273  	// The byte is shared with the next object, so be careful with
  1274  	// existing bits.
  1275  	if w == nw+2 {
  1276  		*hbitp = *hbitp&^(bitPointer|bitScan|(bitPointer|bitScan)<<heapBitsShift) | uint8(hb)
  1277  	}
  1278  
  1279  Phase4:
  1280  	// Phase 4: all done, but perhaps double check.
  1281  	if doubleCheck {
  1282  		end := heapBitsForAddr(x + size)
  1283  		if typ.kind&kindGCProg == 0 && (hbitp != end.bitp || (w == nw+2) != (end.shift == 2)) {
  1284  			println("ended at wrong bitmap byte for", typ.string(), "x", dataSize/typ.size)
  1285  			print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
  1286  			print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
  1287  			h0 := heapBitsForAddr(x)
  1288  			print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
  1289  			print("ended at hbitp=", hbitp, " but next starts at bitp=", end.bitp, " shift=", end.shift, "\n")
  1290  			throw("bad heapBitsSetType")
  1291  		}
  1292  
  1293  		// Double-check that bits to be written were written correctly.
  1294  		// Does not check that other bits were not written, unfortunately.
  1295  		h := heapBitsForAddr(x)
  1296  		nptr := typ.ptrdata / sys.PtrSize
  1297  		ndata := typ.size / sys.PtrSize
  1298  		count := dataSize / typ.size
  1299  		totalptr := ((count-1)*typ.size + typ.ptrdata) / sys.PtrSize
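        		// Words at or beyond totalptr lie in the scalar tail of
        		// the last element and are expected to be marked dead
        		// (except for byte rounding in the GC-program case).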
  1300  		for i := uintptr(0); i < size/sys.PtrSize; i++ {
  1301  			j := i % ndata
  1302  			var have, want uint8
  1303  			have = (*h.bitp >> h.shift) & (bitPointer | bitScan)
  1304  			if i >= totalptr {
  1305  				want = 0 // dead encoding expected
  1306  				if typ.kind&kindGCProg != 0 && i < (totalptr+3)/4*4 {
  1307  					want = bitScan
  1308  				}
  1309  			} else {
  1310  				if j < nptr && (*addb(ptrmask, j/8)>>(j%8))&1 != 0 {
  1311  					want |= bitPointer
  1312  				}
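        				// Word 1's scan bit position holds the checkmark
        				// bit instead, so ignore it in the comparison.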
  1313  				if i != 1 {
  1314  					want |= bitScan
  1315  				} else {
  1316  					have &^= bitScan
  1317  				}
  1318  			}
  1319  			if have != want {
  1320  				println("mismatch writing bits for", typ.string(), "x", dataSize/typ.size)
  1321  				print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
  1322  				print("kindGCProg=", typ.kind&kindGCProg != 0, "\n")
  1323  				print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
  1324  				h0 := heapBitsForAddr(x)
  1325  				print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
  1326  				print("current bits h.bitp=", h.bitp, " h.shift=", h.shift, " *h.bitp=", hex(*h.bitp), "\n")
  1327  				print("ptrmask=", ptrmask, " p=", p, " endp=", endp, " endnb=", endnb, " pbits=", hex(pbits), " b=", hex(b), " nb=", nb, "\n")
  1328  				println("at word", i, "offset", i*sys.PtrSize, "have", have, "want", want)
  1329  				if typ.kind&kindGCProg != 0 {
  1330  					println("GC program:")
  1331  					dumpGCProg(addb(typ.gcdata, 4))
  1332  				}
  1333  				throw("bad heapBitsSetType")
  1334  			}
  1335  			h = h.next()
  1336  		}
  1337  		if ptrmask == debugPtrmask.data {
  1338  			unlock(&debugPtrmask.lock)
  1339  		}
  1340  	}
  1341  }
  1342  
  1343  var debugPtrmask struct {
  1344  	lock mutex
  1345  	data *byte
  1346  }
  1347  
  1348  // heapBitsSetTypeGCProg implements heapBitsSetType using a GC program.
  1349  // progSize is the size of the memory described by the program.
  1350  // elemSize is the size of the element that the GC program describes (possibly only a prefix of it).
  1351  // dataSize is the total size of the intended data, a multiple of elemSize.
  1352  // allocSize is the total size of the allocated memory.
  1353  //
  1354  // GC programs are only used for large allocations.
  1355  // heapBitsSetType requires that allocSize is a multiple of 4 words,
  1356  // so that the relevant bitmap bytes are not shared with surrounding
  1357  // objects.
  1358  func heapBitsSetTypeGCProg(h heapBits, progSize, elemSize, dataSize, allocSize uintptr, prog *byte) {
  1359  	if sys.PtrSize == 8 && allocSize%(4*sys.PtrSize) != 0 {
  1360  		// Alignment will be wrong.
  1361  		throw("heapBitsSetTypeGCProg: small allocation")
  1362  	}
  1363  	var totalBits uintptr
  1364  	if elemSize == dataSize {
  1365  		totalBits = runGCProg(prog, nil, h.bitp, 2)
  1366  		if totalBits*sys.PtrSize != progSize {
  1367  			println("runtime: heapBitsSetTypeGCProg: total bits", totalBits, "but progSize", progSize)
  1368  			throw("heapBitsSetTypeGCProg: unexpected bit count")
  1369  		}
  1370  	} else {
  1371  		count := dataSize / elemSize
  1372  
  1373  		// Piece together program trailer to run after prog that does:
  1374  		//	literal(0)
  1375  		//	repeat(1, elemSize-progSize-1) // zeros to fill element size
  1376  		//	repeat(elemSize, count-1) // repeat that element for count
  1377  		// This zero-pads the data remaining in the first element and then
  1378  		// repeats that first element to fill the array.
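        		//
        		// For example (illustrative sizes, not from a real type):
        		// for an element of 32 words whose program covers its
        		// first 2 words, repeated count = 4 times, the trailer
        		// assembled below is:
        		//	0x01 0x00       literal(0): one zero bit
        		//	0x81 0x1d       repeat(1, 29): 29 more zero bits
        		//	0x80 0x20 0x03  repeat(32, 3): repeat the 32-bit
        		//	                element pattern 3 more times
        		//	0x00            stop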
  1379  		var trailer [40]byte // 3 varints (max 10 each) + some bytes
  1380  		i := 0
  1381  		if n := elemSize/sys.PtrSize - progSize/sys.PtrSize; n > 0 {
  1382  			// literal(0)
  1383  			trailer[i] = 0x01
  1384  			i++
  1385  			trailer[i] = 0
  1386  			i++
  1387  			if n > 1 {
  1388  				// repeat(1, n-1)
  1389  				trailer[i] = 0x81
  1390  				i++
  1391  				n--
  1392  				for ; n >= 0x80; n >>= 7 {
  1393  					trailer[i] = byte(n | 0x80)
  1394  					i++
  1395  				}
  1396  				trailer[i] = byte(n)
  1397  				i++
  1398  			}
  1399  		}
  1400  		// repeat(elemSize/ptrSize, count-1)
  1401  		trailer[i] = 0x80
  1402  		i++
  1403  		n := elemSize / sys.PtrSize
  1404  		for ; n >= 0x80; n >>= 7 {
  1405  			trailer[i] = byte(n | 0x80)
  1406  			i++
  1407  		}
  1408  		trailer[i] = byte(n)
  1409  		i++
  1410  		n = count - 1
  1411  		for ; n >= 0x80; n >>= 7 {
  1412  			trailer[i] = byte(n | 0x80)
  1413  			i++
  1414  		}
  1415  		trailer[i] = byte(n)
  1416  		i++
  1417  		trailer[i] = 0 // stop
  1418  		i++
  1419  
  1420  		runGCProg(prog, &trailer[0], h.bitp, 2)
  1421  
  1422  		// Even though we filled in the full array just now,
  1423  		// record that we only filled in up to the ptrdata of the
  1424  		// last element. This will cause the code below to
  1425  		// memclr the dead section of the final array element,
  1426  		// so that scanobject can stop early in the final element.
  1427  		totalBits = (elemSize*(count-1) + progSize) / sys.PtrSize
  1428  	}
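        	// Zero the bitmap bytes between the end of the program's
        	// output and the end of the allocation, so the unused tail
        	// reads as the dead encoding.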
  1429  	endProg := unsafe.Pointer(subtractb(h.bitp, (totalBits+3)/4))
  1430  	endAlloc := unsafe.Pointer(subtractb(h.bitp, allocSize/heapBitmapScale))
  1431  	memclrNoHeapPointers(add(endAlloc, 1), uintptr(endProg)-uintptr(endAlloc))
  1432  }
  1433  
  1434  // progToPointerMask returns the 1-bit pointer mask output by the GC program prog.
  1435  // size is the size of the region described by prog, in bytes.
  1436  // The resulting bitvector will have no more than size/sys.PtrSize bits.
  1437  func progToPointerMask(prog *byte, size uintptr) bitvector {
  1438  	n := (size/sys.PtrSize + 7) / 8
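        	// Allocate one extra byte for an overflow-check sentinel.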
  1439  	x := (*[1 << 30]byte)(persistentalloc(n+1, 1, &memstats.buckhash_sys))[:n+1]
  1440  	x[len(x)-1] = 0xa1 // overflow check sentinel
  1441  	n = runGCProg(prog, nil, &x[0], 1)
  1442  	if x[len(x)-1] != 0xa1 {
  1443  		throw("progToPointerMask: overflow")
  1444  	}
  1445  	return bitvector{int32(n), &x[0]}
  1446  }
  1447  
  1448  // Packed GC pointer bitmaps, aka GC programs.
  1449  //
  1450  // For large types containing arrays, the type information has a
  1451  // natural repetition that can be encoded to save space in the
  1452  // binary and in the memory representation of the type information.
  1453  //
  1454  // The encoding is a simple Lempel-Ziv style bytecode machine
  1455  // with the following instructions:
  1456  //
  1457  //	00000000: stop
  1458  //	0nnnnnnn: emit n bits copied from the next (n+7)/8 bytes
  1459  //	10000000 n c: repeat the previous n bits c times; n, c are varints
  1460  //	1nnnnnnn c: repeat the previous n bits c times; c is a varint
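        //
        // For example (an illustrative hand-assembled program, not
        // compiler output), 4096 consecutive pointer words could be
        // encoded as:
        //
        //	0x01 0x01       emit 1 literal bit: 1 (pointer)
        //	0x81 0xff 0x1f  repeat the previous 1 bit 4095 times (varint 4095)
        //	0x00            stop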
  1461  
  1462  // runGCProg executes the GC program prog, and then trailer if non-nil,
  1463  // writing to dst with entries of the given size.
  1464  // If size == 1, dst is a 1-bit pointer mask laid out moving forward from dst.
  1465  // If size == 2, dst is the 2-bit heap bitmap, and writes move backward
  1466  // starting at dst (because the heap bitmap does). In this case, the caller guarantees
  1467  // that only whole bytes in dst need to be written.
  1468  //
  1469  // runGCProg returns the number of 1- or 2-bit entries written to memory.
  1470  func runGCProg(prog, trailer, dst *byte, size int) uintptr {
  1471  	dstStart := dst
  1472  
  1473  	// Bits waiting to be written to memory.
  1474  	var bits uintptr
  1475  	var nbits uintptr
  1476  
  1477  	p := prog
  1478  Run:
  1479  	for {
  1480  		// Flush accumulated full bytes.
  1481  		// The rest of the loop assumes that nbits <= 7.
  1482  		for ; nbits >= 8; nbits -= 8 {
  1483  			if size == 1 {
  1484  				*dst = uint8(bits)
  1485  				dst = add1(dst)
  1486  				bits >>= 8
  1487  			} else {
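        				// Heap bitmap: each byte takes 4 pointer bits in
        				// its low half, written moving backward, with all
        				// scan bits set in the high half.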
  1488  				v := bits&bitPointerAll | bitScanAll
  1489  				*dst = uint8(v)
  1490  				dst = subtract1(dst)
  1491  				bits >>= 4
  1492  				v = bits&bitPointerAll | bitScanAll
  1493  				*dst = uint8(v)
  1494  				dst = subtract1(dst)
  1495  				bits >>= 4
  1496  			}
  1497  		}
  1498  
  1499  		// Process one instruction.
  1500  		inst := uintptr(*p)
  1501  		p = add1(p)
  1502  		n := inst & 0x7F
  1503  		if inst&0x80 == 0 {
  1504  			// Literal bits; n == 0 means end of program.
  1505  			if n == 0 {
  1506  				// Program is over; continue in trailer if present.
  1507  				if trailer != nil {
  1508  					//println("trailer")
  1509  					p = trailer
  1510  					trailer = nil
  1511  					continue
  1512  				}
  1513  				//println("done")
  1514  				break Run
  1515  			}
  1516  			//println("lit", n, dst)
  1517  			nbyte := n / 8
  1518  			for i := uintptr(0); i < nbyte; i++ {
  1519  				bits |= uintptr(*p) << nbits
  1520  				p = add1(p)
  1521  				if size == 1 {
  1522  					*dst = uint8(bits)
  1523  					dst = add1(dst)
  1524  					bits >>= 8
  1525  				} else {
  1526  					v := bits&0xf | bitScanAll
  1527  					*dst = uint8(v)
  1528  					dst = subtract1(dst)
  1529  					bits >>= 4
  1530  					v = bits&0xf | bitScanAll
  1531  					*dst = uint8(v)
  1532  					dst = subtract1(dst)
  1533  					bits >>= 4
  1534  				}
  1535  			}
  1536  			if n %= 8; n > 0 {
  1537  				bits |= uintptr(*p) << nbits
  1538  				p = add1(p)
  1539  				nbits += n
  1540  			}
  1541  			continue Run
  1542  		}
  1543  
  1544  		// Repeat. If n == 0, it is encoded in a varint in the next bytes.
  1545  		if n == 0 {
  1546  			for off := uint(0); ; off += 7 {
  1547  				x := uintptr(*p)
  1548  				p = add1(p)
  1549  				n |= (x & 0x7F) << off
  1550  				if x&0x80 == 0 {
  1551  					break
  1552  				}
  1553  			}
  1554  		}
  1555  
  1556  		// Count is encoded in a varint in the next bytes.
  1557  		c := uintptr(0)
  1558  		for off := uint(0); ; off += 7 {
  1559  			x := uintptr(*p)
  1560  			p = add1(p)
  1561  			c |= (x & 0x7F) << off
  1562  			if x&0x80 == 0 {
  1563  				break
  1564  			}
  1565  		}
  1566  		c *= n // now total number of bits to copy
  1567  
  1568  		// If the number of bits being repeated is small, load them
  1569  		// into a register and use that register for the entire loop
  1570  		// instead of repeatedly reading from memory.
  1571  		// Handling fewer than 8 bits here makes the general loop simpler.
  1572  		// The cutoff is sys.PtrSize*8 - 7 to guarantee that when we add
  1573  		// the pattern to a bit buffer holding at most 7 bits (a partial byte)
  1574  		// it will not overflow.
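        		// The bits to repeat are read back from the output just
        		// written: behind dst for the forward-moving pointer mask,
        		// ahead of dst for the backward-moving heap bitmap.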
  1575  		src := dst
  1576  		const maxBits = sys.PtrSize*8 - 7
  1577  		if n <= maxBits {
  1578  			// Start with bits in output buffer.
  1579  			pattern := bits
  1580  			npattern := nbits
  1581  
  1582  			// If we need more bits, fetch them from memory.
  1583  			if size == 1 {
  1584  				src = subtract1(src)
  1585  				for npattern < n {
  1586  					pattern <<= 8
  1587  					pattern |= uintptr(*src)
  1588  					src = subtract1(src)
  1589  					npattern += 8
  1590  				}
  1591  			} else {
  1592  				src = add1(src)
  1593  				for npattern < n {
  1594  					pattern <<= 4
  1595  					pattern |= uintptr(*src) & 0xf
  1596  					src = add1(src)
  1597  					npattern += 4
  1598  				}
  1599  			}
  1600  
  1601  			// We started with the whole bit output buffer,
  1602  			// and then we loaded bits from whole bytes.
  1603  			// Either way, we might now have too many instead of too few.
  1604  			// Discard the extra.
  1605  			if npattern > n {
  1606  				pattern >>= npattern - n
  1607  				npattern = n
  1608  			}
  1609  
  1610  			// Replicate pattern to at most maxBits.
  1611  			if npattern == 1 {
  1612  				// One bit being repeated.
  1613  				// If the bit is 1, make the pattern all 1s.
  1614  				// If the bit is 0, the pattern is already all 0s,
  1615  				// but we can claim that the number of bits
  1616  				// in the word is equal to the number we need (c),
  1617  				// because right shift of bits will zero fill.
  1618  				if pattern == 1 {
  1619  					pattern = 1<<maxBits - 1
  1620  					npattern = maxBits
  1621  				} else {
  1622  					npattern = c
  1623  				}
  1624  			} else {
  1625  				b := pattern
  1626  				nb := npattern
  1627  				if nb+nb <= maxBits {
  1628  					// Double pattern until the whole uintptr is filled.
  1629  					for nb <= sys.PtrSize*8 {
  1630  						b |= b << nb
  1631  						nb += nb
  1632  					}
  1633  					// Trim away incomplete copy of original pattern in high bits.
  1634  					// TODO(rsc): Replace with table lookup or loop on systems without divide?
  1635  					nb = maxBits / npattern * npattern
  1636  					b &= 1<<nb - 1
  1637  					pattern = b
  1638  					npattern = nb
  1639  				}
  1640  			}
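        			// For example, with pattern == 0b101 and npattern == 3
        			// on a 64-bit system (maxBits == 57), doubling leaves
        			// 19 whole copies: npattern becomes 57/3*3 == 57.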
  1641  
  1642  			// Add pattern to bit buffer and flush bit buffer, c/npattern times.
  1643  			// Since pattern contains >8 bits, there will be full bytes to flush
  1644  			// on each iteration.
  1645  			for ; c >= npattern; c -= npattern {
  1646  				bits |= pattern << nbits
  1647  				nbits += npattern
  1648  				if size == 1 {
  1649  					for nbits >= 8 {
  1650  						*dst = uint8(bits)
  1651  						dst = add1(dst)
  1652  						bits >>= 8
  1653  						nbits -= 8
  1654  					}
  1655  				} else {
  1656  					for nbits >= 4 {
  1657  						*dst = uint8(bits&0xf | bitScanAll)
  1658  						dst = subtract1(dst)
  1659  						bits >>= 4
  1660  						nbits -= 4
  1661  					}
  1662  				}
  1663  			}
  1664  
  1665  			// Add final fragment to bit buffer.
  1666  			if c > 0 {
  1667  				pattern &= 1<<c - 1
  1668  				bits |= pattern << nbits
  1669  				nbits += c
  1670  			}
  1671  			continue Run
  1672  		}
  1673  
  1674  		// Repeat; n too large to fit in a register.
  1675  		// Since nbits <= 7, we know the first few bytes of repeated data
  1676  		// are already written to memory.
  1677  		off := n - nbits // n > nbits because n > maxBits and nbits <= 7
  1678  		if size == 1 {
  1679  			// Leading src fragment.
  1680  			src = subtractb(src, (off+7)/8)
  1681  			if frag := off & 7; frag != 0 {
  1682  				bits |= uintptr(*src) >> (8 - frag) << nbits
  1683  				src = add1(src)
  1684  				nbits += frag
  1685  				c -= frag
  1686  			}
  1687  			// Main loop: load one byte, write another.
  1688  			// The bits are rotating through the bit buffer.
  1689  			for i := c / 8; i > 0; i-- {
  1690  				bits |= uintptr(*src) << nbits
  1691  				src = add1(src)
  1692  				*dst = uint8(bits)
  1693  				dst = add1(dst)
  1694  				bits >>= 8
  1695  			}
  1696  			// Final src fragment.
  1697  			if c %= 8; c > 0 {
  1698  				bits |= (uintptr(*src) & (1<<c - 1)) << nbits
  1699  				nbits += c
  1700  			}
  1701  		} else {
  1702  			// Leading src fragment.
  1703  			src = addb(src, (off+3)/4)
  1704  			if frag := off & 3; frag != 0 {
  1705  				bits |= (uintptr(*src) & 0xf) >> (4 - frag) << nbits
  1706  				src = subtract1(src)
  1707  				nbits += frag
  1708  				c -= frag
  1709  			}
  1710  			// Main loop: load one byte, write another.
  1711  			// The bits are rotating through the bit buffer.
  1712  			for i := c / 4; i > 0; i-- {
  1713  				bits |= (uintptr(*src) & 0xf) << nbits
  1714  				src = subtract1(src)
  1715  				*dst = uint8(bits&0xf | bitScanAll)
  1716  				dst = subtract1(dst)
  1717  				bits >>= 4
  1718  			}
  1719  			// Final src fragment.
  1720  			if c %= 4; c > 0 {
  1721  				bits |= (uintptr(*src) & (1<<c - 1)) << nbits
  1722  				nbits += c
  1723  			}
  1724  		}
  1725  	}
  1726  
  1727  	// Write any final bits out, using full-byte writes, even for the final byte.
  1728  	var totalBits uintptr
  1729  	if size == 1 {
  1730  		totalBits = (uintptr(unsafe.Pointer(dst))-uintptr(unsafe.Pointer(dstStart)))*8 + nbits
  1731  		nbits += -nbits & 7 // round nbits up to a multiple of 8
  1732  		for ; nbits > 0; nbits -= 8 {
  1733  			*dst = uint8(bits)
  1734  			dst = add1(dst)
  1735  			bits >>= 8
  1736  		}
  1737  	} else {
  1738  		totalBits = (uintptr(unsafe.Pointer(dstStart))-uintptr(unsafe.Pointer(dst)))*4 + nbits
  1739  		nbits += -nbits & 3 // round nbits up to a multiple of 4
  1740  		for ; nbits > 0; nbits -= 4 {
  1741  			v := bits&0xf | bitScanAll
  1742  			*dst = uint8(v)
  1743  			dst = subtract1(dst)
  1744  			bits >>= 4
  1745  		}
  1746  	}
  1747  	return totalBits
  1748  }
  1749  
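        // dumpGCProg disassembles the GC program at p to standard
        // output, for debugging.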
  1750  func dumpGCProg(p *byte) {
  1751  	nptr := 0
  1752  	for {
  1753  		x := *p
  1754  		p = add1(p)
  1755  		if x == 0 {
  1756  			print("\t", nptr, " end\n")
  1757  			break
  1758  		}
  1759  		if x&0x80 == 0 {
  1760  			print("\t", nptr, " lit ", x, ":")
  1761  			n := int(x+7) / 8
  1762  			for i := 0; i < n; i++ {
  1763  				print(" ", hex(*p))
  1764  				p = add1(p)
  1765  			}
  1766  			print("\n")
  1767  			nptr += int(x)
  1768  		} else {
  1769  			nbit := int(x &^ 0x80)
  1770  			if nbit == 0 {
  1771  				for nb := uint(0); ; nb += 7 {
  1772  					x := *p
  1773  					p = add1(p)
  1774  					nbit |= int(x&0x7f) << nb
  1775  					if x&0x80 == 0 {
  1776  						break
  1777  					}
  1778  				}
  1779  			}
  1780  			count := 0
  1781  			for nb := uint(0); ; nb += 7 {
  1782  				x := *p
  1783  				p = add1(p)
  1784  				count |= int(x&0x7f) << nb
  1785  				if x&0x80 == 0 {
  1786  					break
  1787  				}
  1788  			}
  1789  			print("\t", nptr, " repeat ", nbit, " × ", count, "\n")
  1790  			nptr += nbit * count
  1791  		}
  1792  	}
  1793  }
  1794  
  1795  // Testing.
  1796  
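        // getgcmaskcb is the gentraceback callback used by getgcmask:
        // it copies out the frame containing the target stack address
        // and returns false to stop the traceback there.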
  1797  func getgcmaskcb(frame *stkframe, ctxt unsafe.Pointer) bool {
  1798  	target := (*stkframe)(ctxt)
  1799  	if frame.sp <= target.sp && target.sp < frame.varp {
  1800  		*target = *frame
  1801  		return false
  1802  	}
  1803  	return true
  1804  }
  1805  
  1806  // gcbits returns the GC type info for x, for testing.
  1807  // The result is the bitmap entries (0 or 1), one entry per byte.
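        //
        // For example (illustrative), for a value of type
        // struct{ p *int; n int; q *int } on a 64-bit system,
        // the result is []byte{1, 0, 1}.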
  1808  //go:linkname reflect_gcbits reflect.gcbits
  1809  func reflect_gcbits(x interface{}) []byte {
  1810  	ret := getgcmask(x)
  1811  	typ := (*ptrtype)(unsafe.Pointer(efaceOf(&x)._type)).elem
  1812  	nptr := typ.ptrdata / sys.PtrSize
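        	// Trim trailing zero entries reported for the non-pointer
        	// tail of the allocation.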
  1813  	for uintptr(len(ret)) > nptr && ret[len(ret)-1] == 0 {
  1814  		ret = ret[:len(ret)-1]
  1815  	}
  1816  	return ret
  1817  }
  1818  
  1819  // getgcmask returns the GC type info for object ep, for testing.
  1820  func getgcmask(ep interface{}) (mask []byte) {
  1821  	e := *efaceOf(&ep)
  1822  	p := e.data
  1823  	t := e._type
  1824  	// data or bss
  1825  	for _, datap := range activeModules() {
  1826  		// data
  1827  		if datap.data <= uintptr(p) && uintptr(p) < datap.edata {
  1828  			bitmap := datap.gcdatamask.bytedata
  1829  			n := (*ptrtype)(unsafe.Pointer(t)).elem.size
  1830  			mask = make([]byte, n/sys.PtrSize)
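        			// Copy out one bit per pointer-sized word from the
        			// 1-bit data-section bitmap.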
  1831  			for i := uintptr(0); i < n; i += sys.PtrSize {
  1832  				off := (uintptr(p) + i - datap.data) / sys.PtrSize
  1833  				mask[i/sys.PtrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
  1834  			}
  1835  			return
  1836  		}
  1837  
  1838  		// bss
  1839  		if datap.bss <= uintptr(p) && uintptr(p) < datap.ebss {
  1840  			bitmap := datap.gcbssmask.bytedata
  1841  			n := (*ptrtype)(unsafe.Pointer(t)).elem.size
  1842  			mask = make([]byte, n/sys.PtrSize)
  1843  			for i := uintptr(0); i < n; i += sys.PtrSize {
  1844  				off := (uintptr(p) + i - datap.bss) / sys.PtrSize
  1845  				mask[i/sys.PtrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
  1846  			}
  1847  			return
  1848  		}
  1849  	}
  1850  
  1851  	// heap
  1852  	var n uintptr
  1853  	var base uintptr
  1854  	if mlookup(uintptr(p), &base, &n, nil) != 0 {
  1855  		mask = make([]byte, n/sys.PtrSize)
  1856  		for i := uintptr(0); i < n; i += sys.PtrSize {
  1857  			hbits := heapBitsForAddr(base + i)
  1858  			if hbits.isPointer() {
  1859  				mask[i/sys.PtrSize] = 1
  1860  			}
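        			// Word 1's high bit is the checkmark bit, not a
        			// morePointers bit, so it cannot end the object.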
  1861  			if i != 1*sys.PtrSize && !hbits.morePointers() {
  1862  				mask = mask[:i/sys.PtrSize]
  1863  				break
  1864  			}
  1865  		}
  1866  		return
  1867  	}
  1868  
  1869  	// stack
  1870  	if _g_ := getg(); _g_.m.curg.stack.lo <= uintptr(p) && uintptr(p) < _g_.m.curg.stack.hi {
  1871  		var frame stkframe
  1872  		frame.sp = uintptr(p)
  1873  		_g_ := getg()
  1874  		gentraceback(_g_.m.curg.sched.pc, _g_.m.curg.sched.sp, 0, _g_.m.curg, 0, nil, 1000, getgcmaskcb, noescape(unsafe.Pointer(&frame)), 0)
  1875  		if frame.fn.valid() {
  1876  			f := frame.fn
  1877  			targetpc := frame.continpc
  1878  			if targetpc == 0 {
  1879  				return
  1880  			}
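        			// Back up into the call instruction so the lookup
        			// sees the stack map in effect during the call.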
  1881  			if targetpc != f.entry {
  1882  				targetpc--
  1883  			}
  1884  			pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc, nil)
  1885  			if pcdata == -1 {
  1886  				return
  1887  			}
  1888  			stkmap := (*stackmap)(funcdata(f, _FUNCDATA_LocalsPointerMaps))
  1889  			if stkmap == nil || stkmap.n <= 0 {
  1890  				return
  1891  			}
  1892  			bv := stackmapdata(stkmap, pcdata)
  1893  			size := uintptr(bv.n) * sys.PtrSize
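        			// The locals bitmap bv covers the size bytes below
        			// frame.varp, one bit per pointer-sized word.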
  1894  			n := (*ptrtype)(unsafe.Pointer(t)).elem.size
  1895  			mask = make([]byte, n/sys.PtrSize)
  1896  			for i := uintptr(0); i < n; i += sys.PtrSize {
  1897  				bitmap := bv.bytedata
  1898  				off := (uintptr(p) + i - frame.varp + size) / sys.PtrSize
  1899  				mask[i/sys.PtrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
  1900  			}
  1901  		}
  1902  		return
  1903  	}
  1904  
  1905  	// Otherwise, this is not something the GC knows about,
  1906  	// possibly read-only data, like malloc(0).
  1907  	// It must not have pointers.
  1908  	return
  1909  }