github.com/whtcorpsinc/milevadb-prod@v0.0.0-20211104133533-f57f4be3b597/interlock/hash_table.go (about)

     1  // Copyright 2020 WHTCORPS INC, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package interlock
    15  
    16  import (
    17  	"fmt"
    18  	"hash"
    19  	"hash/fnv"
    20  	"sync/atomic"
    21  	"time"
    22  
    23  	"github.com/whtcorpsinc/errors"
    24  	"github.com/whtcorpsinc/milevadb/stochastikctx"
    25  	"github.com/whtcorpsinc/milevadb/stochastikctx/stmtctx"
    26  	"github.com/whtcorpsinc/milevadb/types"
    27  	"github.com/whtcorpsinc/milevadb/soliton/chunk"
    28  	"github.com/whtcorpsinc/milevadb/soliton/codec"
    29  	"github.com/whtcorpsinc/milevadb/soliton/disk"
    30  	"github.com/whtcorpsinc/milevadb/soliton/memory"
    31  )
    32  
    33  const (
    34  	// estCountMaxFactor defines the factor of estCountMax with maxChunkSize.
    35  	// estCountMax is maxChunkSize * estCountMaxFactor, the maximum threshold of estCount.
    36  	// if estCount is larger than estCountMax, set estCount to estCountMax.
    37  	// Set this threshold to prevent buildSideEstCount being too large and causing a performance and memory regression.
    38  	estCountMaxFactor = 10 * 1024
    39  
    40  	// estCountMinFactor defines the factor of estCountMin with maxChunkSize.
    41  	// estCountMin is maxChunkSize * estCountMinFactor, the minimum threshold of estCount.
    42  	// If estCount is smaller than estCountMin, set estCount to 0.
    43  	// Set this threshold to prevent buildSideEstCount being too small and causing a performance regression.
    44  	estCountMinFactor = 8
    45  
    46  	// estCountDivisor defines the divisor of buildSideEstCount.
    47  	// Set this divisor to prevent buildSideEstCount being too large and causing a performance regression.
    48  	estCountDivisor = 8
    49  )
    50  
    51  // hashContext keeps the needed hash context of a EDB causet in hash join.
    52  type hashContext struct {
    53  	allTypes  []*types.FieldType
    54  	keyDefCausIdx []int
    55  	buf       []byte
    56  	hashVals  []hash.Hash64
    57  	hasNull   []bool
    58  }
    59  
    60  func (hc *hashContext) initHash(rows int) {
    61  	if hc.buf == nil {
    62  		hc.buf = make([]byte, 1)
    63  	}
    64  
    65  	if len(hc.hashVals) < rows {
    66  		hc.hasNull = make([]bool, rows)
    67  		hc.hashVals = make([]hash.Hash64, rows)
    68  		for i := 0; i < rows; i++ {
    69  			hc.hashVals[i] = fnv.New64()
    70  		}
    71  	} else {
    72  		for i := 0; i < rows; i++ {
    73  			hc.hasNull[i] = false
    74  			hc.hashVals[i].Reset()
    75  		}
    76  	}
    77  }
    78  
    79  type hashStatistic struct {
    80  	probeDefCauslision   int
    81  	buildBlockElapse time.Duration
    82  }
    83  
    84  func (s *hashStatistic) String() string {
    85  	return fmt.Sprintf("probe_defCauslision:%v, build:%v", s.probeDefCauslision, s.buildBlockElapse)
    86  }
    87  
    88  // hashEventContainer handles the rows and the hash map of a causet.
    89  type hashEventContainer struct {
    90  	sc   *stmtctx.StatementContext
    91  	hCtx *hashContext
    92  	stat hashStatistic
    93  
    94  	// hashBlock stores the map of hashKey and EventPtr
    95  	hashBlock baseHashBlock
    96  
    97  	rowContainer *chunk.EventContainer
    98  }
    99  
   100  func newHashEventContainer(sCtx stochastikctx.Context, estCount int, hCtx *hashContext) *hashEventContainer {
   101  	maxChunkSize := sCtx.GetStochastikVars().MaxChunkSize
   102  	rc := chunk.NewEventContainer(hCtx.allTypes, maxChunkSize)
   103  	c := &hashEventContainer{
   104  		sc:           sCtx.GetStochastikVars().StmtCtx,
   105  		hCtx:         hCtx,
   106  		hashBlock:    newConcurrentMapHashBlock(),
   107  		rowContainer: rc,
   108  	}
   109  	return c
   110  }
   111  
   112  // GetMatchedEventsAndPtrs get matched rows and Ptrs from probeEvent. It can be called
   113  // in multiple goroutines while each goroutine should keep its own
   114  // h and buf.
   115  func (c *hashEventContainer) GetMatchedEventsAndPtrs(probeKey uint64, probeEvent chunk.Event, hCtx *hashContext) (matched []chunk.Event, matchedPtrs []chunk.EventPtr, err error) {
   116  	innerPtrs := c.hashBlock.Get(probeKey)
   117  	if len(innerPtrs) == 0 {
   118  		return
   119  	}
   120  	matched = make([]chunk.Event, 0, len(innerPtrs))
   121  	var matchedEvent chunk.Event
   122  	matchedPtrs = make([]chunk.EventPtr, 0, len(innerPtrs))
   123  	for _, ptr := range innerPtrs {
   124  		matchedEvent, err = c.rowContainer.GetEvent(ptr)
   125  		if err != nil {
   126  			return
   127  		}
   128  		var ok bool
   129  		ok, err = c.matchJoinKey(matchedEvent, probeEvent, hCtx)
   130  		if err != nil {
   131  			return
   132  		}
   133  		if !ok {
   134  			c.stat.probeDefCauslision++
   135  			continue
   136  		}
   137  		matched = append(matched, matchedEvent)
   138  		matchedPtrs = append(matchedPtrs, ptr)
   139  	}
   140  	return
   141  }
   142  
   143  // matchJoinKey checks if join keys of buildEvent and probeEvent are logically equal.
   144  func (c *hashEventContainer) matchJoinKey(buildEvent, probeEvent chunk.Event, probeHCtx *hashContext) (ok bool, err error) {
   145  	return codec.EqualChunkEvent(c.sc,
   146  		buildEvent, c.hCtx.allTypes, c.hCtx.keyDefCausIdx,
   147  		probeEvent, probeHCtx.allTypes, probeHCtx.keyDefCausIdx)
   148  }
   149  
   150  // alreadySpilledSafeForTest indicates that records have spilled out into disk. It's thread-safe.
   151  func (c *hashEventContainer) alreadySpilledSafeForTest() bool {
   152  	return c.rowContainer.AlreadySpilledSafeForTest()
   153  }
   154  
   155  // PutChunk puts a chunk into hashEventContainer and build hash map. It's not thread-safe.
   156  // key of hash causet: hash value of key defCausumns
   157  // value of hash causet: EventPtr of the corresponded event
   158  func (c *hashEventContainer) PutChunk(chk *chunk.Chunk, ignoreNulls []bool) error {
   159  	return c.PutChunkSelected(chk, nil, ignoreNulls)
   160  }
   161  
   162  // PutChunkSelected selectively puts a chunk into hashEventContainer and build hash map. It's not thread-safe.
   163  // key of hash causet: hash value of key defCausumns
   164  // value of hash causet: EventPtr of the corresponded event
   165  func (c *hashEventContainer) PutChunkSelected(chk *chunk.Chunk, selected, ignoreNulls []bool) error {
   166  	start := time.Now()
   167  	defer func() { c.stat.buildBlockElapse += time.Since(start) }()
   168  
   169  	chkIdx := uint32(c.rowContainer.NumChunks())
   170  	err := c.rowContainer.Add(chk)
   171  	if err != nil {
   172  		return err
   173  	}
   174  	numEvents := chk.NumEvents()
   175  	c.hCtx.initHash(numEvents)
   176  
   177  	hCtx := c.hCtx
   178  	for keyIdx, defCausIdx := range c.hCtx.keyDefCausIdx {
   179  		ignoreNull := len(ignoreNulls) > keyIdx && ignoreNulls[keyIdx]
   180  		err := codec.HashChunkSelected(c.sc, hCtx.hashVals, chk, hCtx.allTypes[defCausIdx], defCausIdx, hCtx.buf, hCtx.hasNull, selected, ignoreNull)
   181  		if err != nil {
   182  			return errors.Trace(err)
   183  		}
   184  	}
   185  	for i := 0; i < numEvents; i++ {
   186  		if (selected != nil && !selected[i]) || c.hCtx.hasNull[i] {
   187  			continue
   188  		}
   189  		key := c.hCtx.hashVals[i].Sum64()
   190  		rowPtr := chunk.EventPtr{ChkIdx: chkIdx, EventIdx: uint32(i)}
   191  		c.hashBlock.Put(key, rowPtr)
   192  	}
   193  	return nil
   194  }
   195  
   196  // getJoinKeyFromChkEvent fetches join keys from event and calculate the hash value.
   197  func (*hashEventContainer) getJoinKeyFromChkEvent(sc *stmtctx.StatementContext, event chunk.Event, hCtx *hashContext) (hasNull bool, key uint64, err error) {
   198  	for _, i := range hCtx.keyDefCausIdx {
   199  		if event.IsNull(i) {
   200  			return true, 0, nil
   201  		}
   202  	}
   203  	hCtx.initHash(1)
   204  	err = codec.HashChunkEvent(sc, hCtx.hashVals[0], event, hCtx.allTypes, hCtx.keyDefCausIdx, hCtx.buf)
   205  	return false, hCtx.hashVals[0].Sum64(), err
   206  }
   207  
   208  // NumChunks returns the number of chunks in the rowContainer
   209  func (c *hashEventContainer) NumChunks() int {
   210  	return c.rowContainer.NumChunks()
   211  }
   212  
   213  // NumEventsOfChunk returns the number of rows of a chunk
   214  func (c *hashEventContainer) NumEventsOfChunk(chkID int) int {
   215  	return c.rowContainer.NumEventsOfChunk(chkID)
   216  }
   217  
   218  // GetChunk returns chkIdx th chunk of in memory records, only works if rowContainer is not spilled
   219  func (c *hashEventContainer) GetChunk(chkIdx int) (*chunk.Chunk, error) {
   220  	return c.rowContainer.GetChunk(chkIdx)
   221  }
   222  
   223  // GetEvent returns the event the ptr pointed to in the rowContainer
   224  func (c *hashEventContainer) GetEvent(ptr chunk.EventPtr) (chunk.Event, error) {
   225  	return c.rowContainer.GetEvent(ptr)
   226  }
   227  
   228  // Len returns number of records in the hash causet.
   229  func (c *hashEventContainer) Len() uint64 {
   230  	return c.hashBlock.Len()
   231  }
   232  
   233  func (c *hashEventContainer) Close() error {
   234  	return c.rowContainer.Close()
   235  }
   236  
   237  // GetMemTracker returns the underlying memory usage tracker in hashEventContainer.
   238  func (c *hashEventContainer) GetMemTracker() *memory.Tracker { return c.rowContainer.GetMemTracker() }
   239  
   240  // GetDiskTracker returns the underlying disk usage tracker in hashEventContainer.
   241  func (c *hashEventContainer) GetDiskTracker() *disk.Tracker { return c.rowContainer.GetDiskTracker() }
   242  
   243  // CausetActionSpill returns a memory.SuperCowOrNoCausetOnExceed for spilling over to disk.
   244  func (c *hashEventContainer) CausetActionSpill() memory.SuperCowOrNoCausetOnExceed {
   245  	return c.rowContainer.CausetActionSpill()
   246  }
   247  
   248  const (
   249  	initialEntrySliceLen = 64
   250  	maxEntrySliceLen     = 8192
   251  )
   252  
   253  type entry struct {
   254  	ptr  chunk.EventPtr
   255  	next *entry
   256  }
   257  
   258  type entryStore struct {
   259  	slices [][]entry
   260  	cursor int
   261  }
   262  
   263  func newEntryStore() *entryStore {
   264  	es := new(entryStore)
   265  	es.slices = [][]entry{make([]entry, initialEntrySliceLen)}
   266  	es.cursor = 0
   267  	return es
   268  }
   269  
   270  func (es *entryStore) GetStore() (e *entry) {
   271  	sliceIdx := uint32(len(es.slices) - 1)
   272  	slice := es.slices[sliceIdx]
   273  	if es.cursor >= cap(slice) {
   274  		size := cap(slice) * 2
   275  		if size >= maxEntrySliceLen {
   276  			size = maxEntrySliceLen
   277  		}
   278  		slice = make([]entry, size)
   279  		es.slices = append(es.slices, slice)
   280  		sliceIdx++
   281  		es.cursor = 0
   282  	}
   283  	e = &es.slices[sliceIdx][es.cursor]
   284  	es.cursor++
   285  	return
   286  }
   287  
   288  type baseHashBlock interface {
   289  	Put(hashKey uint64, rowPtr chunk.EventPtr)
   290  	Get(hashKey uint64) (rowPtrs []chunk.EventPtr)
   291  	Len() uint64
   292  }
   293  
   294  // TODO (fangzhuhe) remove unsafeHashBlock later if it not used anymore
   295  // unsafeHashBlock stores multiple rowPtr of rows for a given key with minimum GC overhead.
   296  // A given key can causetstore multiple values.
   297  // It is not thread-safe, should only be used in one goroutine.
   298  type unsafeHashBlock struct {
   299  	hashMap    map[uint64]*entry
   300  	entryStore *entryStore
   301  	length     uint64
   302  }
   303  
   304  // newUnsafeHashBlock creates a new unsafeHashBlock. estCount means the estimated size of the hashMap.
   305  // If unknown, set it to 0.
   306  func newUnsafeHashBlock(estCount int) *unsafeHashBlock {
   307  	ht := new(unsafeHashBlock)
   308  	ht.hashMap = make(map[uint64]*entry, estCount)
   309  	ht.entryStore = newEntryStore()
   310  	return ht
   311  }
   312  
   313  // Put puts the key/rowPtr pairs to the unsafeHashBlock, multiple rowPtrs are stored in a list.
   314  func (ht *unsafeHashBlock) Put(hashKey uint64, rowPtr chunk.EventPtr) {
   315  	oldEntry := ht.hashMap[hashKey]
   316  	newEntry := ht.entryStore.GetStore()
   317  	newEntry.ptr = rowPtr
   318  	newEntry.next = oldEntry
   319  	ht.hashMap[hashKey] = newEntry
   320  	ht.length++
   321  }
   322  
   323  // Get gets the values of the "key" and appends them to "values".
   324  func (ht *unsafeHashBlock) Get(hashKey uint64) (rowPtrs []chunk.EventPtr) {
   325  	entryAddr := ht.hashMap[hashKey]
   326  	for entryAddr != nil {
   327  		rowPtrs = append(rowPtrs, entryAddr.ptr)
   328  		entryAddr = entryAddr.next
   329  	}
   330  	return
   331  }
   332  
   333  // Len returns the number of rowPtrs in the unsafeHashBlock, the number of keys may be less than Len
   334  // if the same key is put more than once.
   335  func (ht *unsafeHashBlock) Len() uint64 { return ht.length }
   336  
   337  // concurrentMapHashBlock is a concurrent hash causet built on concurrentMap
   338  type concurrentMapHashBlock struct {
   339  	hashMap    concurrentMap
   340  	entryStore *entryStore
   341  	length     uint64
   342  }
   343  
   344  // newConcurrentMapHashBlock creates a concurrentMapHashBlock
   345  func newConcurrentMapHashBlock() *concurrentMapHashBlock {
   346  	ht := new(concurrentMapHashBlock)
   347  	ht.hashMap = newConcurrentMap()
   348  	ht.entryStore = newEntryStore()
   349  	ht.length = 0
   350  	return ht
   351  }
   352  
   353  // Len return the number of rowPtrs in the concurrentMapHashBlock
   354  func (ht *concurrentMapHashBlock) Len() uint64 {
   355  	return ht.length
   356  }
   357  
   358  // Put puts the key/rowPtr pairs to the concurrentMapHashBlock, multiple rowPtrs are stored in a list.
   359  func (ht *concurrentMapHashBlock) Put(hashKey uint64, rowPtr chunk.EventPtr) {
   360  	newEntry := ht.entryStore.GetStore()
   361  	newEntry.ptr = rowPtr
   362  	newEntry.next = nil
   363  	ht.hashMap.Insert(hashKey, newEntry)
   364  	atomic.AddUint64(&ht.length, 1)
   365  }
   366  
   367  // Get gets the values of the "key" and appends them to "values".
   368  func (ht *concurrentMapHashBlock) Get(hashKey uint64) (rowPtrs []chunk.EventPtr) {
   369  	entryAddr, _ := ht.hashMap.Get(hashKey)
   370  	for entryAddr != nil {
   371  		rowPtrs = append(rowPtrs, entryAddr.ptr)
   372  		entryAddr = entryAddr.next
   373  	}
   374  	return
   375  }