github.com/apache/arrow/go/v14@v14.0.2/internal/hashing/xxh3_memo_table.gen.go.tmpl (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package hashing
    18  
    19  import (
    20    "github.com/apache/arrow/go/v14/arrow/bitutil"  
    21    "github.com/apache/arrow/go/v14/internal/utils"  
    22  )
    23  
    24  {{range .In}}
// payload{{.Name}} is the data stored in an occupied hash table slot: the
// {{.name}} value and the memo index that was supplied when it was inserted.
type payload{{.Name}} struct {
	val     {{.name}}
	memoIdx int32
}
    29  
// entry{{.Name}} is one slot of the hash table: the (fixed-up) hash of the
// stored value together with its payload. A slot whose h equals sentinel is
// treated as unoccupied (see Valid).
type entry{{.Name}} struct {
	h       uint64
	payload payload{{.Name}}
}
    34  
    35  func (e entry{{.Name}}) Valid() bool { return e.h != sentinel }
    36  
// {{.Name}}HashTable is a hashtable specifically for {{.name}} that
// is utilized with the MemoTable to generalize interactions for easier
// implementation of dictionaries without losing performance.
type {{.Name}}HashTable struct {
	// cap is the number of slots allocated in entries.
	cap     uint64
	// capMask is cap - 1; it is and-ed with a hash or probe position to
	// obtain a slot index.
	capMask uint64
	// size counts the values inserted via Insert.
	size    uint64

	// entries is the slot storage that lookup probes.
	entries []entry{{.Name}}
}
    47  
    48  // New{{.Name}}HashTable returns a new hash table for {{.name}} values
    49  // initialized with the passed in capacity or 32 whichever is larger.
    50  func New{{.Name}}HashTable(cap uint64) *{{.Name}}HashTable {
    51  	initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
    52  	ret := &{{.Name}}HashTable{cap: initCap, capMask: initCap - 1, size: 0}
    53  	ret.entries = make([]entry{{.Name}}, initCap)
    54  	return ret
    55  }
    56  
    57  // Reset drops all of the values in this hash table and re-initializes it
    58  // with the specified initial capacity as if by calling New, but without having
    59  // to reallocate the object.
    60  func (h *{{.Name}}HashTable) Reset(cap uint64) {
    61  	h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
    62  	h.capMask = h.cap - 1
    63  	h.size = 0
    64  	h.entries = make([]entry{{.Name}}, h.cap)
    65  }
    66  
    67  // CopyValues is used for copying the values out of the hash table into the
    68  // passed in slice, in the order that they were first inserted
    69  func (h *{{.Name}}HashTable) CopyValues(out []{{.name}}) {
    70    h.CopyValuesSubset(0, out)
    71  }
    72  
    73  // CopyValuesSubset copies a subset of the values in the hashtable out, starting
    74  // with the value at start, in the order that they were inserted.
    75  func (h *{{.Name}}HashTable) CopyValuesSubset(start int, out []{{.name}}) {
    76    h.VisitEntries(func(e *entry{{.Name}}) {
    77      idx := e.payload.memoIdx - int32(start)
    78      if idx >= 0 {
    79        out[idx] = e.payload.val
    80      }
    81    })
    82  }
    83  
    84  func (h *{{.Name}}HashTable) WriteOut(out []byte) {
    85    h.WriteOutSubset(0, out)
    86  }
    87  
    88  func (h *{{.Name}}HashTable) WriteOutSubset(start int, out []byte) {
    89    data := arrow.{{.Name}}Traits.CastFromBytes(out)
    90    h.VisitEntries(func(e *entry{{.Name}}) {
    91      idx := e.payload.memoIdx - int32(start)
    92      if idx >= 0 {
    93  {{if and (ne .Name "Int8") (ne .Name "Uint8") -}}    
    94        data[idx] = utils.ToLE{{.Name}}(e.payload.val)
    95  {{else -}}
    96        data[idx] = e.payload.val
    97  {{end -}}
    98      }
    99    })
   100  }
   101  
   102  func (h *{{.Name}}HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
   103  
   104  func ({{.Name}}HashTable) fixHash(v uint64) uint64 {
   105  	if v == sentinel {
   106  		return 42
   107  	}
   108  	return v
   109  }
   110  
   111  // Lookup retrieves the entry for a given hash value assuming it's payload value returns
   112  // true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
   113  // and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
   114  func (h *{{.Name}}HashTable) Lookup(v uint64, cmp func({{.name}}) bool) (*entry{{.Name}}, bool) {
   115  	idx, ok := h.lookup(v, h.capMask, cmp)
   116  	return &h.entries[idx], ok
   117  }
   118  
   119  func (h *{{.Name}}HashTable) lookup(v uint64, szMask uint64, cmp func({{.name}}) bool) (uint64, bool) {
   120  	const perturbShift uint8 = 5
   121  
   122  	var (
   123  		idx     uint64
   124  		perturb uint64
   125  		e       *entry{{.Name}}
   126  	)
   127  
   128  	v = h.fixHash(v)
   129  	idx = v & szMask
   130  	perturb = (v >> uint64(perturbShift)) + 1
   131  
   132  	for {
   133  		e = &h.entries[idx]
   134  		if e.h == v && cmp(e.payload.val) {
   135  			return idx, true
   136  		}
   137  
   138  		if e.h == sentinel {
   139  			return idx, false
   140  		}
   141  
   142  		// perturbation logic inspired from CPython's set/dict object
   143  		// the goal is that all 64 bits of unmasked hash value eventually
   144  		// participate int he probing sequence, to minimize clustering
   145  		idx = (idx + perturb) & szMask
   146  		perturb = (perturb >> uint64(perturbShift)) + 1
   147  	}
   148  }
   149  
   150  func (h *{{.Name}}HashTable) upsize(newcap uint64) error {
   151  	newMask := newcap - 1
   152  
   153  	oldEntries := h.entries
   154  	h.entries = make([]entry{{.Name}}, newcap)
   155  	for _, e := range oldEntries {
   156  		if e.Valid() {
   157  			idx, _ := h.lookup(e.h, newMask, func({{.name}}) bool { return false })
   158  			h.entries[idx] = e
   159  		}
   160  	}
   161  	h.cap = newcap
   162  	h.capMask = newMask
   163  	return nil
   164  }
   165  
   166  // Insert updates the given entry with the provided hash value, payload value and memo index.
   167  // The entry pointer must have been retrieved via lookup in order to actually insert properly.
   168  func (h *{{.Name}}HashTable) Insert(e *entry{{.Name}}, v uint64, val {{.name}}, memoIdx int32) error {
   169  	e.h = h.fixHash(v)
   170  	e.payload.val = val
   171  	e.payload.memoIdx = memoIdx
   172  	h.size++
   173  
   174  	if h.needUpsize() {
   175  		h.upsize(h.cap * uint64(loadFactor) * 2)
   176  	}
   177  	return nil
   178  }
   179  
   180  // VisitEntries will call the passed in function on each *valid* entry in the hash table,
   181  // a valid entry being one which has had a value inserted into it.
   182  func (h *{{.Name}}HashTable) VisitEntries(visit func(*entry{{.Name}})) {
   183  	for _, e := range h.entries {
   184  		if e.Valid() {
   185  			visit(&e)
   186  		}
   187  	}
   188  }
   189  
// {{.Name}}MemoTable is a wrapper over the appropriate hashtable to provide an interface
// conforming to the MemoTable interface defined in the encoding package for general interactions
// regarding dictionaries.
type {{.Name}}MemoTable struct {
  // tbl holds the non-null values and their memo indexes.
  tbl *{{.Name}}HashTable
  // nullIdx is the memo index assigned to null, or KeyNotFound when no null
  // has been inserted.
  nullIdx int32
}
   197  
   198  // New{{.Name}}MemoTable returns a new memotable with num entries pre-allocated to reduce further
   199  // allocations when inserting.
   200  func New{{.Name}}MemoTable(num int64) *{{.Name}}MemoTable {
   201    return &{{.Name}}MemoTable{tbl: New{{.Name}}HashTable(uint64(num)), nullIdx: KeyNotFound}
   202  }
   203  
   204  func ({{.Name}}MemoTable) TypeTraits() TypeTraits {
   205    return arrow.{{.Name}}Traits
   206  }
   207  
   208  // Reset allows this table to be re-used by dumping all the data currently in the table.
   209  func (s *{{.Name}}MemoTable) Reset() {
   210    s.tbl.Reset(32)
   211    s.nullIdx = KeyNotFound
   212  }
   213  
   214  // Size returns the current number of inserted elements into the table including if a null
   215  // has been inserted.
   216  func (s *{{.Name}}MemoTable) Size() int {
   217    sz := int(s.tbl.size)
   218    if _, ok := s.GetNull(); ok {
   219      sz++
   220    }
   221    return sz
   222  }
   223  
   224  // GetNull returns the index of an inserted null or KeyNotFound along with a bool
   225  // that will be true if found and false if not.
   226  func (s *{{.Name}}MemoTable) GetNull() (int, bool) {
   227    return int(s.nullIdx), s.nullIdx != KeyNotFound
   228  }
   229  
   230  // GetOrInsertNull will return the index of the null entry or insert a null entry
   231  // if one currently doesn't exist. The found value will be true if there was already
   232  // a null in the table, and false if it inserted one.
   233  func (s *{{.Name}}MemoTable) GetOrInsertNull() (idx int, found bool) {
   234    idx, found = s.GetNull()
   235    if !found {
   236      idx = s.Size()
   237      s.nullIdx = int32(idx)
   238    }
   239    return
   240  }
   241  
   242  // CopyValues will copy the values from the memo table out into the passed in slice
   243  // which must be of the appropriate type.
   244  func (s *{{.Name}}MemoTable) CopyValues(out interface{}) {
   245    s.CopyValuesSubset(0, out)
   246  }
   247  
   248  // CopyValuesSubset is like CopyValues but only copies a subset of values starting
   249  // at the provided start index
   250  func (s *{{.Name}}MemoTable) CopyValuesSubset(start int, out interface{}) {
   251    s.tbl.CopyValuesSubset(start, out.([]{{.name}}))
   252  }
   253  
   254  func (s *{{.Name}}MemoTable) WriteOut(out []byte) {
   255    s.tbl.CopyValues(arrow.{{.Name}}Traits.CastFromBytes(out))
   256  }
   257  
   258  func (s *{{.Name}}MemoTable) WriteOutSubset(start int, out []byte) {
   259    s.tbl.CopyValuesSubset(start, arrow.{{.Name}}Traits.CastFromBytes(out))
   260  }
   261  
   262  func (s *{{.Name}}MemoTable) WriteOutLE(out []byte) {
   263    s.tbl.WriteOut(out)
   264  }
   265  
   266  func (s *{{.Name}}MemoTable) WriteOutSubsetLE(start int, out []byte) {
   267    s.tbl.WriteOutSubset(start, out)
   268  }
   269  
// Get returns the index of the requested value in the hash table or KeyNotFound
// along with a boolean indicating if it was found or not.
//
// val must hold a {{.name}}; any other dynamic type makes the type assertion
// panic. In the floating point instantiations of this template a NaN argument
// is matched with a NaN predicate and replaced by a single canonical NaN
// before hashing, so every NaN bit pattern resolves to the same entry.
func (s *{{.Name}}MemoTable) Get(val interface{}) (int, bool) {
{{if and (ne .Name "Float32") (ne .Name "Float64") }}
  h := hashInt(uint64(val.({{.name}})), 0)
  if e, ok := s.tbl.Lookup(h, func(v {{.name}}) bool { return val.({{.name}}) == v }); ok {
{{ else -}}
  var cmp func({{.name}}) bool
  {{if eq .Name "Float32"}}
  if math.IsNaN(float64(val.(float32))) {
    cmp = isNan32Cmp
    // use consistent internal bit pattern for NaN regardless of the pattern
    // that is passed to us. NaN is NaN is NaN
    val = float32(math.NaN())
  {{ else -}}
  if math.IsNaN(val.(float64)) {
    cmp = math.IsNaN
    // use consistent internal bit pattern for NaN regardless of the pattern
    // that is passed to us. NaN is NaN is NaN
    val = math.NaN()
  {{end -}}
  } else {
    cmp = func(v {{.name}}) bool { return val.({{.name}}) == v }
  }

  h := hash{{.Name}}(val.({{.name}}), 0)  
  if e, ok := s.tbl.Lookup(h, cmp); ok {
{{ end -}}
    return int(e.payload.memoIdx), ok
  }
  return KeyNotFound, false
}
   302  
   303  // GetOrInsert will return the index of the specified value in the table, or insert the
   304  // value into the table and return the new index. found indicates whether or not it already
   305  // existed in the table (true) or was inserted by this call (false).
   306  func (s *{{.Name}}MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
   307    {{if and (ne .Name "Float32") (ne .Name "Float64") }}
   308    h := hashInt(uint64(val.({{.name}})), 0)
   309    e, ok := s.tbl.Lookup(h, func(v {{.name}}) bool {
   310      return val.({{.name}}) == v
   311    })
   312  {{ else }}  
   313    var cmp func({{.name}}) bool
   314    {{if eq .Name "Float32"}}
   315    if math.IsNaN(float64(val.(float32))) {
   316      cmp = isNan32Cmp
   317      // use consistent internal bit pattern for NaN regardless of the pattern
   318      // that is passed to us. NaN is NaN is NaN
   319      val = float32(math.NaN()) 
   320    {{ else -}}
   321    if math.IsNaN(val.(float64)) {  
   322      cmp = math.IsNaN
   323      // use consistent internal bit pattern for NaN regardless of the pattern
   324      // that is passed to us. NaN is NaN is NaN
   325      val = math.NaN()
   326    {{end -}}
   327    } else {
   328      cmp = func(v {{.name}}) bool { return val.({{.name}}) == v }
   329    }
   330    
   331    h := hash{{.Name}}(val.({{.name}}), 0)
   332    e, ok := s.tbl.Lookup(h, cmp)
   333  {{ end }}
   334    if ok {
   335      idx = int(e.payload.memoIdx)
   336      found = true
   337    } else {
   338      idx = s.Size()
   339      s.tbl.Insert(e, h, val.({{.name}}), int32(idx))
   340    }
   341    return
   342  }
   343  
   344  
// GetOrInsertBytes is unimplemented for this memo table: it ignores val and
// always panics with "unimplemented".
func (s *{{.Name}}MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
    panic("unimplemented")
}
   349  {{end}}