github.com/coyove/sdss@v0.0.0-20231129015646-c2ec58cca6a2/contrib/bitmap/bitmap_test.go (about)

     1  package bitmap
     2  
     3  import (
     4  	"bufio"
     5  	"bytes"
     6  	"encoding/csv"
     7  	"fmt"
     8  	"io/ioutil"
     9  	"log"
    10  	"math/rand"
    11  	"os"
    12  	"regexp"
    13  	"runtime"
    14  	"runtime/pprof"
    15  	"sort"
    16  	"strings"
    17  	"sync"
    18  	"testing"
    19  	"time"
    20  
    21  	"github.com/coyove/sdss/contrib/clock"
    22  	"github.com/coyove/sdss/contrib/ngram"
    23  	"github.com/coyove/sdss/contrib/roaring"
    24  )
    25  
    26  const day = 86400
    27  
    28  func lineOf(path string, ln []int) (res []string) {
    29  	sort.Ints(ln)
    30  
    31  	f, _ := os.Open(path)
    32  	defer f.Close()
    33  	rd := bufio.NewReader(f)
    34  	for i := 0; len(ln) > 0; i++ {
    35  		line, err := rd.ReadString('\n')
    36  		if err != nil {
    37  			break
    38  		}
    39  		if i == ln[0] {
    40  		AGAIN:
    41  			res = append(res, strings.TrimSpace(line))
    42  			ln = ln[1:]
    43  			if len(ln) > 0 && ln[0] == i {
    44  				goto AGAIN
    45  			}
    46  		}
    47  	}
    48  	return
    49  }
    50  
    51  func TestBitmap2(t *testing.T) {
    52  	runtime.GOMAXPROCS(2)
    53  	now := clock.Unix() / day * day
    54  	rand.Seed(now)
    55  
    56  	if false {
    57  		rand.Seed(clock.Unix())
    58  		m := roaring.New()
    59  		m2 := roaring.New()
    60  		ref := map[uint32][]uint32{}
    61  		const N = 1e6
    62  		const BF = 3
    63  		for i := 0; i < N; i++ {
    64  		AGAIN:
    65  			x := rand.Uint32()
    66  			if len(ref[x]) > 0 {
    67  				goto AGAIN
    68  			}
    69  			h := h16(x, 0)
    70  			for j := 0; j < 32; j += rand.Intn(2) + 1 {
    71  				y := uint32(j)*128 + rand.Uint32()%128
    72  				for bf := 0; bf < BF; bf++ {
    73  					m.Add(h[bf]&0xfffff000 | (y & 0xfff))
    74  					// m2.Add(h[bf]&0xfffffc00 | (y & 0x3ff))
    75  				}
    76  				ref[x] = append(ref[x], y)
    77  			}
    78  		}
    79  		ys0, total, overflows, total2 := 0, 0, map[int]int{}, 0
    80  		for x, ys := range ref {
    81  			h := h16(x, 0)
    82  			tmp := []*roaring.Bitmap{roaring.New(), roaring.New(), roaring.New(), roaring.New()}[:BF]
    83  			for i := 0; i < BF; i++ {
    84  				z := h[i] & 0xfffff000
    85  				iter := m.Iterator().(*roaring.IntIterator)
    86  				iter.Seek(z)
    87  				for first := true; iter.HasNext(); first = false {
    88  					v := iter.Next()
    89  					if v&0xfffff000 == z {
    90  						tmp[i].Add(v & 0xfff)
    91  					} else {
    92  						if first {
    93  							panic(fmt.Sprintf("%x %x", v, z))
    94  						}
    95  						break
    96  					}
    97  				}
    98  			}
    99  			for i := 1; i < BF; i++ {
   100  				tmp[0].And(tmp[i])
   101  			}
   102  			ys0 += len(ys)
   103  			total += int(tmp[0].GetCardinality())
   104  			for _, y := range ys {
   105  				if !tmp[0].Contains(y) {
   106  					fmt.Println(ys)
   107  					panic(y)
   108  				}
   109  				tmp[0].Remove(y)
   110  			}
   111  			overflows[int(tmp[0].GetCardinality())]++
   112  		}
   113  		// for x := range ref {
   114  		// 	h := h16(x, 0)
   115  		// 	tmp := [2]*roaring.Bitmap{roaring.New(), roaring.New()}
   116  		// 	for i := 0; i < 2; i++ {
   117  		// 		z := h[i] & 0xfffffc00
   118  		// 		iter := m2.Iterator().(*roaring.IntIterator)
   119  		// 		iter.Seek(z)
   120  		// 		for iter.HasNext() {
   121  		// 			if v := iter.Next(); v&0xfffffc00 == z {
   122  		// 				tmp[i].Add(v & 0x3ff)
   123  		// 			} else {
   124  		// 				break
   125  		// 			}
   126  		// 		}
   127  		// 	}
   128  		// 	tmp[0].And(tmp[1])
   129  		// 	total2 += int(tmp[0].GetCardinality())
   130  		// }
   131  		fmt.Println(ys0, total, m.GetCardinality())
   132  		fmt.Println(ys0, total2, m2.GetCardinality())
   133  
   134  		a := make([]int, 10000)
   135  		for k, n := range overflows {
   136  			a[k] = n
   137  		}
   138  		tot := 0
   139  		for i, a := range a {
   140  			tot += a
   141  			if tot >= int(N*0.99) {
   142  				fmt.Println("p99 at", i)
   143  				break
   144  			}
   145  		}
   146  		return
   147  	}
   148  
   149  	b := New(now)
   150  	cached, err := ioutil.ReadFile("cache")
   151  	if len(cached) > 0 {
   152  		b, err = Unmarshal(bytes.NewReader(cached))
   153  	}
   154  	fmt.Println(err)
   155  
   156  	ba := b.AggregateSaves(func(b *Range) error {
   157  		_, err := b.Save("cache", false)
   158  		fmt.Println("save", err)
   159  		return err
   160  	})
   161  
   162  	path := os.Getenv("HOME") + "/dataset/dataset/full_dataset.csv"
   163  	f, _ := os.Open(path)
   164  	defer f.Close()
   165  
   166  	rd := csv.NewReader(f)
   167  	for i := 0; true && i < 10000; i++ {
   168  		records, err := rd.Read()
   169  		if err != nil {
   170  			break
   171  		}
   172  
   173  		line := strings.Join(records, " ")
   174  		hs := []uint64{}
   175  		for k := range ngram.Split(string(line)) {
   176  			hs = append(hs, ngram.StrHash(k))
   177  		}
   178  		hs = append(hs, uint64(i))
   179  		ba.AddAsync(Uint64Key(uint64(i)), hs)
   180  
   181  		if i%1000 == 0 {
   182  			log.Println(i)
   183  		}
   184  	}
   185  	ba.Close()
   186  
   187  	fmt.Println(len(b.MarshalBinary(true)), b)
   188  	// b.Save("cache")
   189  
   190  	gs := ngram.Split("chinese")
   191  	if false {
   192  		gs = ngram.Split(`kernel corn"", ""1/2 pkg. (approximately 20) saltine crackers, crushed"", ""1 egg, beaten"", ""6 tsp. butter, divided"", ""pepper to taste""]","[""Mix
   193   together both cans of corn, crackers, egg, 2 teaspoons of melted butter and pepper and place in a buttered baking dish."", ""Dot with remaining 4 teaspoons of butter."", ""Bake at 350\u00b0 for 1 hour.""]",www.
   194  cookbooks.com/Recipe-Details.aspx?id=876969,Gathered,"[""cream-style corn"", ""whole kernel corn"", ""crackers"", ""egg"", ""butter"", ""pepper""]" `)
   195  	}
   196  	var q []uint64
   197  	for k := range gs {
   198  		q = append(q, ngram.StrHash(k))
   199  		fmt.Println(k, "==>", ngram.StrHash(k))
   200  		if len(q) > 32 {
   201  			break
   202  		}
   203  	}
   204  
   205  	{
   206  		f, _ := os.Create("cpuprofile")
   207  		defer f.Close()
   208  		pprof.StartCPUProfile(f)
   209  		defer pprof.StopCPUProfile()
   210  	}
   211  
   212  	start := time.Now()
   213  	var results []KeyIdScore
   214  	wg := sync.WaitGroup{}
   215  	for i := 0; i < 1; i++ {
   216  		wg.Add(1)
   217  		go func() {
   218  			defer wg.Done()
   219  			// results = b.Join(q, nil, 1670192109, 50, JoinMajor)
   220  			var tmp []KeyIdScore
   221  			fmt.Println(b.Join(Values{Major: q /* Oneof: []uint64{types.StrHash("cream")} */}, b.Start(), false, func(kis KeyIdScore) bool {
   222  				tmp = append(tmp, kis)
   223  				return len(tmp) < 50
   224  			}))
   225  			results = tmp
   226  		}()
   227  	}
   228  	wg.Wait()
   229  	fmt.Println((results), time.Since(start))
   230  	hits := 0
   231  
   232  	sort.Slice(results, func(i, j int) bool { return results[i].Key.Less(results[j].Key) })
   233  	lineNums := []int{}
   234  	for _, res := range results {
   235  		lineNums = append(lineNums, int(res.Key.LowUint64()))
   236  	}
   237  	lines := lineOf(path, lineNums)
   238  	for i, line := range lines {
   239  		s := 0
   240  		for _, v := range gs {
   241  			if m, _ := regexp.MatchString("(?i)"+v.Raw, line); m {
   242  				s++
   243  			}
   244  		}
   245  		if s >= len(gs)/2 {
   246  			fmt.Println(results[i].Key.LowUint64(), results[i].Id, s) // line)
   247  			_ = i
   248  			hits++
   249  		}
   250  	}
   251  	fmt.Println(time.Since(start), hits, len(lines))
   252  }
   253  
   254  func TestCollision(t *testing.T) {
   255  	tot := 0
   256  	m2 := New(0)
   257  	for i := 0; i < 1e6; i++ {
   258  		m := roaring.New()
   259  		var v []uint64
   260  		for i := 0; i < 16; i++ {
   261  			x := rand.Uint64()&0xfffff0 + uint64(i)
   262  			m.Add(uint32(x))
   263  			v = append(v, x)
   264  		}
   265  		m2.Add(Uint64Key(uint64(i)), v)
   266  		tot += int(m.GetSerializedSizeInBytes())
   267  	}
   268  	fmt.Println(tot, len(m2.MarshalBinary(false)), m2.fastTable.GetSerializedSizeInBytes())
   269  	return
   270  
   271  	rand.Seed(clock.Unix())
   272  	x := []uint64{}
   273  	for i := 0; i < 1e3; i++ {
   274  		v := rand.Uint64()
   275  		x = append(x, v)
   276  	}
   277  	xf, vs := xfBuild(xfNew(x))
   278  	for i := 0; ; i++ {
   279  		if xfContains(xf, vs, rand.Uint64()) {
   280  			panic(i)
   281  		}
   282  	}
   283  }
   284  
   285  func BenchmarkXorSmall(b *testing.B) {
   286  	var x []uint64
   287  	rand.Seed(clock.Unix())
   288  	for i := 0; i < 5; i++ {
   289  		v := rand.Uint64()
   290  		x = append(x, v)
   291  	}
   292  	zzz := xfNew(x)
   293  	fmt.Println(len(zzz))
   294  	for i := 0; i < b.N; i++ {
   295  		x, vs := xfBuild(zzz)
   296  		if !xfContains(x, vs, uint64(vs[len(vs)-1])) {
   297  			b.FailNow()
   298  		}
   299  	}
   300  }
   301  
   302  // func BenchmarkContainsBrute(b *testing.B) {
   303  // 	var x []uint64
   304  // 	rand.Seed(clock.Unix())
   305  // 	for i := 0; i < 6; i++ {
   306  // 		v := rand.Uint64()
   307  // 		x = append(x, v)
   308  // 	}
   309  // 	n := rand.Uint64()
   310  // 	for i := 0; i < b.N; i++ {
   311  // 		for _, v0 := range x {
   312  // 			if v0 == n {
   313  // 				break
   314  // 			}
   315  // 		}
   316  // 	}
   317  // }
   318  //
   319  // func BenchmarkContainsXor(b *testing.B) {
   320  // 	var x []uint64
   321  // 	rand.Seed(clock.Unix())
   322  // 	for i := 0; i < 6; i++ {
   323  // 		v := rand.Uint64()
   324  // 		x = append(x, v)
   325  // 	}
   326  // 	n := rand.Uint64()
   327  // 	xf, _ := xorfilter.Populate(x)
   328  // 	for i := 0; i < b.N; i++ {
   329  // 		if xf.Contains(n) {
   330  // 			break
   331  // 		}
   332  // 	}
   333  // }
   334  //
   335  // func BenchmarkContainsBinary(b *testing.B) {
   336  // 	var x []int
   337  // 	rand.Seed(clock.Unix())
   338  // 	for i := 0; i < 6; i++ {
   339  // 		v := rand.Uint64()
   340  // 		x = append(x, int(v))
   341  // 	}
   342  // 	n := int(rand.Uint64())
   343  // 	for i := 0; i < b.N; i++ {
   344  // 		sort.SearchInts(x, n)
   345  // 	}
   346  // }
   347  
   348  func TestManager(t *testing.T) {
   349  	m, _ := NewManager("mgr", 10, NewLRUCache(1e6))
   350  	m.DirMaxFiles = 10
   351  	for i := 0; i < 1e3; i++ {
   352  		m.Saver().AddAsync(Uint64Key(uint64(i)), []uint64{uint64(i)})
   353  		time.Sleep(time.Millisecond * 10)
   354  	}
   355  	time.Sleep(time.Second)
   356  	m.WalkDesc(clock.UnixNano()/1e6, func(m *Range) bool {
   357  		if m != nil {
   358  			fmt.Println(m.String())
   359  		}
   360  		return true
   361  	})
   362  }
   363  
   364  func TestJump(t *testing.T) {
   365  
   366  }