github.com/coyove/sdss@v0.0.0-20231129015646-c2ec58cca6a2/contrib/bitmap/bitmap_test.go (about) 1 package bitmap 2 3 import ( 4 "bufio" 5 "bytes" 6 "encoding/csv" 7 "fmt" 8 "io/ioutil" 9 "log" 10 "math/rand" 11 "os" 12 "regexp" 13 "runtime" 14 "runtime/pprof" 15 "sort" 16 "strings" 17 "sync" 18 "testing" 19 "time" 20 21 "github.com/coyove/sdss/contrib/clock" 22 "github.com/coyove/sdss/contrib/ngram" 23 "github.com/coyove/sdss/contrib/roaring" 24 ) 25 26 const day = 86400 27 28 func lineOf(path string, ln []int) (res []string) { 29 sort.Ints(ln) 30 31 f, _ := os.Open(path) 32 defer f.Close() 33 rd := bufio.NewReader(f) 34 for i := 0; len(ln) > 0; i++ { 35 line, err := rd.ReadString('\n') 36 if err != nil { 37 break 38 } 39 if i == ln[0] { 40 AGAIN: 41 res = append(res, strings.TrimSpace(line)) 42 ln = ln[1:] 43 if len(ln) > 0 && ln[0] == i { 44 goto AGAIN 45 } 46 } 47 } 48 return 49 } 50 51 func TestBitmap2(t *testing.T) { 52 runtime.GOMAXPROCS(2) 53 now := clock.Unix() / day * day 54 rand.Seed(now) 55 56 if false { 57 rand.Seed(clock.Unix()) 58 m := roaring.New() 59 m2 := roaring.New() 60 ref := map[uint32][]uint32{} 61 const N = 1e6 62 const BF = 3 63 for i := 0; i < N; i++ { 64 AGAIN: 65 x := rand.Uint32() 66 if len(ref[x]) > 0 { 67 goto AGAIN 68 } 69 h := h16(x, 0) 70 for j := 0; j < 32; j += rand.Intn(2) + 1 { 71 y := uint32(j)*128 + rand.Uint32()%128 72 for bf := 0; bf < BF; bf++ { 73 m.Add(h[bf]&0xfffff000 | (y & 0xfff)) 74 // m2.Add(h[bf]&0xfffffc00 | (y & 0x3ff)) 75 } 76 ref[x] = append(ref[x], y) 77 } 78 } 79 ys0, total, overflows, total2 := 0, 0, map[int]int{}, 0 80 for x, ys := range ref { 81 h := h16(x, 0) 82 tmp := []*roaring.Bitmap{roaring.New(), roaring.New(), roaring.New(), roaring.New()}[:BF] 83 for i := 0; i < BF; i++ { 84 z := h[i] & 0xfffff000 85 iter := m.Iterator().(*roaring.IntIterator) 86 iter.Seek(z) 87 for first := true; iter.HasNext(); first = false { 88 v := iter.Next() 89 if v&0xfffff000 == z { 90 tmp[i].Add(v & 0xfff) 91 } else { 92 if first { 93 panic(fmt.Sprintf("%x %x", v, z)) 94 } 95 break 96 } 97 } 98 } 99 for i := 1; i < BF; i++ { 100 tmp[0].And(tmp[i]) 101 } 102 ys0 += len(ys) 103 total += int(tmp[0].GetCardinality()) 104 for _, y := range ys { 105 if !tmp[0].Contains(y) { 106 fmt.Println(ys) 107 panic(y) 108 } 109 tmp[0].Remove(y) 110 } 111 overflows[int(tmp[0].GetCardinality())]++ 112 } 113 // for x := range ref { 114 // h := h16(x, 0) 115 // tmp := [2]*roaring.Bitmap{roaring.New(), roaring.New()} 116 // for i := 0; i < 2; i++ { 117 // z := h[i] & 0xfffffc00 118 // iter := m2.Iterator().(*roaring.IntIterator) 119 // iter.Seek(z) 120 // for iter.HasNext() { 121 // if v := iter.Next(); v&0xfffffc00 == z { 122 // tmp[i].Add(v & 0x3ff) 123 // } else { 124 // break 125 // } 126 // } 127 // } 128 // tmp[0].And(tmp[1]) 129 // total2 += int(tmp[0].GetCardinality()) 130 // } 131 fmt.Println(ys0, total, m.GetCardinality()) 132 fmt.Println(ys0, total2, m2.GetCardinality()) 133 134 a := make([]int, 10000) 135 for k, n := range overflows { 136 a[k] = n 137 } 138 tot := 0 139 for i, a := range a { 140 tot += a 141 if tot >= int(N*0.99) { 142 fmt.Println("p99 at", i) 143 break 144 } 145 } 146 return 147 } 148 149 b := New(now) 150 cached, err := ioutil.ReadFile("cache") 151 if len(cached) > 0 { 152 b, err = Unmarshal(bytes.NewReader(cached)) 153 } 154 fmt.Println(err) 155 156 ba := b.AggregateSaves(func(b *Range) error { 157 _, err := b.Save("cache", false) 158 fmt.Println("save", err) 159 return err 160 }) 161 162 path := os.Getenv("HOME") + "/dataset/dataset/full_dataset.csv" 163 f, _ := os.Open(path) 164 defer f.Close() 165 166 rd := csv.NewReader(f) 167 for i := 0; true && i < 10000; i++ { 168 records, err := rd.Read() 169 if err != nil { 170 break 171 } 172 173 line := strings.Join(records, " ") 174 hs := []uint64{} 175 for k := range ngram.Split(string(line)) { 176 hs = append(hs, ngram.StrHash(k)) 177 } 178 hs = append(hs, uint64(i)) 179 ba.AddAsync(Uint64Key(uint64(i)), hs) 180 181 if i%1000 == 0 { 182 log.Println(i) 183 } 184 } 185 ba.Close() 186 187 fmt.Println(len(b.MarshalBinary(true)), b) 188 // b.Save("cache") 189 190 gs := ngram.Split("chinese") 191 if false { 192 gs = ngram.Split(`kernel corn"", ""1/2 pkg. (approximately 20) saltine crackers, crushed"", ""1 egg, beaten"", ""6 tsp. butter, divided"", ""pepper to taste""]","[""Mix 193 together both cans of corn, crackers, egg, 2 teaspoons of melted butter and pepper and place in a buttered baking dish."", ""Dot with remaining 4 teaspoons of butter."", ""Bake at 350\u00b0 for 1 hour.""]",www. 194 cookbooks.com/Recipe-Details.aspx?id=876969,Gathered,"[""cream-style corn"", ""whole kernel corn"", ""crackers"", ""egg"", ""butter"", ""pepper""]" `) 195 } 196 var q []uint64 197 for k := range gs { 198 q = append(q, ngram.StrHash(k)) 199 fmt.Println(k, "==>", ngram.StrHash(k)) 200 if len(q) > 32 { 201 break 202 } 203 } 204 205 { 206 f, _ := os.Create("cpuprofile") 207 defer f.Close() 208 pprof.StartCPUProfile(f) 209 defer pprof.StopCPUProfile() 210 } 211 212 start := time.Now() 213 var results []KeyIdScore 214 wg := sync.WaitGroup{} 215 for i := 0; i < 1; i++ { 216 wg.Add(1) 217 go func() { 218 defer wg.Done() 219 // results = b.Join(q, nil, 1670192109, 50, JoinMajor) 220 var tmp []KeyIdScore 221 fmt.Println(b.Join(Values{Major: q /* Oneof: []uint64{types.StrHash("cream")} */}, b.Start(), false, func(kis KeyIdScore) bool { 222 tmp = append(tmp, kis) 223 return len(tmp) < 50 224 })) 225 results = tmp 226 }() 227 } 228 wg.Wait() 229 fmt.Println((results), time.Since(start)) 230 hits := 0 231 232 sort.Slice(results, func(i, j int) bool { return results[i].Key.Less(results[j].Key) }) 233 lineNums := []int{} 234 for _, res := range results { 235 lineNums = append(lineNums, int(res.Key.LowUint64())) 236 } 237 lines := lineOf(path, lineNums) 238 for i, line := range lines { 239 s := 0 240 for _, v := range gs { 241 if m, _ := regexp.MatchString("(?i)"+v.Raw, line); m { 242 s++ 243 } 244 } 245 if s >= len(gs)/2 { 246 fmt.Println(results[i].Key.LowUint64(), results[i].Id, s) // line) 247 _ = i 248 hits++ 249 } 250 } 251 fmt.Println(time.Since(start), hits, len(lines)) 252 } 253 254 func TestCollision(t *testing.T) { 255 tot := 0 256 m2 := New(0) 257 for i := 0; i < 1e6; i++ { 258 m := roaring.New() 259 var v []uint64 260 for i := 0; i < 16; i++ { 261 x := rand.Uint64()&0xfffff0 + uint64(i) 262 m.Add(uint32(x)) 263 v = append(v, x) 264 } 265 m2.Add(Uint64Key(uint64(i)), v) 266 tot += int(m.GetSerializedSizeInBytes()) 267 } 268 fmt.Println(tot, len(m2.MarshalBinary(false)), m2.fastTable.GetSerializedSizeInBytes()) 269 return 270 271 rand.Seed(clock.Unix()) 272 x := []uint64{} 273 for i := 0; i < 1e3; i++ { 274 v := rand.Uint64() 275 x = append(x, v) 276 } 277 xf, vs := xfBuild(xfNew(x)) 278 for i := 0; ; i++ { 279 if xfContains(xf, vs, rand.Uint64()) { 280 panic(i) 281 } 282 } 283 } 284 285 func BenchmarkXorSmall(b *testing.B) { 286 var x []uint64 287 rand.Seed(clock.Unix()) 288 for i := 0; i < 5; i++ { 289 v := rand.Uint64() 290 x = append(x, v) 291 } 292 zzz := xfNew(x) 293 fmt.Println(len(zzz)) 294 for i := 0; i < b.N; i++ { 295 x, vs := xfBuild(zzz) 296 if !xfContains(x, vs, uint64(vs[len(vs)-1])) { 297 b.FailNow() 298 } 299 } 300 } 301 302 // func BenchmarkContainsBrute(b *testing.B) { 303 // var x []uint64 304 // rand.Seed(clock.Unix()) 305 // for i := 0; i < 6; i++ { 306 // v := rand.Uint64() 307 // x = append(x, v) 308 // } 309 // n := rand.Uint64() 310 // for i := 0; i < b.N; i++ { 311 // for _, v0 := range x { 312 // if v0 == n { 313 // break 314 // } 315 // } 316 // } 317 // } 318 // 319 // func BenchmarkContainsXor(b *testing.B) { 320 // var x []uint64 321 // rand.Seed(clock.Unix()) 322 // for i := 0; i < 6; i++ { 323 // v := rand.Uint64() 324 // x = append(x, v) 325 // } 326 // n := rand.Uint64() 327 // xf, _ := xorfilter.Populate(x) 328 // for i := 0; i < b.N; i++ { 329 // if xf.Contains(n) { 330 // break 331 // } 332 // } 333 // } 334 // 335 // func BenchmarkContainsBinary(b *testing.B) { 336 // var x []int 337 // rand.Seed(clock.Unix()) 338 // for i := 0; i < 6; i++ { 339 // v := rand.Uint64() 340 // x = append(x, int(v)) 341 // } 342 // n := int(rand.Uint64()) 343 // for i := 0; i < b.N; i++ { 344 // sort.SearchInts(x, n) 345 // } 346 // } 347 348 func TestManager(t *testing.T) { 349 m, _ := NewManager("mgr", 10, NewLRUCache(1e6)) 350 m.DirMaxFiles = 10 351 for i := 0; i < 1e3; i++ { 352 m.Saver().AddAsync(Uint64Key(uint64(i)), []uint64{uint64(i)}) 353 time.Sleep(time.Millisecond * 10) 354 } 355 time.Sleep(time.Second) 356 m.WalkDesc(clock.UnixNano()/1e6, func(m *Range) bool { 357 if m != nil { 358 fmt.Println(m.String()) 359 } 360 return true 361 }) 362 } 363 364 func TestJump(t *testing.T) { 365 366 }