github.com/dgraph-io/sroar@v0.0.0-20220527172339-b92b7eaaf6e0/real_data_test.go (about) 1 // +build real 2 3 package sroar 4 5 import ( 6 "archive/zip" 7 "bytes" 8 "fmt" 9 "io" 10 "os" 11 "path" 12 "strconv" 13 "strings" 14 "testing" 15 16 "github.com/pkg/errors" 17 "github.com/stretchr/testify/require" 18 ) 19 20 // To run these benchmarks: go test -bench BenchmarkRealDataFastOr -run - 21 22 var realDatasets = []string{ 23 "census-income_srt", "census-income", "census1881_srt", "census1881", 24 "dimension_003", "dimension_008", "dimension_033", "uscensus2000", "weather_sept_85_srt", 25 "weather_sept_85", "wikileaks-noquotes_srt", "wikileaks-noquotes", 26 } 27 28 func getDataSetPath(dataset string) (string, error) { 29 gopath, ok := os.LookupEnv("GOPATH") 30 if !ok { 31 return "", fmt.Errorf("GOPATH not set. It's required to locate real-roaring-dataset.") 32 } 33 34 basePath := path.Join(gopath, "src", "github.com", "RoaringBitmap", "real-roaring-datasets") 35 if _, err := os.Stat(basePath); os.IsNotExist(err) { 36 return "", fmt.Errorf("real-roaring-datasets does not exist. " + 37 "Run `go get github.com/RoaringBitmap/real-roaring-datasets`") 38 } 39 40 datasetPath := path.Join(basePath, dataset+".zip") 41 if _, err := os.Stat(datasetPath); os.IsNotExist(err) { 42 return "", fmt.Errorf("dataset %s does not exist, tried path: %s", 43 dataset, datasetPath) 44 } 45 return datasetPath, nil 46 } 47 48 func retrieveRealDataBitmaps(datasetName string, optimize bool) ([]*Bitmap, error) { 49 datasetPath, err := getDataSetPath(datasetName) 50 zipFile, err := zip.OpenReader(datasetPath) 51 if err != nil { 52 return nil, fmt.Errorf("error opening dataset %s zipfile, cause: %v", datasetPath, err) 53 } 54 defer zipFile.Close() 55 56 bitmaps := make([]*Bitmap, len(zipFile.File)) 57 for i, f := range zipFile.File { 58 res, err := processZipFile(f) 59 if err != nil { 60 return nil, errors.Wrap(err, "while processing zip file") 61 } 62 b := NewBitmap() 63 for _, v := range res { 64 b.Set(v) 65 } 66 bitmaps[i] = b 67 } 68 69 return bitmaps, nil 70 } 71 72 func processZipFile(f *zip.File) ([]uint64, error) { 73 r, err := f.Open() 74 if err != nil { 75 return nil, fmt.Errorf("failed to read bitmap file %s, cause: %v", 76 f.Name, err) 77 } 78 79 buf := make([]byte, f.UncompressedSize) 80 var bufStep uint64 = 32768 // apparently the largest buffer zip can read 81 var totalReadBytes uint64 82 83 for { 84 var endOffset uint64 85 if f.UncompressedSize64 < totalReadBytes+bufStep { 86 endOffset = f.UncompressedSize64 87 } else { 88 endOffset = totalReadBytes + bufStep 89 } 90 91 readBytes, err := r.Read(buf[totalReadBytes:endOffset]) 92 totalReadBytes += uint64(readBytes) 93 94 if err == io.EOF { 95 r.Close() 96 break 97 } else if err != nil { 98 r.Close() 99 return nil, fmt.Errorf("could not read content of file %s , err: %v", 100 f.Name, err) 101 } 102 } 103 104 elemsAsBytes := bytes.Split(buf[:totalReadBytes], []byte{44}) // 44 is a comma 105 106 var result []uint64 107 for _, elemBytes := range elemsAsBytes { 108 elemStr := strings.TrimSpace(string(elemBytes)) 109 110 e, err := strconv.ParseUint(elemStr, 10, 32) 111 if err != nil { 112 r.Close() 113 return nil, fmt.Errorf("could not parse %s as uint32. Reading %s, err: %v", 114 elemStr, f.Name, err) 115 } 116 result = append(result, e) 117 } 118 return result, nil 119 } 120 121 func benchmarkRealDataAggregate(b *testing.B, aggregator func(b []*Bitmap) int) { 122 for _, dataset := range realDatasets { 123 once := false 124 b.Run(dataset, func(b *testing.B) { 125 bitmaps, err := retrieveRealDataBitmaps(dataset, true) 126 if err != nil { 127 b.Fatal(err) 128 } 129 if once { 130 c := aggregator(bitmaps) 131 b.Logf("Dataset: %s Got cardinality: %d\n", dataset, c) 132 once = false 133 } 134 b.ResetTimer() 135 for i := 0; i < b.N; i++ { 136 aggregator(bitmaps) 137 } 138 }) 139 } 140 } 141 142 func BenchmarkRealDataFastOr(b *testing.B) { 143 benchmarkRealDataAggregate(b, func(bitmaps []*Bitmap) int { 144 return FastOr(bitmaps...).GetCardinality() 145 }) 146 } 147 func BenchmarkRealDataFastParOr(b *testing.B) { 148 benchmarkRealDataAggregate(b, func(bitmaps []*Bitmap) int { 149 return FastParOr(4, bitmaps...).GetCardinality() 150 }) 151 } 152 153 func BenchmarkRealDataFastAnd(b *testing.B) { 154 benchmarkRealDataAggregate(b, func(bitmaps []*Bitmap) int { 155 return FastAnd(bitmaps...).GetCardinality() 156 }) 157 } 158 159 func TestOrRealData(t *testing.T) { 160 test := func(t *testing.T, dataset string) { 161 path, err := getDataSetPath(dataset) 162 require.NoError(t, err) 163 164 zipFile, err := zip.OpenReader(path) 165 require.NoError(t, err) 166 defer zipFile.Close() 167 168 bitmaps := make([]*Bitmap, len(zipFile.File)) 169 valMap := make(map[uint64]struct{}) 170 171 res2 := NewBitmap() 172 // For each file in the zip, create a new bitmap and check the created bitmap has correct 173 // cardinality as well as it has all the elements. 174 for i, f := range zipFile.File { 175 vals, err := processZipFile(f) 176 require.NoError(t, err) 177 178 b := NewBitmap() 179 for _, v := range vals { 180 b.Set(v) 181 res2.Set(v) 182 valMap[v] = struct{}{} 183 } 184 require.Equal(t, len(vals), b.GetCardinality()) 185 for _, v := range vals { 186 require.True(t, b.Contains(v)) 187 } 188 bitmaps[i] = b 189 } 190 191 // Check that union operation is correct. 192 res := FastOr(bitmaps...) 193 194 t.Logf("Result: %s\n", res) 195 require.Equal(t, len(valMap), res.GetCardinality()) 196 require.Equal(t, len(valMap), res2.GetCardinality()) 197 198 for k := range valMap { 199 require.True(t, res.Contains(k)) 200 require.True(t, res2.Contains(k)) 201 } 202 } 203 204 for _, dataset := range realDatasets { 205 t.Run(dataset, func(t *testing.T) { test(t, dataset) }) 206 } 207 }