github.com/outcaste-io/sroar@v0.0.0-20221229172112-1fb64f14314c/real_data_test.go (about)

     1  // +build real
     2  
     3  package sroar
     4  
     5  import (
     6  	"archive/zip"
     7  	"bytes"
     8  	"fmt"
     9  	"io"
    10  	"os"
    11  	"path"
    12  	"strconv"
    13  	"strings"
    14  	"testing"
    15  
    16  	"github.com/pkg/errors"
    17  	"github.com/stretchr/testify/require"
    18  )
    19  
// To run these benchmarks (the file is guarded by the "real" build tag):
// go test -tags real -bench BenchmarkRealDataFastOr -run -
    21  
    22  var realDatasets = []string{
    23  	"census-income_srt", "census-income", "census1881_srt", "census1881",
    24  	"dimension_003", "dimension_008", "dimension_033", "uscensus2000", "weather_sept_85_srt",
    25  	"weather_sept_85", "wikileaks-noquotes_srt", "wikileaks-noquotes",
    26  }
    27  
    28  func getDataSetPath(dataset string) (string, error) {
    29  	gopath, ok := os.LookupEnv("GOPATH")
    30  	if !ok {
    31  		return "", fmt.Errorf("GOPATH not set. It's required to locate real-roaring-dataset.")
    32  	}
    33  
    34  	basePath := path.Join(gopath, "src", "github.com", "RoaringBitmap", "real-roaring-datasets")
    35  	if _, err := os.Stat(basePath); os.IsNotExist(err) {
    36  		return "", fmt.Errorf("real-roaring-datasets does not exist. " +
    37  			"Run `go get github.com/RoaringBitmap/real-roaring-datasets`")
    38  	}
    39  
    40  	datasetPath := path.Join(basePath, dataset+".zip")
    41  	if _, err := os.Stat(datasetPath); os.IsNotExist(err) {
    42  		return "", fmt.Errorf("dataset %s does not exist, tried path: %s",
    43  			dataset, datasetPath)
    44  	}
    45  	return datasetPath, nil
    46  }
    47  
    48  func retrieveRealDataBitmaps(datasetName string, optimize bool) ([]*Bitmap, error) {
    49  	datasetPath, err := getDataSetPath(datasetName)
    50  	zipFile, err := zip.OpenReader(datasetPath)
    51  	if err != nil {
    52  		return nil, fmt.Errorf("error opening dataset %s zipfile, cause: %v", datasetPath, err)
    53  	}
    54  	defer zipFile.Close()
    55  
    56  	bitmaps := make([]*Bitmap, len(zipFile.File))
    57  	for i, f := range zipFile.File {
    58  		res, err := processZipFile(f)
    59  		if err != nil {
    60  			return nil, errors.Wrap(err, "while processing zip file")
    61  		}
    62  		b := NewBitmap()
    63  		for _, v := range res {
    64  			b.Set(v)
    65  		}
    66  		bitmaps[i] = b
    67  	}
    68  
    69  	return bitmaps, nil
    70  }
    71  
    72  func processZipFile(f *zip.File) ([]uint64, error) {
    73  	r, err := f.Open()
    74  	if err != nil {
    75  		return nil, fmt.Errorf("failed to read bitmap file %s, cause: %v",
    76  			f.Name, err)
    77  	}
    78  
    79  	buf := make([]byte, f.UncompressedSize)
    80  	var bufStep uint64 = 32768 // apparently the largest buffer zip can read
    81  	var totalReadBytes uint64
    82  
    83  	for {
    84  		var endOffset uint64
    85  		if f.UncompressedSize64 < totalReadBytes+bufStep {
    86  			endOffset = f.UncompressedSize64
    87  		} else {
    88  			endOffset = totalReadBytes + bufStep
    89  		}
    90  
    91  		readBytes, err := r.Read(buf[totalReadBytes:endOffset])
    92  		totalReadBytes += uint64(readBytes)
    93  
    94  		if err == io.EOF {
    95  			r.Close()
    96  			break
    97  		} else if err != nil {
    98  			r.Close()
    99  			return nil, fmt.Errorf("could not read content of file %s , err: %v",
   100  				f.Name, err)
   101  		}
   102  	}
   103  
   104  	elemsAsBytes := bytes.Split(buf[:totalReadBytes], []byte{44}) // 44 is a comma
   105  
   106  	var result []uint64
   107  	for _, elemBytes := range elemsAsBytes {
   108  		elemStr := strings.TrimSpace(string(elemBytes))
   109  
   110  		e, err := strconv.ParseUint(elemStr, 10, 32)
   111  		if err != nil {
   112  			r.Close()
   113  			return nil, fmt.Errorf("could not parse %s as uint32. Reading %s, err: %v",
   114  				elemStr, f.Name, err)
   115  		}
   116  		result = append(result, e)
   117  	}
   118  	return result, nil
   119  }
   120  
   121  func benchmarkRealDataAggregate(b *testing.B, aggregator func(b []*Bitmap) int) {
   122  	for _, dataset := range realDatasets {
   123  		once := false
   124  		b.Run(dataset, func(b *testing.B) {
   125  			bitmaps, err := retrieveRealDataBitmaps(dataset, true)
   126  			if err != nil {
   127  				b.Fatal(err)
   128  			}
   129  			if once {
   130  				c := aggregator(bitmaps)
   131  				b.Logf("Dataset: %s Got cardinality: %d\n", dataset, c)
   132  				once = false
   133  			}
   134  			b.ResetTimer()
   135  			for i := 0; i < b.N; i++ {
   136  				aggregator(bitmaps)
   137  			}
   138  		})
   139  	}
   140  }
   141  
   142  func BenchmarkRealDataFastOr(b *testing.B) {
   143  	benchmarkRealDataAggregate(b, func(bitmaps []*Bitmap) int {
   144  		return FastOr(bitmaps...).GetCardinality()
   145  	})
   146  }
   147  func BenchmarkRealDataFastParOr(b *testing.B) {
   148  	benchmarkRealDataAggregate(b, func(bitmaps []*Bitmap) int {
   149  		return FastParOr(4, bitmaps...).GetCardinality()
   150  	})
   151  }
   152  
   153  func BenchmarkRealDataFastAnd(b *testing.B) {
   154  	benchmarkRealDataAggregate(b, func(bitmaps []*Bitmap) int {
   155  		return FastAnd(bitmaps...).GetCardinality()
   156  	})
   157  }
   158  
   159  func TestOrRealData(t *testing.T) {
   160  	test := func(t *testing.T, dataset string) {
   161  		path, err := getDataSetPath(dataset)
   162  		require.NoError(t, err)
   163  
   164  		zipFile, err := zip.OpenReader(path)
   165  		require.NoError(t, err)
   166  		defer zipFile.Close()
   167  
   168  		bitmaps := make([]*Bitmap, len(zipFile.File))
   169  		valMap := make(map[uint64]struct{})
   170  
   171  		res2 := NewBitmap()
   172  		// For each file in the zip, create a new bitmap and check the created bitmap has correct
   173  		// cardinality as well as it has all the elements.
   174  		for i, f := range zipFile.File {
   175  			vals, err := processZipFile(f)
   176  			require.NoError(t, err)
   177  
   178  			b := NewBitmap()
   179  			for _, v := range vals {
   180  				b.Set(v)
   181  				res2.Set(v)
   182  				valMap[v] = struct{}{}
   183  			}
   184  			require.Equal(t, len(vals), b.GetCardinality())
   185  			for _, v := range vals {
   186  				require.True(t, b.Contains(v))
   187  			}
   188  			bitmaps[i] = b
   189  		}
   190  
   191  		// Check that union operation is correct.
   192  		res := FastOr(bitmaps...)
   193  
   194  		t.Logf("Result: %s\n", res)
   195  		require.Equal(t, len(valMap), res.GetCardinality())
   196  		require.Equal(t, len(valMap), res2.GetCardinality())
   197  
   198  		for k := range valMap {
   199  			require.True(t, res.Contains(k))
   200  			require.True(t, res2.Contains(k))
   201  		}
   202  	}
   203  
   204  	for _, dataset := range realDatasets {
   205  		t.Run(dataset, func(t *testing.T) { test(t, dataset) })
   206  	}
   207  }