github.com/pingcap/badger@v1.5.1-0.20230103063557-828f39b09b6d/surf/testdata/gen.go (about)

     1  package main
     2  
     3  import (
     4  	"bufio"
     5  	"bytes"
     6  	"compress/gzip"
     7  	"encoding/binary"
     8  	"fmt"
     9  	"math/rand"
    10  	"os"
    11  	"sort"
    12  	"time"
    13  
    14  	"github.com/brianvoe/gofakeit"
    15  )
    16  
    17  func main() {
    18  	gofakeit.Seed(time.Now().Unix())
    19  
    20  	fakeSet := []fakeData{
    21  		{"street", func() string { return gofakeit.Address().Address }, 10000000},
    22  		{"url", gofakeit.URL, 10000000},
    23  		{"email", gofakeit.Email, 10000000},
    24  		{"uuid", gofakeit.UUID, 10000000},
    25  		{"ipv4", gofakeit.IPv4Address, 10000000},
    26  		{"ipv6", gofakeit.IPv6Address, 20000000},
    27  		{"username", gofakeit.Username, 4000000},
    28  	}
    29  	for _, w := range fakeSet {
    30  		w.generate()
    31  	}
    32  
    33  	randSet := []randomData{
    34  		{10000000, 10, 0},  // dense dataset
    35  		{10000000, 100, 0}, // sparse dataset
    36  		{100000, 100, 3},   // sparse prefix dataset
    37  		{200, 5, 300},      // dense prefix dataset
    38  	}
    39  	for _, w := range randSet {
    40  		w.generate()
    41  	}
    42  }
    43  
    44  type fakeData struct {
    45  	name string
    46  	f    func() string
    47  	n    int
    48  }
    49  
    50  func (w *fakeData) generate() {
    51  	fmt.Printf("generating %d %s...", w.n, w.name)
    52  	dedup := make(map[string]struct{}, w.n)
    53  	progress := 10
    54  	for len(dedup) < w.n {
    55  		dedup[w.f()] = struct{}{}
    56  
    57  		p := int(float64(len(dedup)) / float64(w.n) * 100.0)
    58  		if p%10 == 0 && p >= progress {
    59  			fmt.Printf("%d...", progress)
    60  			progress += 10
    61  		}
    62  	}
    63  
    64  	fmt.Print("sorting...")
    65  	keys := make([][]byte, 0, w.n)
    66  	for k := range dedup {
    67  		keys = append(keys, []byte(k))
    68  	}
    69  	dedup = nil
    70  	sort.Slice(keys, func(i, j int) bool { return bytes.Compare(keys[i], keys[j]) < 0 })
    71  
    72  	fmt.Print("writing...")
    73  	output(fmt.Sprintf("%s_%d.gz", w.name, w.n), keys)
    74  	fmt.Println("done")
    75  }
    76  
    77  func output(filename string, data [][]byte) {
    78  	f, err := os.OpenFile(filename, os.O_CREATE|os.O_TRUNC|os.O_RDWR, os.ModePerm)
    79  	if err != nil {
    80  		panic(err)
    81  	}
    82  	defer f.Close()
    83  	buf := bufio.NewWriter(f)
    84  	defer buf.Flush()
    85  	compressed := gzip.NewWriter(buf)
    86  	defer compressed.Close()
    87  
    88  	for _, b := range data {
    89  		if len(b) > 65535 {
    90  			panic("key length overflow")
    91  		}
    92  		var lenBuf [2]byte
    93  		binary.LittleEndian.PutUint16(lenBuf[:], uint16(len(b)))
    94  
    95  		_, err := compressed.Write(lenBuf[:])
    96  		if err != nil {
    97  			panic(err)
    98  		}
    99  		_, err = compressed.Write(b)
   100  		if err != nil {
   101  			panic(err)
   102  		}
   103  	}
   104  }
   105  
   106  type randomData struct {
   107  	initSize, initLen, round int
   108  }
   109  
   110  func (w *randomData) generate() {
   111  	fmt.Printf("generating %d-%d-%d rand data...", w.initSize, w.initLen, w.round)
   112  	start := time.Now()
   113  	keys := make([][]byte, w.initSize)
   114  	rand := rand.New(rand.NewSource(start.Unix()))
   115  	fmt.Printf("init round...")
   116  	for i := range keys {
   117  		keys[i] = make([]byte, rand.Intn(w.initLen)+1)
   118  		rand.Read(keys[i])
   119  	}
   120  
   121  	for r := 1; r <= w.round; r++ {
   122  		for i := 0; i < w.initSize*r; i++ {
   123  			k := make([]byte, len(keys[i])+rand.Intn(w.initLen)+1)
   124  			copy(k, keys[i])
   125  			rand.Read(k[len(keys[i]):])
   126  			keys = append(keys, k)
   127  		}
   128  		fmt.Printf("round %d...", r)
   129  	}
   130  
   131  	fmt.Print("sorting...")
   132  	sort.Slice(keys, func(i, j int) bool {
   133  		return bytes.Compare(keys[i], keys[j]) < 0
   134  	})
   135  
   136  	fmt.Print("dedup...")
   137  	var prev []byte
   138  	result := keys[:0]
   139  	for _, k := range keys {
   140  		if bytes.Equal(prev, k) {
   141  			continue
   142  		}
   143  		prev = k
   144  		result = append(result, k)
   145  	}
   146  	for i := len(result); i < len(keys); i++ {
   147  		keys[i] = nil
   148  	}
   149  
   150  	fmt.Print("writing...")
   151  	output(fmt.Sprintf("rand-%d-%d-%d_%d.gz", w.initSize, w.initLen, w.round, len(result)), result)
   152  	fmt.Printf("done (size: %d)\n", len(result))
   153  }