github.com/pingcap/badger@v1.5.1-0.20230103063557-828f39b09b6d/surf/testdata/gen.go (about) 1 package main 2 3 import ( 4 "bufio" 5 "bytes" 6 "compress/gzip" 7 "encoding/binary" 8 "fmt" 9 "math/rand" 10 "os" 11 "sort" 12 "time" 13 14 "github.com/brianvoe/gofakeit" 15 ) 16 17 func main() { 18 gofakeit.Seed(time.Now().Unix()) 19 20 fakeSet := []fakeData{ 21 {"street", func() string { return gofakeit.Address().Address }, 10000000}, 22 {"url", gofakeit.URL, 10000000}, 23 {"email", gofakeit.Email, 10000000}, 24 {"uuid", gofakeit.UUID, 10000000}, 25 {"ipv4", gofakeit.IPv4Address, 10000000}, 26 {"ipv6", gofakeit.IPv6Address, 20000000}, 27 {"username", gofakeit.Username, 4000000}, 28 } 29 for _, w := range fakeSet { 30 w.generate() 31 } 32 33 randSet := []randomData{ 34 {10000000, 10, 0}, // dense dataset 35 {10000000, 100, 0}, // sparse dataset 36 {100000, 100, 3}, // sparse prefix dataset 37 {200, 5, 300}, // dense prefix dataset 38 } 39 for _, w := range randSet { 40 w.generate() 41 } 42 } 43 44 type fakeData struct { 45 name string 46 f func() string 47 n int 48 } 49 50 func (w *fakeData) generate() { 51 fmt.Printf("generating %d %s...", w.n, w.name) 52 dedup := make(map[string]struct{}, w.n) 53 progress := 10 54 for len(dedup) < w.n { 55 dedup[w.f()] = struct{}{} 56 57 p := int(float64(len(dedup)) / float64(w.n) * 100.0) 58 if p%10 == 0 && p >= progress { 59 fmt.Printf("%d...", progress) 60 progress += 10 61 } 62 } 63 64 fmt.Print("sorting...") 65 keys := make([][]byte, 0, w.n) 66 for k := range dedup { 67 keys = append(keys, []byte(k)) 68 } 69 dedup = nil 70 sort.Slice(keys, func(i, j int) bool { return bytes.Compare(keys[i], keys[j]) < 0 }) 71 72 fmt.Print("writing...") 73 output(fmt.Sprintf("%s_%d.gz", w.name, w.n), keys) 74 fmt.Println("done") 75 } 76 77 func output(filename string, data [][]byte) { 78 f, err := os.OpenFile(filename, os.O_CREATE|os.O_TRUNC|os.O_RDWR, os.ModePerm) 79 if err != nil { 80 panic(err) 81 } 82 defer f.Close() 83 buf := bufio.NewWriter(f) 84 defer buf.Flush() 85 compressed := gzip.NewWriter(buf) 86 defer compressed.Close() 87 88 for _, b := range data { 89 if len(b) > 65535 { 90 panic("key length overflow") 91 } 92 var lenBuf [2]byte 93 binary.LittleEndian.PutUint16(lenBuf[:], uint16(len(b))) 94 95 _, err := compressed.Write(lenBuf[:]) 96 if err != nil { 97 panic(err) 98 } 99 _, err = compressed.Write(b) 100 if err != nil { 101 panic(err) 102 } 103 } 104 } 105 106 type randomData struct { 107 initSize, initLen, round int 108 } 109 110 func (w *randomData) generate() { 111 fmt.Printf("generating %d-%d-%d rand data...", w.initSize, w.initLen, w.round) 112 start := time.Now() 113 keys := make([][]byte, w.initSize) 114 rand := rand.New(rand.NewSource(start.Unix())) 115 fmt.Printf("init round...") 116 for i := range keys { 117 keys[i] = make([]byte, rand.Intn(w.initLen)+1) 118 rand.Read(keys[i]) 119 } 120 121 for r := 1; r <= w.round; r++ { 122 for i := 0; i < w.initSize*r; i++ { 123 k := make([]byte, len(keys[i])+rand.Intn(w.initLen)+1) 124 copy(k, keys[i]) 125 rand.Read(k[len(keys[i]):]) 126 keys = append(keys, k) 127 } 128 fmt.Printf("round %d...", r) 129 } 130 131 fmt.Print("sorting...") 132 sort.Slice(keys, func(i, j int) bool { 133 return bytes.Compare(keys[i], keys[j]) < 0 134 }) 135 136 fmt.Print("dedup...") 137 var prev []byte 138 result := keys[:0] 139 for _, k := range keys { 140 if bytes.Equal(prev, k) { 141 continue 142 } 143 prev = k 144 result = append(result, k) 145 } 146 for i := len(result); i < len(keys); i++ { 147 keys[i] = nil 148 } 149 150 fmt.Print("writing...") 151 output(fmt.Sprintf("rand-%d-%d-%d_%d.gz", w.initSize, w.initLen, w.round, len(result)), result) 152 fmt.Printf("done (size: %d)\n", len(result)) 153 }