github.com/dshekhar95/sub_dgraph@v0.0.0-20230424164411-6be28e40bbf1/dgraph/cmd/bulk/merge_shards.go (about) 1 /* 2 * Copyright 2017-2022 Dgraph Labs, Inc. and Contributors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package bulk 18 19 import ( 20 "fmt" 21 "os" 22 "path/filepath" 23 "sort" 24 "strings" 25 26 "github.com/dgraph-io/dgraph/x" 27 ) 28 29 const ( 30 mapShardDir = "map_output" 31 reduceShardDir = "shards" 32 bufferDir = "buffer" 33 ) 34 35 func mergeMapShardsIntoReduceShards(opt *options) { 36 if opt == nil { 37 fmt.Printf("Nil options passed to merge shards phase.\n") 38 os.Exit(1) 39 } 40 41 shardDirs := readShardDirs(filepath.Join(opt.TmpDir, mapShardDir)) 42 if len(shardDirs) == 0 { 43 fmt.Printf( 44 "No map shards found. Possibly caused by empty data files passed to the bulk loader.\n") 45 os.Exit(1) 46 } 47 48 // First shard is handled differently because it contains reserved predicates. 49 firstShard := shardDirs[0] 50 // Sort the rest of the shards by size to allow the largest shards to be shuffled first. 51 shardDirs = shardDirs[1:] 52 sortBySize(shardDirs) 53 54 var reduceShards []string 55 for i := 0; i < opt.ReduceShards; i++ { 56 shardDir := filepath.Join(opt.TmpDir, reduceShardDir, fmt.Sprintf("shard_%d", i)) 57 x.Check(os.MkdirAll(shardDir, 0750)) 58 reduceShards = append(reduceShards, shardDir) 59 } 60 61 // Put the first map shard in the first reduce shard since it contains all the reserved 62 // predicates. We want all the reserved predicates in group 1. 63 reduceShard := filepath.Join(reduceShards[0], filepath.Base(firstShard)) 64 fmt.Printf("Shard %s -> Reduce %s\n", firstShard, reduceShard) 65 x.Check(os.Rename(firstShard, reduceShard)) 66 67 // Heuristic: put the largest map shard into the smallest reduce shard 68 // until there are no more map shards left. Should be a good approximation. 69 for _, shard := range shardDirs { 70 sortBySize(reduceShards) 71 reduceShard := filepath.Join( 72 reduceShards[len(reduceShards)-1], filepath.Base(shard)) 73 fmt.Printf("Shard %s -> Reduce %s\n", shard, reduceShard) 74 x.Check(os.Rename(shard, reduceShard)) 75 } 76 } 77 78 func readShardDirs(d string) []string { 79 _, err := os.Stat(d) 80 if os.IsNotExist(err) { 81 return nil 82 } 83 dir, err := os.Open(d) 84 x.Check(err) 85 shards, err := dir.Readdirnames(0) 86 x.Check(err) 87 x.Check(dir.Close()) 88 for i, shard := range shards { 89 shards[i] = filepath.Join(d, shard) 90 } 91 sort.Strings(shards) 92 return shards 93 } 94 95 func filenamesInTree(dir string) []string { 96 var fnames []string 97 x.Check(filepath.Walk(dir, func(path string, fi os.FileInfo, err error) error { 98 if err != nil { 99 return err 100 } 101 if strings.HasSuffix(path, ".gz") { 102 fnames = append(fnames, path) 103 } 104 return nil 105 })) 106 return fnames 107 } 108 109 type sizedDir struct { 110 dir string 111 sz int64 112 } 113 114 // sortBySize sorts the input directories by size of their content (biggest to smallest). 115 func sortBySize(dirs []string) { 116 sizedDirs := make([]sizedDir, len(dirs)) 117 for i, dir := range dirs { 118 sizedDirs[i] = sizedDir{dir: dir, sz: treeSize(dir)} 119 } 120 sort.SliceStable(sizedDirs, func(i, j int) bool { 121 return sizedDirs[i].sz > sizedDirs[j].sz 122 }) 123 for i := range sizedDirs { 124 dirs[i] = sizedDirs[i].dir 125 } 126 } 127 128 func treeSize(dir string) int64 { 129 var sum int64 130 x.Check(filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { 131 if err != nil { 132 return err 133 } 134 sum += info.Size() 135 return nil 136 })) 137 return sum 138 }