github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/dgraph/cmd/bulk/merge_shards.go (about) 1 /* 2 * Copyright 2017-2018 Dgraph Labs, Inc. and Contributors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package bulk 18 19 import ( 20 "fmt" 21 "os" 22 "path/filepath" 23 "sort" 24 "strings" 25 26 "github.com/dgraph-io/dgraph/x" 27 ) 28 29 func mergeMapShardsIntoReduceShards(opt options) { 30 mapShards := shardDirs(opt.TmpDir) 31 32 var reduceShards []string 33 for i := 0; i < opt.ReduceShards; i++ { 34 shardDir := filepath.Join(opt.TmpDir, "shards", fmt.Sprintf("shard_%d", i)) 35 x.Check(os.MkdirAll(shardDir, 0755)) 36 reduceShards = append(reduceShards, shardDir) 37 } 38 39 // Heuristic: put the largest map shard into the smallest reduce shard 40 // until there are no more map shards left. Should be a good approximation. 41 for _, shard := range mapShards { 42 sortBySize(reduceShards) 43 reduceShard := filepath.Join( 44 reduceShards[len(reduceShards)-1], filepath.Base(shard)) 45 fmt.Printf("Shard %s -> Reduce %s\n", shard, reduceShard) 46 x.Check(os.Rename(shard, reduceShard)) 47 } 48 } 49 50 func shardDirs(tmpDir string) []string { 51 dir, err := os.Open(filepath.Join(tmpDir, "shards")) 52 x.Check(err) 53 shards, err := dir.Readdirnames(0) 54 x.Check(err) 55 dir.Close() 56 for i, shard := range shards { 57 shards[i] = filepath.Join(tmpDir, "shards", shard) 58 } 59 60 // Allow largest shards to be shuffled first. 61 sortBySize(shards) 62 return shards 63 } 64 65 func filenamesInTree(dir string) []string { 66 var fnames []string 67 x.Check(filepath.Walk(dir, func(path string, fi os.FileInfo, err error) error { 68 if err != nil { 69 return err 70 } 71 if strings.HasSuffix(path, ".gz") { 72 fnames = append(fnames, path) 73 } 74 return nil 75 })) 76 return fnames 77 } 78 79 type sizedDir struct { 80 dir string 81 sz int64 82 } 83 84 // sortBySize sorts the input directories by size of their content (biggest to smallest). 85 func sortBySize(dirs []string) { 86 sizedDirs := make([]sizedDir, len(dirs)) 87 for i, dir := range dirs { 88 sizedDirs[i] = sizedDir{dir: dir, sz: treeSize(dir)} 89 } 90 sort.SliceStable(sizedDirs, func(i, j int) bool { 91 return sizedDirs[i].sz > sizedDirs[j].sz 92 }) 93 for i := range sizedDirs { 94 dirs[i] = sizedDirs[i].dir 95 } 96 } 97 98 func treeSize(dir string) int64 { 99 var sum int64 100 x.Check(filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { 101 if err != nil { 102 return err 103 } 104 sum += info.Size() 105 return nil 106 })) 107 return sum 108 }