github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/dgraph/cmd/bulk/merge_shards.go (about)

     1  /*
     2   * Copyright 2017-2018 Dgraph Labs, Inc. and Contributors
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package bulk
    18  
    19  import (
    20  	"fmt"
    21  	"os"
    22  	"path/filepath"
    23  	"sort"
    24  	"strings"
    25  
    26  	"github.com/dgraph-io/dgraph/x"
    27  )
    28  
    29  func mergeMapShardsIntoReduceShards(opt options) {
    30  	mapShards := shardDirs(opt.TmpDir)
    31  
    32  	var reduceShards []string
    33  	for i := 0; i < opt.ReduceShards; i++ {
    34  		shardDir := filepath.Join(opt.TmpDir, "shards", fmt.Sprintf("shard_%d", i))
    35  		x.Check(os.MkdirAll(shardDir, 0755))
    36  		reduceShards = append(reduceShards, shardDir)
    37  	}
    38  
    39  	// Heuristic: put the largest map shard into the smallest reduce shard
    40  	// until there are no more map shards left. Should be a good approximation.
    41  	for _, shard := range mapShards {
    42  		sortBySize(reduceShards)
    43  		reduceShard := filepath.Join(
    44  			reduceShards[len(reduceShards)-1], filepath.Base(shard))
    45  		fmt.Printf("Shard %s -> Reduce %s\n", shard, reduceShard)
    46  		x.Check(os.Rename(shard, reduceShard))
    47  	}
    48  }
    49  
    50  func shardDirs(tmpDir string) []string {
    51  	dir, err := os.Open(filepath.Join(tmpDir, "shards"))
    52  	x.Check(err)
    53  	shards, err := dir.Readdirnames(0)
    54  	x.Check(err)
    55  	dir.Close()
    56  	for i, shard := range shards {
    57  		shards[i] = filepath.Join(tmpDir, "shards", shard)
    58  	}
    59  
    60  	// Allow largest shards to be shuffled first.
    61  	sortBySize(shards)
    62  	return shards
    63  }
    64  
    65  func filenamesInTree(dir string) []string {
    66  	var fnames []string
    67  	x.Check(filepath.Walk(dir, func(path string, fi os.FileInfo, err error) error {
    68  		if err != nil {
    69  			return err
    70  		}
    71  		if strings.HasSuffix(path, ".gz") {
    72  			fnames = append(fnames, path)
    73  		}
    74  		return nil
    75  	}))
    76  	return fnames
    77  }
    78  
// sizedDir pairs a directory path with a size recorded for it, so that a set
// of directories can be ordered by size without re-measuring during the sort.
type sizedDir struct {
	dir string // directory path
	sz  int64  // size recorded for dir (sortBySize fills this from treeSize)
}
    83  
    84  // sortBySize sorts the input directories by size of their content (biggest to smallest).
    85  func sortBySize(dirs []string) {
    86  	sizedDirs := make([]sizedDir, len(dirs))
    87  	for i, dir := range dirs {
    88  		sizedDirs[i] = sizedDir{dir: dir, sz: treeSize(dir)}
    89  	}
    90  	sort.SliceStable(sizedDirs, func(i, j int) bool {
    91  		return sizedDirs[i].sz > sizedDirs[j].sz
    92  	})
    93  	for i := range sizedDirs {
    94  		dirs[i] = sizedDirs[i].dir
    95  	}
    96  }
    97  
    98  func treeSize(dir string) int64 {
    99  	var sum int64
   100  	x.Check(filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
   101  		if err != nil {
   102  			return err
   103  		}
   104  		sum += info.Size()
   105  		return nil
   106  	}))
   107  	return sum
   108  }