github.com/dshekhar95/sub_dgraph@v0.0.0-20230424164411-6be28e40bbf1/dgraph/cmd/bulk/merge_shards.go (about)

     1  /*
     2   * Copyright 2017-2022 Dgraph Labs, Inc. and Contributors
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package bulk
    18  
    19  import (
    20  	"fmt"
    21  	"os"
    22  	"path/filepath"
    23  	"sort"
    24  	"strings"
    25  
    26  	"github.com/dgraph-io/dgraph/x"
    27  )
    28  
    29  const (
    30  	mapShardDir    = "map_output"
    31  	reduceShardDir = "shards"
    32  	bufferDir      = "buffer"
    33  )
    34  
    35  func mergeMapShardsIntoReduceShards(opt *options) {
    36  	if opt == nil {
    37  		fmt.Printf("Nil options passed to merge shards phase.\n")
    38  		os.Exit(1)
    39  	}
    40  
    41  	shardDirs := readShardDirs(filepath.Join(opt.TmpDir, mapShardDir))
    42  	if len(shardDirs) == 0 {
    43  		fmt.Printf(
    44  			"No map shards found. Possibly caused by empty data files passed to the bulk loader.\n")
    45  		os.Exit(1)
    46  	}
    47  
    48  	// First shard is handled differently because it contains reserved predicates.
    49  	firstShard := shardDirs[0]
    50  	// Sort the rest of the shards by size to allow the largest shards to be shuffled first.
    51  	shardDirs = shardDirs[1:]
    52  	sortBySize(shardDirs)
    53  
    54  	var reduceShards []string
    55  	for i := 0; i < opt.ReduceShards; i++ {
    56  		shardDir := filepath.Join(opt.TmpDir, reduceShardDir, fmt.Sprintf("shard_%d", i))
    57  		x.Check(os.MkdirAll(shardDir, 0750))
    58  		reduceShards = append(reduceShards, shardDir)
    59  	}
    60  
    61  	// Put the first map shard in the first reduce shard since it contains all the reserved
    62  	// predicates. We want all the reserved predicates in group 1.
    63  	reduceShard := filepath.Join(reduceShards[0], filepath.Base(firstShard))
    64  	fmt.Printf("Shard %s -> Reduce %s\n", firstShard, reduceShard)
    65  	x.Check(os.Rename(firstShard, reduceShard))
    66  
    67  	// Heuristic: put the largest map shard into the smallest reduce shard
    68  	// until there are no more map shards left. Should be a good approximation.
    69  	for _, shard := range shardDirs {
    70  		sortBySize(reduceShards)
    71  		reduceShard := filepath.Join(
    72  			reduceShards[len(reduceShards)-1], filepath.Base(shard))
    73  		fmt.Printf("Shard %s -> Reduce %s\n", shard, reduceShard)
    74  		x.Check(os.Rename(shard, reduceShard))
    75  	}
    76  }
    77  
    78  func readShardDirs(d string) []string {
    79  	_, err := os.Stat(d)
    80  	if os.IsNotExist(err) {
    81  		return nil
    82  	}
    83  	dir, err := os.Open(d)
    84  	x.Check(err)
    85  	shards, err := dir.Readdirnames(0)
    86  	x.Check(err)
    87  	x.Check(dir.Close())
    88  	for i, shard := range shards {
    89  		shards[i] = filepath.Join(d, shard)
    90  	}
    91  	sort.Strings(shards)
    92  	return shards
    93  }
    94  
    95  func filenamesInTree(dir string) []string {
    96  	var fnames []string
    97  	x.Check(filepath.Walk(dir, func(path string, fi os.FileInfo, err error) error {
    98  		if err != nil {
    99  			return err
   100  		}
   101  		if strings.HasSuffix(path, ".gz") {
   102  			fnames = append(fnames, path)
   103  		}
   104  		return nil
   105  	}))
   106  	return fnames
   107  }
   108  
   109  type sizedDir struct {
   110  	dir string
   111  	sz  int64
   112  }
   113  
   114  // sortBySize sorts the input directories by size of their content (biggest to smallest).
   115  func sortBySize(dirs []string) {
   116  	sizedDirs := make([]sizedDir, len(dirs))
   117  	for i, dir := range dirs {
   118  		sizedDirs[i] = sizedDir{dir: dir, sz: treeSize(dir)}
   119  	}
   120  	sort.SliceStable(sizedDirs, func(i, j int) bool {
   121  		return sizedDirs[i].sz > sizedDirs[j].sz
   122  	})
   123  	for i := range sizedDirs {
   124  		dirs[i] = sizedDirs[i].dir
   125  	}
   126  }
   127  
   128  func treeSize(dir string) int64 {
   129  	var sum int64
   130  	x.Check(filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
   131  		if err != nil {
   132  			return err
   133  		}
   134  		sum += info.Size()
   135  		return nil
   136  	}))
   137  	return sum
   138  }