github.com/uber/kraken@v0.1.4/lib/backend/hdfsbackend/client.go (about)

     1  // Copyright (c) 2016-2019 Uber Technologies, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  package hdfsbackend
    15  
    16  import (
    17  	"errors"
    18  	"fmt"
    19  	"io"
    20  	"path"
    21  	"regexp"
    22  	"sync"
    23  
    24  	"github.com/uber/kraken/core"
    25  	"github.com/uber/kraken/lib/backend"
    26  	"github.com/uber/kraken/lib/backend/hdfsbackend/webhdfs"
    27  	"github.com/uber/kraken/lib/backend/namepath"
    28  	"github.com/uber/kraken/utils/httputil"
    29  	"github.com/uber/kraken/utils/log"
    30  
    31  	"github.com/satori/go.uuid"
    32  	"gopkg.in/yaml.v2"
    33  )
    34  
// _hdfs is the registry key under which this backend is registered.
const _hdfs = "hdfs"

// init registers the HDFS backend factory in the global backend registry.
func init() {
	backend.Register(_hdfs, &factory{})
}
    40  
    41  type factory struct{}
    42  
    43  func (f *factory) Create(
    44  	confRaw interface{}, authConfRaw interface{}) (backend.Client, error) {
    45  
    46  	confBytes, err := yaml.Marshal(confRaw)
    47  	if err != nil {
    48  		return nil, errors.New("marshal hdfs config")
    49  	}
    50  	var config Config
    51  	if err := yaml.Unmarshal(confBytes, &config); err != nil {
    52  		return nil, errors.New("unmarshal hdfs config")
    53  	}
    54  	return NewClient(config)
    55  }
    56  
// Client is a backend.Client for HDFS. Blob names are mapped to HDFS paths
// under a configured root directory, and all cluster access goes through a
// WebHDFS client.
type Client struct {
	config  Config          // backend configuration (root directory, namenodes, concurrency, ...)
	pather  namepath.Pather // converts blob names to/from HDFS paths
	webhdfs webhdfs.Client  // underlying WebHDFS transport
}
    63  
// Option allows setting optional Client parameters.
type Option func(*Client)

// WithWebHDFS configures a Client with a custom webhdfs implementation.
// Primarily useful for injecting a fake in tests.
func WithWebHDFS(w webhdfs.Client) Option {
	return func(c *Client) { c.webhdfs = w }
}
    71  
    72  // NewClient creates a new Client for HDFS.
    73  func NewClient(config Config, opts ...Option) (*Client, error) {
    74  	config.applyDefaults()
    75  	if !path.IsAbs(config.RootDirectory) {
    76  		return nil, errors.New("invalid config: root_directory must be absolute path")
    77  	}
    78  	pather, err := namepath.New(config.RootDirectory, config.NamePath)
    79  	if err != nil {
    80  		return nil, fmt.Errorf("namepath: %s", err)
    81  	}
    82  	webhdfs, err := webhdfs.NewClient(config.WebHDFS, config.NameNodes, config.UserName)
    83  	if err != nil {
    84  		return nil, err
    85  	}
    86  	client := &Client{config, pather, webhdfs}
    87  	for _, opt := range opts {
    88  		opt(client)
    89  	}
    90  	return client, nil
    91  }
    92  
    93  // Stat returns blob info for name.
    94  func (c *Client) Stat(namespace, name string) (*core.BlobInfo, error) {
    95  	path, err := c.pather.BlobPath(name)
    96  	if err != nil {
    97  		return nil, fmt.Errorf("blob path: %s", err)
    98  	}
    99  	fs, err := c.webhdfs.GetFileStatus(path)
   100  	if err != nil {
   101  		return nil, err
   102  	}
   103  	return core.NewBlobInfo(fs.Length), nil
   104  }
   105  
   106  // Download downloads name into dst.
   107  func (c *Client) Download(namespace, name string, dst io.Writer) error {
   108  	path, err := c.pather.BlobPath(name)
   109  	if err != nil {
   110  		return fmt.Errorf("blob path: %s", err)
   111  	}
   112  	return c.webhdfs.Open(path, dst)
   113  }
   114  
   115  // Upload uploads src to name.
   116  func (c *Client) Upload(namespace, name string, src io.Reader) error {
   117  	uploadPath := path.Join(c.config.RootDirectory, c.config.UploadDirectory, uuid.NewV4().String())
   118  	blobPath, err := c.pather.BlobPath(name)
   119  	if err != nil {
   120  		return fmt.Errorf("blob path: %s", err)
   121  	}
   122  	if err := c.webhdfs.Create(uploadPath, src); err != nil {
   123  		return err
   124  	}
   125  	if err := c.webhdfs.Mkdirs(path.Dir(blobPath)); err != nil {
   126  		return err
   127  	}
   128  	return c.webhdfs.Rename(uploadPath, blobPath)
   129  }
   130  
var (
	// _ignoreRegex matches Docker registry paths (layers, uploads, manifest
	// revisions, and tag indexes) that List skips while walking the tree.
	_ignoreRegex = regexp.MustCompile(
		"^.+/repositories/.+/(_layers|_uploads|_manifests/(revisions|tags/.+/index)).*")
	// _stopRegex matches a repository's _manifests directory; when hit, List
	// stops descending and fabricates a tag link path (see List).
	_stopRegex = regexp.MustCompile("^.+/repositories/.+/_manifests$")
)
   136  
// listResult carries the outcome of listing a single directory back to the
// List coordinator.
type listResult struct {
	dir  string               // the directory that was listed
	list []webhdfs.FileStatus // its immediate children
	err  error                // listing error, if any
}
   142  
   143  func (c *Client) lister(done <-chan struct{}, listJobs <-chan string, results chan<- listResult) {
   144  	for {
   145  		select {
   146  		case <-done:
   147  			return
   148  		case dir := <-listJobs:
   149  			l, err := c.webhdfs.ListFileStatus(dir)
   150  			select {
   151  			case <-done:
   152  				return
   153  			case results <- listResult{dir, l, err}:
   154  			}
   155  		}
   156  	}
   157  }
   158  
   159  func (c *Client) sendAll(done <-chan struct{}, dirs []string, listJobs chan<- string) {
   160  	for _, d := range dirs {
   161  		select {
   162  		case <-done:
   163  			return
   164  		case listJobs <- d:
   165  		}
   166  	}
   167  }
   168  
   169  // List lists names which start with prefix.
   170  func (c *Client) List(prefix string, opts ...backend.ListOption) (*backend.ListResult, error) {
   171  	options := backend.DefaultListOptions()
   172  	for _, opt := range opts {
   173  		opt(options)
   174  	}
   175  
   176  	if options.Paginated {
   177  		return nil, errors.New("pagination not supported")
   178  	}
   179  
   180  	root := path.Join(c.pather.BasePath(), prefix)
   181  
   182  	listJobs := make(chan string)
   183  	results := make(chan listResult)
   184  	done := make(chan struct{})
   185  
   186  	var wg sync.WaitGroup
   187  
   188  	for i := 0; i < c.config.ListConcurrency; i++ {
   189  		wg.Add(1)
   190  		go func() {
   191  			c.lister(done, listJobs, results)
   192  			wg.Done()
   193  		}()
   194  	}
   195  
   196  	defer func() {
   197  		close(done)
   198  		if c.config.testing {
   199  			// Waiting might be delayed if an early error is encountered but
   200  			// other goroutines are waiting on a long http timeout. Thus, we
   201  			// only wait for each spawned goroutine to exit during testing to
   202  			// assert that no goroutines leak.
   203  			wg.Wait()
   204  		}
   205  	}()
   206  
   207  	var files []string
   208  
   209  	// Pending tracks the number of directories which are pending exploration.
   210  	// Invariant: there will be a result received for every increment made to
   211  	// pending.
   212  	pending := 1
   213  	listJobs <- root
   214  
   215  	for pending > 0 {
   216  		res := <-results
   217  		pending--
   218  		if res.err != nil {
   219  			if httputil.IsNotFound(res.err) {
   220  				continue
   221  			}
   222  			return nil, res.err
   223  		}
   224  		var dirs []string
   225  		for _, fs := range res.list {
   226  			p := path.Join(res.dir, fs.PathSuffix)
   227  
   228  			// TODO(codyg): This is an ugly hack to avoid walking through non-tags
   229  			// during Docker catalog. Ideally, only tags are located in the repositories
   230  			// directory, however in WBU2 HDFS, there are blobs here as well. At some
   231  			// point, we must migrate the data into a structure which cleanly divides
   232  			// blobs and tags (like we do in S3).
   233  			if _ignoreRegex.MatchString(p) {
   234  				continue
   235  			}
   236  
   237  			// TODO(codyg): Another ugly hack to speed up catalog performance by stopping
   238  			// early when we hit tags...
   239  			if _stopRegex.MatchString(p) {
   240  				p = path.Join(p, "tags/dummy/current/link")
   241  				fs.Type = "FILE"
   242  			}
   243  
   244  			if fs.Type == "DIRECTORY" {
   245  				// Flat directory structures are common, so accumulate directories and send
   246  				// them to the listers in a single goroutine (as opposed to a goroutine per
   247  				// directory).
   248  				dirs = append(dirs, p)
   249  			} else {
   250  				name, err := c.pather.NameFromBlobPath(p)
   251  				if err != nil {
   252  					log.With("path", p).Errorf("Error converting blob path into name: %s", err)
   253  					continue
   254  				}
   255  				files = append(files, name)
   256  			}
   257  		}
   258  		if len(dirs) > 0 {
   259  			// We cannot send list jobs and receive results in the same thread, else
   260  			// deadlock will occur.
   261  			wg.Add(1)
   262  			go func() {
   263  				c.sendAll(done, dirs, listJobs)
   264  				wg.Done()
   265  			}()
   266  			pending += len(dirs)
   267  		}
   268  	}
   269  
   270  	return &backend.ListResult{
   271  		Names: files,
   272  	},  nil
   273  }