github.com/uber/kraken@v0.1.4/lib/backend/hdfsbackend/client.go (about) 1 // Copyright (c) 2016-2019 Uber Technologies, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 package hdfsbackend 15 16 import ( 17 "errors" 18 "fmt" 19 "io" 20 "path" 21 "regexp" 22 "sync" 23 24 "github.com/uber/kraken/core" 25 "github.com/uber/kraken/lib/backend" 26 "github.com/uber/kraken/lib/backend/hdfsbackend/webhdfs" 27 "github.com/uber/kraken/lib/backend/namepath" 28 "github.com/uber/kraken/utils/httputil" 29 "github.com/uber/kraken/utils/log" 30 31 "github.com/satori/go.uuid" 32 "gopkg.in/yaml.v2" 33 ) 34 35 const _hdfs = "hdfs" 36 37 func init() { 38 backend.Register(_hdfs, &factory{}) 39 } 40 41 type factory struct{} 42 43 func (f *factory) Create( 44 confRaw interface{}, authConfRaw interface{}) (backend.Client, error) { 45 46 confBytes, err := yaml.Marshal(confRaw) 47 if err != nil { 48 return nil, errors.New("marshal hdfs config") 49 } 50 var config Config 51 if err := yaml.Unmarshal(confBytes, &config); err != nil { 52 return nil, errors.New("unmarshal hdfs config") 53 } 54 return NewClient(config) 55 } 56 57 // Client is a backend.Client for HDFS. 58 type Client struct { 59 config Config 60 pather namepath.Pather 61 webhdfs webhdfs.Client 62 } 63 64 // Option allows setting optional Client parameters. 65 type Option func(*Client) 66 67 // WithWebHDFS configures a Client with a custom webhdfs implementation. 
68 func WithWebHDFS(w webhdfs.Client) Option { 69 return func(c *Client) { c.webhdfs = w } 70 } 71 72 // NewClient creates a new Client for HDFS. 73 func NewClient(config Config, opts ...Option) (*Client, error) { 74 config.applyDefaults() 75 if !path.IsAbs(config.RootDirectory) { 76 return nil, errors.New("invalid config: root_directory must be absolute path") 77 } 78 pather, err := namepath.New(config.RootDirectory, config.NamePath) 79 if err != nil { 80 return nil, fmt.Errorf("namepath: %s", err) 81 } 82 webhdfs, err := webhdfs.NewClient(config.WebHDFS, config.NameNodes, config.UserName) 83 if err != nil { 84 return nil, err 85 } 86 client := &Client{config, pather, webhdfs} 87 for _, opt := range opts { 88 opt(client) 89 } 90 return client, nil 91 } 92 93 // Stat returns blob info for name. 94 func (c *Client) Stat(namespace, name string) (*core.BlobInfo, error) { 95 path, err := c.pather.BlobPath(name) 96 if err != nil { 97 return nil, fmt.Errorf("blob path: %s", err) 98 } 99 fs, err := c.webhdfs.GetFileStatus(path) 100 if err != nil { 101 return nil, err 102 } 103 return core.NewBlobInfo(fs.Length), nil 104 } 105 106 // Download downloads name into dst. 107 func (c *Client) Download(namespace, name string, dst io.Writer) error { 108 path, err := c.pather.BlobPath(name) 109 if err != nil { 110 return fmt.Errorf("blob path: %s", err) 111 } 112 return c.webhdfs.Open(path, dst) 113 } 114 115 // Upload uploads src to name. 
116 func (c *Client) Upload(namespace, name string, src io.Reader) error { 117 uploadPath := path.Join(c.config.RootDirectory, c.config.UploadDirectory, uuid.NewV4().String()) 118 blobPath, err := c.pather.BlobPath(name) 119 if err != nil { 120 return fmt.Errorf("blob path: %s", err) 121 } 122 if err := c.webhdfs.Create(uploadPath, src); err != nil { 123 return err 124 } 125 if err := c.webhdfs.Mkdirs(path.Dir(blobPath)); err != nil { 126 return err 127 } 128 return c.webhdfs.Rename(uploadPath, blobPath) 129 } 130 131 var ( 132 _ignoreRegex = regexp.MustCompile( 133 "^.+/repositories/.+/(_layers|_uploads|_manifests/(revisions|tags/.+/index)).*") 134 _stopRegex = regexp.MustCompile("^.+/repositories/.+/_manifests$") 135 ) 136 137 type listResult struct { 138 dir string 139 list []webhdfs.FileStatus 140 err error 141 } 142 143 func (c *Client) lister(done <-chan struct{}, listJobs <-chan string, results chan<- listResult) { 144 for { 145 select { 146 case <-done: 147 return 148 case dir := <-listJobs: 149 l, err := c.webhdfs.ListFileStatus(dir) 150 select { 151 case <-done: 152 return 153 case results <- listResult{dir, l, err}: 154 } 155 } 156 } 157 } 158 159 func (c *Client) sendAll(done <-chan struct{}, dirs []string, listJobs chan<- string) { 160 for _, d := range dirs { 161 select { 162 case <-done: 163 return 164 case listJobs <- d: 165 } 166 } 167 } 168 169 // List lists names which start with prefix. 
func (c *Client) List(prefix string, opts ...backend.ListOption) (*backend.ListResult, error) {
	options := backend.DefaultListOptions()
	for _, opt := range opts {
		opt(options)
	}

	if options.Paginated {
		return nil, errors.New("pagination not supported")
	}

	root := path.Join(c.pather.BasePath(), prefix)

	// Worker goroutines receive directories on listJobs and reply on results.
	// done is closed on return to signal every spawned goroutine to exit.
	listJobs := make(chan string)
	results := make(chan listResult)
	done := make(chan struct{})

	var wg sync.WaitGroup

	for i := 0; i < c.config.ListConcurrency; i++ {
		wg.Add(1)
		go func() {
			c.lister(done, listJobs, results)
			wg.Done()
		}()
	}

	defer func() {
		close(done)
		if c.config.testing {
			// Waiting might be delayed if an early error is encountered but
			// other goroutines are waiting on a long http timeout. Thus, we
			// only wait for each spawned goroutine to exit during testing to
			// assert that no goroutines leak.
			wg.Wait()
		}
	}()

	var files []string

	// Pending tracks the number of directories which are pending exploration.
	// Invariant: there will be a result received for every increment made to
	// pending.
	pending := 1
	listJobs <- root

	for pending > 0 {
		res := <-results
		pending--
		if res.err != nil {
			if httputil.IsNotFound(res.err) {
				// A missing directory (e.g. a prefix with nothing under it)
				// simply contributes no names -- it is not an error.
				continue
			}
			return nil, res.err
		}
		var dirs []string
		for _, fs := range res.list {
			p := path.Join(res.dir, fs.PathSuffix)

			// TODO(codyg): This is an ugly hack to avoid walking through non-tags
			// during Docker catalog. Ideally, only tags are located in the repositories
			// directory, however in WBU2 HDFS, there are blobs here as well. At some
			// point, we must migrate the data into a structure which cleanly divides
			// blobs and tags (like we do in S3).
			if _ignoreRegex.MatchString(p) {
				continue
			}

			// TODO(codyg): Another ugly hack to speed up catalog performance by stopping
			// early when we hit tags...
			if _stopRegex.MatchString(p) {
				p = path.Join(p, "tags/dummy/current/link")
				fs.Type = "FILE"
			}

			if fs.Type == "DIRECTORY" {
				// Flat directory structures are common, so accumulate directories and send
				// them to the listers in a single goroutine (as opposed to a goroutine per
				// directory).
				dirs = append(dirs, p)
			} else {
				name, err := c.pather.NameFromBlobPath(p)
				if err != nil {
					// Skip files whose paths do not map back to a blob name;
					// log so unexpected layouts are visible.
					log.With("path", p).Errorf("Error converting blob path into name: %s", err)
					continue
				}
				files = append(files, name)
			}
		}
		if len(dirs) > 0 {
			// We cannot send list jobs and receive results in the same thread, else
			// deadlock will occur.
			wg.Add(1)
			go func() {
				c.sendAll(done, dirs, listJobs)
				wg.Done()
			}()
			pending += len(dirs)
		}
	}

	return &backend.ListResult{
		Names: files,
	}, nil
}