github.com/rohankumardubey/proxyfs@v0.0.0-20210108201508-653efa9ab00e/fs/api_internal.go (about) 1 // Package fs, sitting on top of the inode manager, defines the filesystem exposed by ProxyFS. 2 package fs 3 4 import ( 5 "bytes" 6 "container/list" 7 "fmt" 8 "math" 9 "path" 10 "strings" 11 "syscall" 12 "time" 13 14 "github.com/swiftstack/ProxyFS/blunder" 15 "github.com/swiftstack/ProxyFS/dlm" 16 "github.com/swiftstack/ProxyFS/inode" 17 "github.com/swiftstack/ProxyFS/logger" 18 "github.com/swiftstack/ProxyFS/utils" 19 ) 20 21 // Shorthand for our internal API debug log id; global to the package 22 var internalDebug = logger.DbgInternal 23 24 type symlinkFollowState struct { 25 seen map[inode.InodeNumber]bool 26 traversed int 27 } 28 29 // Let us sort an array of directory and file names 30 type dirAndFileName struct { 31 dirName string 32 fileName string 33 } 34 35 // this has to be a named type to be a method receiver 36 type dirAndFileNameSlice []dirAndFileName 37 38 func (coll dirAndFileNameSlice) Len() int { 39 return len(coll) 40 } 41 42 func (coll dirAndFileNameSlice) Less(i int, j int) bool { 43 return coll[i].dirName < coll[j].dirName 44 } 45 46 func (coll dirAndFileNameSlice) Swap(i int, j int) { 47 coll[i], coll[j] = coll[j], coll[i] 48 } 49 50 // trackInFlightFileInodeData is called to ensure a timely Flush occurs. 51 // 52 // Only Write() will call this while holding a WriteLock on the fileInode 53 // either just before or just after its call to inode.Write(). 
54 func (vS *volumeStruct) trackInFlightFileInodeData(inodeNumber inode.InodeNumber) { 55 var ( 56 inFlightFileInodeData *inFlightFileInodeDataStruct 57 ok bool 58 ) 59 60 globals.Lock() 61 vS.dataMutex.Lock() 62 inFlightFileInodeData, ok = vS.inFlightFileInodeDataMap[inodeNumber] 63 if !ok { 64 inFlightFileInodeData = &inFlightFileInodeDataStruct{ 65 InodeNumber: inodeNumber, 66 volStruct: vS, 67 control: make(chan bool, inFlightFileInodeDataControlBuffering), 68 } 69 vS.inFlightFileInodeDataMap[inodeNumber] = inFlightFileInodeData 70 inFlightFileInodeData.globalsListElement = globals.inFlightFileInodeDataList.PushBack(inFlightFileInodeData) 71 inFlightFileInodeData.wg.Add(1) 72 go inFlightFileInodeData.inFlightFileInodeDataTracker() 73 } 74 vS.dataMutex.Unlock() 75 globals.Unlock() 76 } 77 78 // untrackInFlightInodeData is called once it is known a Flush() is no longer needed 79 // or to actually request a Flush() [as would be the case during unmounting a volume]. 80 func (vS *volumeStruct) untrackInFlightFileInodeData(inodeNumber inode.InodeNumber, flushFirst bool) { 81 var ( 82 inFlightFileInodeData *inFlightFileInodeDataStruct 83 ok bool 84 ) 85 86 globals.Lock() 87 vS.dataMutex.Lock() 88 inFlightFileInodeData, ok = vS.inFlightFileInodeDataMap[inodeNumber] 89 if !ok { 90 vS.dataMutex.Unlock() 91 globals.Unlock() 92 return 93 } 94 delete(vS.inFlightFileInodeDataMap, inodeNumber) 95 if nil != inFlightFileInodeData.globalsListElement { 96 _ = globals.inFlightFileInodeDataList.Remove(inFlightFileInodeData.globalsListElement) 97 inFlightFileInodeData.globalsListElement = nil 98 } 99 inFlightFileInodeData.control <- flushFirst 100 vS.dataMutex.Unlock() 101 globals.Unlock() 102 if flushFirst { 103 inFlightFileInodeData.wg.Wait() 104 } 105 } 106 107 // untrackInFlightFileInodeDataAll is called to flush all current elements 108 // of vS.inFlightFileInodeDataMap (if any) during SIGHUP or Down(). 
109 func (vS *volumeStruct) untrackInFlightFileInodeDataAll() { 110 var ( 111 inFlightFileInodeNumber inode.InodeNumber 112 inFlightFileInodeNumbers []inode.InodeNumber 113 inFlightFileInodeNumbersCapacity int 114 ) 115 116 // Snapshot list of inode.InodeNumber's currently in vS.inFlightFileInodeDataMap 117 118 vS.dataMutex.Lock() 119 inFlightFileInodeNumbersCapacity = len(vS.inFlightFileInodeDataMap) 120 if 0 == inFlightFileInodeNumbersCapacity { 121 vS.dataMutex.Unlock() 122 return 123 } 124 inFlightFileInodeNumbers = make([]inode.InodeNumber, 0, inFlightFileInodeNumbersCapacity) 125 for inFlightFileInodeNumber, _ = range vS.inFlightFileInodeDataMap { 126 inFlightFileInodeNumbers = append(inFlightFileInodeNumbers, inFlightFileInodeNumber) 127 } 128 vS.dataMutex.Unlock() 129 130 // Now go flush each of those 131 132 for _, inFlightFileInodeNumber = range inFlightFileInodeNumbers { 133 vS.untrackInFlightFileInodeData(inFlightFileInodeNumber, true) 134 } 135 } 136 137 func (vS *volumeStruct) inFlightFileInodeDataFlusher(inodeNumber inode.InodeNumber) { 138 var ( 139 err error 140 inodeLock *dlm.RWLockStruct 141 stillExists bool 142 ) 143 144 // Act as if a package fs client called Flush()... 
145 146 inodeLock, err = vS.inodeVolumeHandle.InitInodeLock(inodeNumber, nil) 147 if nil != err { 148 logger.PanicfWithError(err, "InitInodeLock() for volume '%s' inode %v failed", vS.volumeName, inodeNumber) 149 } 150 err = inodeLock.WriteLock() 151 if nil != err { 152 logger.PanicfWithError(err, "dlm.Writelock() for volume '%s' inode %v failed", vS.volumeName, inodeNumber) 153 } 154 155 stillExists = vS.inodeVolumeHandle.Access(inodeNumber, inode.InodeRootUserID, inode.InodeGroupID(0), nil, inode.F_OK, 156 inode.NoOverride) 157 if stillExists { 158 err = vS.inodeVolumeHandle.Flush(inodeNumber, false) 159 if nil == err { 160 vS.untrackInFlightFileInodeData(inodeNumber, false) 161 } else { 162 logger.ErrorfWithError(err, "Flush of file data failed on volume '%s' inode %v", vS.volumeName, inodeNumber) 163 } 164 } 165 166 err = inodeLock.Unlock() 167 if nil != err { 168 logger.PanicfWithError(err, "dlm.Unlock() for volume '%s' inode %v failed", vS.volumeName, inodeNumber) 169 } 170 } 171 172 func (inFlightFileInodeData *inFlightFileInodeDataStruct) inFlightFileInodeDataTracker() { 173 var ( 174 flushFirst bool 175 ) 176 177 logger.Tracef("fs.inFlightFileInodeDataTracker(): waiting to flush volume '%s' inode %v", 178 inFlightFileInodeData.volStruct.volumeName, inFlightFileInodeData.InodeNumber) 179 180 select { 181 case flushFirst = <-inFlightFileInodeData.control: 182 // All we needed was the value of flushFirst from control chan 183 case <-time.After(inFlightFileInodeData.volStruct.maxFlushTime): 184 flushFirst = true 185 } 186 187 logger.Tracef("fs.inFlightFileInodeDataTracker(): flush starting for volume '%s' inode %v flushfirst %t", 188 inFlightFileInodeData.volStruct.volumeName, inFlightFileInodeData.InodeNumber, flushFirst) 189 190 if flushFirst { 191 inFlightFileInodeData.volStruct.inFlightFileInodeDataFlusher(inFlightFileInodeData.InodeNumber) 192 } 193 194 inFlightFileInodeData.wg.Done() 195 } 196 197 func fetchVolumeHandleByAccountName(accountName string) 
(volumeHandle VolumeHandle, err error) { 198 var ( 199 ok bool 200 vS *volumeStruct 201 volumeName string 202 ) 203 204 startTime := time.Now() 205 defer func() { 206 globals.FetchVolumeHandleUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 207 if err != nil { 208 globals.FetchVolumeHandleErrors.Add(1) 209 } 210 }() 211 212 globals.Lock() 213 214 volumeName, ok = inode.AccountNameToVolumeName(accountName) 215 if !ok { 216 err = fmt.Errorf("Unknown accountName passed to mountByAccountName(): \"%s\"", accountName) 217 err = blunder.AddError(err, blunder.NotFoundError) 218 globals.Unlock() 219 return 220 } 221 222 vS, ok = globals.volumeMap[volumeName] 223 if !ok { 224 err = fmt.Errorf("Unknown volumeName computed by mountByAccountName(): \"%s\"", volumeName) 225 err = blunder.AddError(err, blunder.NotFoundError) 226 globals.Unlock() 227 return 228 } 229 230 globals.Unlock() 231 232 volumeHandle = vS 233 err = nil 234 235 return 236 } 237 238 func fetchVolumeHandleByVolumeName(volumeName string) (volumeHandle VolumeHandle, err error) { 239 var ( 240 ok bool 241 vS *volumeStruct 242 ) 243 244 startTime := time.Now() 245 defer func() { 246 globals.FetchVolumeHandleUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 247 if err != nil { 248 globals.FetchVolumeHandleErrors.Add(1) 249 } 250 }() 251 252 globals.Lock() 253 254 vS, ok = globals.volumeMap[volumeName] 255 if !ok { 256 err = fmt.Errorf("Unknown volumeName passed to mountByVolumeName(): \"%s\"", volumeName) 257 err = blunder.AddError(err, blunder.NotFoundError) 258 globals.Unlock() 259 return 260 } 261 262 globals.Unlock() 263 264 volumeHandle = vS 265 err = nil 266 267 return 268 } 269 270 func (vS *volumeStruct) Access(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, inodeNumber inode.InodeNumber, accessMode inode.InodeMode) (accessReturn bool) { 271 startTime := time.Now() 272 defer func() { 273 globals.AccessUsec.Add(uint64(time.Since(startTime) / 
time.Microsecond)) 274 }() 275 276 vS.jobRWMutex.RLock() 277 defer vS.jobRWMutex.RUnlock() 278 279 accessReturn = vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, accessMode, 280 inode.NoOverride) 281 return 282 } 283 284 func (vS *volumeStruct) CallInodeToProvisionObject() (pPath string, err error) { 285 startTime := time.Now() 286 defer func() { 287 globals.CallInodeToProvisionObjectUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 288 if err != nil { 289 globals.CallInodeToProvisionObjectErrors.Add(1) 290 } 291 }() 292 293 vS.jobRWMutex.RLock() 294 defer vS.jobRWMutex.RUnlock() 295 296 pPath, err = vS.inodeVolumeHandle.ProvisionObject() 297 return 298 } 299 300 func (vS *volumeStruct) Create(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, dirInodeNumber inode.InodeNumber, basename string, filePerm inode.InodeMode) (fileInodeNumber inode.InodeNumber, err error) { 301 startTime := time.Now() 302 defer func() { 303 globals.CreateUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 304 if err != nil { 305 globals.CreateErrors.Add(1) 306 } 307 }() 308 309 vS.jobRWMutex.RLock() 310 defer vS.jobRWMutex.RUnlock() 311 312 err = validateBaseName(basename) 313 if err != nil { 314 return 0, err 315 } 316 317 // Lock the directory inode before doing the link 318 dirInodeLock, err := vS.inodeVolumeHandle.InitInodeLock(dirInodeNumber, nil) 319 if err != nil { 320 return 0, err 321 } 322 err = dirInodeLock.WriteLock() 323 if err != nil { 324 return 0, err 325 } 326 defer dirInodeLock.Unlock() 327 328 if !vS.inodeVolumeHandle.Access(dirInodeNumber, userID, groupID, otherGroupIDs, inode.F_OK, 329 inode.NoOverride) { 330 return 0, blunder.NewError(blunder.NotFoundError, "ENOENT") 331 } 332 if !vS.inodeVolumeHandle.Access(dirInodeNumber, userID, groupID, otherGroupIDs, inode.W_OK|inode.X_OK, 333 inode.NoOverride) { 334 return 0, blunder.NewError(blunder.PermDeniedError, "EACCES") 335 } 336 337 // 
create the file and add it to the directory 338 fileInodeNumber, err = vS.inodeVolumeHandle.CreateFile(filePerm, userID, groupID) 339 if err != nil { 340 return 0, err 341 } 342 343 err = vS.inodeVolumeHandle.Link(dirInodeNumber, basename, fileInodeNumber, false) 344 if err != nil { 345 destroyErr := vS.inodeVolumeHandle.Destroy(fileInodeNumber) 346 if destroyErr != nil { 347 logger.WarnfWithError(destroyErr, "couldn't destroy inode %v after failed Link() in fs.Create", fileInodeNumber) 348 } 349 return 0, err 350 } 351 352 return fileInodeNumber, nil 353 } 354 355 func (vS *volumeStruct) DefragmentFile(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, fileInodeNumber inode.InodeNumber) (err error) { 356 var ( 357 eofReached bool 358 fileOffset uint64 359 inodeLock *dlm.RWLockStruct 360 inodeType inode.InodeType 361 ) 362 363 startTime := time.Now() 364 defer func() { 365 globals.DefragmentFileUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 366 if err != nil { 367 globals.DefragmentFileErrors.Add(1) 368 } 369 }() 370 371 vS.jobRWMutex.RLock() 372 373 inodeLock, err = vS.inodeVolumeHandle.InitInodeLock(fileInodeNumber, nil) 374 if nil != err { 375 vS.jobRWMutex.RUnlock() 376 return 377 } 378 err = inodeLock.WriteLock() 379 if nil != err { 380 vS.jobRWMutex.RUnlock() 381 return 382 } 383 384 if !vS.inodeVolumeHandle.Access(fileInodeNumber, userID, groupID, otherGroupIDs, inode.F_OK, 385 inode.NoOverride) { 386 _ = inodeLock.Unlock() 387 vS.jobRWMutex.RUnlock() 388 err = blunder.NewError(blunder.NotFoundError, "ENOENT") 389 return 390 } 391 if !vS.inodeVolumeHandle.Access(fileInodeNumber, userID, groupID, otherGroupIDs, inode.W_OK, 392 inode.OwnerOverride) { 393 _ = inodeLock.Unlock() 394 vS.jobRWMutex.RUnlock() 395 err = blunder.NewError(blunder.PermDeniedError, "EACCES") 396 return 397 } 398 399 inodeType, err = vS.inodeVolumeHandle.GetType(fileInodeNumber) 400 if nil != err { 401 _ = inodeLock.Unlock() 402 
vS.jobRWMutex.RUnlock() 403 logger.ErrorfWithError(err, "couldn't get type for inode %v", fileInodeNumber) 404 return 405 } 406 // Make sure the inode number is for a file inode 407 if inodeType != inode.FileType { 408 _ = inodeLock.Unlock() 409 vS.jobRWMutex.RUnlock() 410 err = fmt.Errorf("%s: expected inode %v to be a file inode, got %v", utils.GetFnName(), fileInodeNumber, inodeType) 411 logger.ErrorWithError(err) 412 err = blunder.AddError(err, blunder.NotFileError) 413 return 414 } 415 416 fileOffset = 0 417 418 for { 419 fileOffset, eofReached, err = vS.inodeVolumeHandle.DefragmentFile(fileInodeNumber, fileOffset, vS.fileDefragmentChunkSize) 420 _ = inodeLock.Unlock() 421 vS.jobRWMutex.RUnlock() 422 if nil != err { 423 return 424 } 425 if eofReached { 426 return 427 } 428 time.Sleep(vS.fileDefragmentChunkDelay) 429 vS.jobRWMutex.RLock() 430 err = inodeLock.WriteLock() 431 if nil != err { 432 vS.jobRWMutex.RUnlock() 433 return 434 } 435 } 436 } 437 438 func (vS *volumeStruct) FetchExtentMapChunk(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, fileInodeNumber inode.InodeNumber, fileOffset uint64, maxEntriesFromFileOffset int64, maxEntriesBeforeFileOffset int64) (extentMapChunk *inode.ExtentMapChunkStruct, err error) { 439 var ( 440 inodeLock *dlm.RWLockStruct 441 inodeType inode.InodeType 442 ) 443 444 startTime := time.Now() 445 defer func() { 446 globals.FetchExtentMapChunkUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 447 if err != nil { 448 globals.FetchExtentMapChunkErrors.Add(1) 449 } 450 }() 451 452 vS.jobRWMutex.RLock() 453 defer vS.jobRWMutex.RUnlock() 454 455 inodeLock, err = vS.inodeVolumeHandle.InitInodeLock(fileInodeNumber, nil) 456 if nil != err { 457 return 458 } 459 err = inodeLock.ReadLock() 460 if nil != err { 461 return 462 } 463 defer inodeLock.Unlock() 464 465 if !vS.inodeVolumeHandle.Access(fileInodeNumber, userID, groupID, otherGroupIDs, inode.F_OK, 466 inode.NoOverride) { 467 err = 
blunder.NewError(blunder.NotFoundError, "ENOENT") 468 return 469 } 470 if !vS.inodeVolumeHandle.Access(fileInodeNumber, userID, groupID, otherGroupIDs, inode.R_OK, 471 inode.OwnerOverride) { 472 err = blunder.NewError(blunder.PermDeniedError, "EACCES") 473 return 474 } 475 476 inodeType, err = vS.inodeVolumeHandle.GetType(fileInodeNumber) 477 if nil != err { 478 logger.ErrorfWithError(err, "couldn't get type for inode %v", fileInodeNumber) 479 return 480 } 481 // Make sure the inode number is for a file inode 482 if inodeType != inode.FileType { 483 err = fmt.Errorf("%s: expected inode %v to be a file inode, got %v", utils.GetFnName(), fileInodeNumber, inodeType) 484 logger.ErrorWithError(err) 485 err = blunder.AddError(err, blunder.NotFileError) 486 return 487 } 488 489 extentMapChunk, err = vS.inodeVolumeHandle.FetchExtentMapChunk(fileInodeNumber, fileOffset, maxEntriesFromFileOffset, maxEntriesBeforeFileOffset) 490 491 return 492 } 493 494 // doInlineCheckpointIfEnabled is called whenever we must guarantee that reported state changes 495 // are, indeed, persisted. Absent any sort of persistent transaction log, this means performing 496 // a checkpoint unfortunately. 497 // 498 // Currently, only explicitly invoked Flushes trigger this. But, actually, any Swift/S3 API call 499 // that modifies Objects or (what the client thinks are) Containers should also. 500 // 501 // TODO is to determine where else a call to this func should also be made. 
502 // 503 func (vS *volumeStruct) doInlineCheckpointIfEnabled() { 504 var ( 505 err error 506 ) 507 508 if !vS.doCheckpointPerFlush { 509 return 510 } 511 512 err = vS.headhunterVolumeHandle.DoCheckpoint() 513 if nil != err { 514 logger.Fatalf("fs.doInlineCheckpoint() call to headhunter.DoCheckpoint() failed: %v", err) 515 } 516 } 517 518 func (vS *volumeStruct) Flush(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, inodeNumber inode.InodeNumber) (err error) { 519 startTime := time.Now() 520 defer func() { 521 globals.FlushUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 522 if err != nil { 523 globals.FlushErrors.Add(1) 524 } 525 }() 526 527 vS.jobRWMutex.RLock() 528 defer vS.jobRWMutex.RUnlock() 529 530 inodeLock, err := vS.inodeVolumeHandle.InitInodeLock(inodeNumber, nil) 531 if err != nil { 532 return 533 } 534 err = inodeLock.WriteLock() 535 if err != nil { 536 return 537 } 538 defer inodeLock.Unlock() 539 540 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.F_OK, 541 inode.NoOverride) { 542 return blunder.NewError(blunder.NotFoundError, "ENOENT") 543 } 544 545 // Note: We'd normally check EACCES here...but there are paths in FUSE (e.g. when files are 546 // closed) that end up calling Flush()...even though the file was "opened" ReadOnly. 547 // This is presumably to support updated of ATime and such. In any event, an EACCESS 548 // check would fail if the caller actually only had ReadOnly access to the Inode, so 549 // we won't be doing the check here. 
550 551 err = vS.inodeVolumeHandle.Flush(inodeNumber, false) 552 vS.untrackInFlightFileInodeData(inodeNumber, false) 553 554 vS.doInlineCheckpointIfEnabled() 555 556 return 557 } 558 559 func (vS *volumeStruct) getFileLockList(inodeNumber inode.InodeNumber) (flockList *list.List) { 560 vS.dataMutex.Lock() 561 defer vS.dataMutex.Unlock() 562 563 flockList, ok := vS.FLockMap[inodeNumber] 564 if !ok { 565 flockList = new(list.List) 566 vS.FLockMap[inodeNumber] = flockList 567 } 568 569 return 570 } 571 572 // Check for lock conflict with other Pids, if there is a conflict then it will return the first occurance of conflicting range. 573 func checkConflict(elm *FlockStruct, flock *FlockStruct) bool { 574 575 if flock.Pid == elm.Pid { 576 return false 577 } 578 579 if (elm.Start+elm.Len) <= flock.Start || (flock.Start+flock.Len) <= elm.Start { 580 return false 581 } 582 583 if (flock.Type == syscall.F_WRLCK) || (elm.Type == syscall.F_WRLCK) { 584 return true 585 } 586 587 return false 588 } 589 590 func (vS *volumeStruct) verifyLock(inodeNumber inode.InodeNumber, flock *FlockStruct) (conflictLock *FlockStruct) { 591 flockList := vS.getFileLockList(inodeNumber) 592 593 for e := flockList.Front(); e != nil; e = e.Next() { 594 elm := e.Value.(*FlockStruct) 595 596 if checkConflict(elm, flock) == true { 597 return elm 598 } 599 } 600 601 return nil 602 } 603 604 // Insert a file lock range to corresponding lock list for the pid. 605 // Assumption: There is no lock conflict and the range that is being inserted has no conflict and is free. 606 func (vS *volumeStruct) fileLockInsert(inodeNumber inode.InodeNumber, inFlock *FlockStruct) (err error) { 607 err = nil 608 flockList := vS.getFileLockList(inodeNumber) 609 610 overlapList := new(list.List) 611 var beforeElm *list.Element // Refers to the immediate element that starts before the start of the range. 612 var afterElm *list.Element // Refers to the immediate element that starts after the end of the range. 
613 614 // flockList is sorted by starting offset of the range. 615 // Inserting a range happens in two steps. 1) Check if there is any conflict and also identify the 616 // point in the list where the entry will be added (before and after elements) 2) Then check if 617 // the range can extend the before element, if so adjust it. 3) Simillarly, check if the after 618 // element can be collapsed if it forms a contiguous range. 619 620 for e := flockList.Front(); e != nil; e = e.Next() { 621 elm := e.Value.(*FlockStruct) 622 623 if (elm.Start + elm.Len) <= inFlock.Start { 624 beforeElm = e 625 continue 626 } 627 628 if elm.Start > (inFlock.Start + inFlock.Len) { 629 afterElm = e 630 if overlapList.Len() == 0 { 631 flockList.InsertBefore(inFlock, e) 632 return 633 } 634 635 break 636 } 637 638 if checkConflict(elm, inFlock) { 639 err = blunder.AddError(nil, blunder.TryAgainError) 640 return 641 } 642 643 if elm.Pid == inFlock.Pid { 644 overlapList.PushBack(e) 645 } 646 } 647 648 if overlapList.Len() == 0 { 649 if beforeElm != nil { 650 elm := beforeElm.Value.(*FlockStruct) 651 if elm.Pid == inFlock.Pid && elm.Type == inFlock.Type && (elm.Start+elm.Len) == inFlock.Start { 652 elm.Len = inFlock.Start + inFlock.Len - elm.Len 653 } else { 654 flockList.InsertAfter(inFlock, beforeElm) 655 } 656 } else { 657 flockList.PushBack(inFlock) 658 } 659 660 return 661 } 662 663 // Look at the last element in the overlapping list 664 lastEnt := overlapList.Back() 665 e := lastEnt.Value.(*list.Element) 666 elm := e.Value.(*FlockStruct) 667 if (elm.Start + elm.Len) > (inFlock.Start + inFlock.Len) { 668 inFlock.Len = (elm.Start + elm.Len) - inFlock.Start 669 } 670 671 // We can delete all the entries in the overlapping list. These entries are replaced by 672 // the range we are inserting. 
673 for e := overlapList.Front(); e != nil; e = e.Next() { 674 entry := e.Value.(*list.Element) 675 flockList.Remove(entry) 676 } 677 678 // Now adjust the before and after entries: 679 // First adjust the after: 680 if afterElm != nil { 681 elm := afterElm.Value.(*FlockStruct) 682 if elm.Pid == inFlock.Pid && elm.Type == inFlock.Type && (inFlock.Start+inFlock.Len) == elm.Start { 683 // We can collapse the entry: 684 elm.Len = elm.Start + elm.Len - inFlock.Start 685 elm.Start = inFlock.Start 686 687 if beforeElm != nil { 688 belm := beforeElm.Value.(*FlockStruct) 689 if belm.Pid == elm.Pid && belm.Type == elm.Type && (belm.Start+belm.Len) == elm.Start { 690 belm.Len = elm.Start + elm.Len - belm.Start 691 flockList.Remove(afterElm) 692 } 693 } 694 695 return 696 } 697 } 698 699 if beforeElm != nil { 700 belm := beforeElm.Value.(*FlockStruct) 701 if belm.Pid == inFlock.Pid && belm.Type == inFlock.Type && (belm.Start+belm.Len) == inFlock.Start { 702 belm.Len = inFlock.Start + inFlock.Len - belm.Start 703 } 704 705 flockList.InsertAfter(inFlock, beforeElm) 706 return 707 } 708 709 if afterElm != nil { 710 flockList.InsertBefore(inFlock, afterElm) 711 } else { 712 flockList.PushBack(inFlock) 713 } 714 715 return 716 717 } 718 719 // Unlock a given range. All locks held in this range by the process (identified by Pid) are removed. 
720 func (vS *volumeStruct) fileUnlock(inodeNumber inode.InodeNumber, inFlock *FlockStruct) (err error) { 721 722 flockList := vS.getFileLockList(inodeNumber) 723 if flockList == nil { 724 logger.Warnf("Unlock of a region not already locked - %+v", inFlock) 725 return 726 } 727 728 start := inFlock.Start 729 len := inFlock.Len 730 731 removeList := new(list.List) 732 733 for e := flockList.Front(); e != nil; e = e.Next() { 734 elm := e.Value.(*FlockStruct) 735 736 if elm.Pid != inFlock.Pid { 737 continue 738 } 739 740 if (elm.Start + elm.Len) < start { 741 continue 742 } 743 744 if elm.Start >= (start + len) { 745 break 746 } 747 748 // If the lock falls completely in the range, delete it. 749 if elm.Start >= start && (elm.Start+elm.Len) <= (start+len) { 750 removeList.PushBack(e) 751 continue 752 } 753 754 // This lock overlapps with the range - three possibalities 1) lock starts before the range, 2) end after range and 3) both. 755 756 elmLen := elm.Start + elm.Len // Save the original length, it is required in case of #3 (both) 757 758 if elm.Start < start { // Handle the first part - lock starts before the range. 759 elm.Len = start - elm.Start 760 } 761 762 if elmLen > (start + len) { // Lock extends beyond the unlock range. 763 if elm.Start > start { // case #2 764 // use the existing record 765 elm.Start = start + len 766 elm.Len = elmLen - elm.Start 767 break 768 } 769 770 // Create a new record - handle case #3 both (starts before the range and extends beyond the range) 771 elmTail := new(FlockStruct) 772 elmTail.Start = start + len 773 elmTail.Len = elmLen - elm.Start 774 elmTail.Pid = elm.Pid 775 elmTail.Type = elm.Type 776 elmTail.Whence = elm.Whence 777 flockList.InsertAfter(elmTail, e) 778 break 779 } 780 } 781 782 for e := removeList.Front(); e != nil; e = e.Next() { 783 elm := e.Value.(*list.Element) 784 flockList.Remove(elm) 785 } 786 787 return 788 } 789 790 // Implements file locking conforming to fcntl(2) locking description. 
F_SETLKW is not implemented. Supports F_SETLW and F_GETLW. 791 // whence: FS supports only SEEK_SET - starting from 0, since it does not manage file handles, caller is expected to supply the start and length relative to offset ZERO. 792 func (vS *volumeStruct) Flock(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, inodeNumber inode.InodeNumber, lockCmd int32, inFlock *FlockStruct) (outFlock *FlockStruct, err error) { 793 startTime := time.Now() 794 defer func() { 795 switch lockCmd { 796 797 case syscall.F_GETLK: 798 globals.FlockGetUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 799 if err != nil { 800 globals.FlockGetErrors.Add(1) 801 } 802 803 case syscall.F_SETLK: 804 if inFlock.Type == syscall.F_UNLCK { 805 globals.FlockUnlockUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 806 if err != nil { 807 globals.FlockUnlockErrors.Add(1) 808 } 809 810 } else if inFlock.Type == syscall.F_WRLCK || inFlock.Type == syscall.F_RDLCK { 811 globals.FlockLockUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 812 if err != nil { 813 globals.FlockLockErrors.Add(1) 814 } 815 } else { 816 globals.FlockOtherErrors.Add(1) 817 } 818 819 default: 820 globals.FlockOtherErrors.Add(1) 821 } 822 823 }() 824 825 vS.jobRWMutex.RLock() 826 defer vS.jobRWMutex.RUnlock() 827 828 outFlock = inFlock 829 830 if lockCmd == syscall.F_SETLKW { 831 err = blunder.AddError(nil, blunder.NotSupportedError) 832 return 833 } 834 835 // Make sure the inode does not go away, while we are applying the flock. 
836 inodeLock, err := vS.inodeVolumeHandle.InitInodeLock(inodeNumber, nil) 837 if err != nil { 838 return 839 } 840 err = inodeLock.ReadLock() 841 if err != nil { 842 return 843 } 844 defer inodeLock.Unlock() 845 846 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.F_OK, inode.NoOverride) { 847 err = blunder.NewError(blunder.NotFoundError, "ENOENT") 848 return 849 } 850 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.R_OK, inode.OwnerOverride) { 851 err = blunder.NewError(blunder.PermDeniedError, "EACCES") 852 return 853 } 854 855 if inFlock.Len == 0 { // If length is ZERO means treat it as whole file. 856 inFlock.Len = ^uint64(0) 857 } 858 859 switch lockCmd { 860 case syscall.F_GETLK: 861 conflictLock := vS.verifyLock(inodeNumber, inFlock) 862 if conflictLock != nil { 863 outFlock = conflictLock 864 err = blunder.AddError(nil, blunder.TryAgainError) 865 } else { 866 outFlock = inFlock 867 outFlock.Type = syscall.F_UNLCK 868 } 869 break 870 871 case syscall.F_SETLK: 872 if inFlock.Type == syscall.F_UNLCK { 873 err = vS.fileUnlock(inodeNumber, inFlock) 874 875 } else if inFlock.Type == syscall.F_WRLCK || inFlock.Type == syscall.F_RDLCK { 876 err = vS.fileLockInsert(inodeNumber, inFlock) 877 878 } else { 879 err = blunder.NewError(blunder.InvalidArgError, "EINVAL") 880 return 881 } 882 break 883 884 default: 885 err = blunder.NewError(blunder.InvalidArgError, "EINVAL") 886 return 887 } 888 889 return 890 } 891 892 func (vS *volumeStruct) getstatHelper(inodeNumber inode.InodeNumber, callerID dlm.CallerID) (stat Stat, err error) { 893 894 lockID, err := vS.inodeVolumeHandle.MakeLockID(inodeNumber) 895 if err != nil { 896 return 897 } 898 if !dlm.IsLockHeld(lockID, callerID, dlm.ANYLOCK) { 899 err = fmt.Errorf("%s: inode %v lock must be held before calling", utils.GetFnName(), inodeNumber) 900 return nil, blunder.AddError(err, blunder.NotFoundError) 901 } 902 903 stat, err = 
vS.getstatHelperWhileLocked(inodeNumber) 904 905 return 906 } 907 908 func (vS *volumeStruct) getstatHelperWhileLocked(inodeNumber inode.InodeNumber) (stat Stat, err error) { 909 var ( 910 metadata *inode.MetadataStruct 911 ) 912 913 metadata, err = vS.inodeVolumeHandle.GetMetadata(inodeNumber) 914 if nil != err { 915 return 916 } 917 918 stat = make(map[StatKey]uint64) 919 920 stat[StatCRTime] = uint64(metadata.CreationTime.UnixNano()) 921 stat[StatMTime] = uint64(metadata.ModificationTime.UnixNano()) 922 stat[StatCTime] = uint64(metadata.AttrChangeTime.UnixNano()) 923 stat[StatATime] = uint64(metadata.AccessTime.UnixNano()) 924 stat[StatSize] = metadata.Size 925 stat[StatNLink] = metadata.LinkCount 926 stat[StatFType] = uint64(metadata.InodeType) 927 stat[StatINum] = uint64(inodeNumber) 928 stat[StatMode] = uint64(metadata.Mode) 929 stat[StatUserID] = uint64(metadata.UserID) 930 stat[StatGroupID] = uint64(metadata.GroupID) 931 stat[StatNumWrites] = metadata.NumWrites 932 933 return 934 } 935 936 func (vS *volumeStruct) Getstat(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, inodeNumber inode.InodeNumber) (stat Stat, err error) { 937 startTime := time.Now() 938 defer func() { 939 globals.GetstatUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 940 if err != nil { 941 globals.GetstatErrors.Add(1) 942 } 943 }() 944 945 vS.jobRWMutex.RLock() 946 defer vS.jobRWMutex.RUnlock() 947 948 inodeLock, err := vS.inodeVolumeHandle.InitInodeLock(inodeNumber, nil) 949 if err != nil { 950 return 951 } 952 err = inodeLock.ReadLock() 953 if err != nil { 954 return 955 } 956 defer inodeLock.Unlock() 957 958 // Call getstat helper function to do the work 959 return vS.getstatHelper(inodeNumber, inodeLock.GetCallerID()) 960 } 961 962 func (vS *volumeStruct) getTypeHelper(inodeNumber inode.InodeNumber, callerID dlm.CallerID) (inodeType inode.InodeType, err error) { 963 964 lockID, err := vS.inodeVolumeHandle.MakeLockID(inodeNumber) 965 
if err != nil { 966 return 967 } 968 if !dlm.IsLockHeld(lockID, callerID, dlm.ANYLOCK) { 969 err = fmt.Errorf("%s: inode %v lock must be held before calling.", utils.GetFnName(), inodeNumber) 970 err = blunder.AddError(err, blunder.NotFoundError) 971 return 972 } 973 974 inodeType, err = vS.inodeVolumeHandle.GetType(inodeNumber) 975 if err != nil { 976 logger.ErrorWithError(err, "couldn't get inode type") 977 return inodeType, err 978 } 979 return inodeType, nil 980 } 981 982 func (vS *volumeStruct) GetType(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, inodeNumber inode.InodeNumber) (inodeType inode.InodeType, err error) { 983 startTime := time.Now() 984 defer func() { 985 globals.GetTypeUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 986 if err != nil { 987 globals.GetTypeErrors.Add(1) 988 } 989 }() 990 991 vS.jobRWMutex.RLock() 992 defer vS.jobRWMutex.RUnlock() 993 994 inodeLock, err := vS.inodeVolumeHandle.InitInodeLock(inodeNumber, nil) 995 if err != nil { 996 return 997 } 998 err = inodeLock.ReadLock() 999 if err != nil { 1000 return 1001 } 1002 defer inodeLock.Unlock() 1003 1004 return vS.getTypeHelper(inodeNumber, inodeLock.GetCallerID()) 1005 } 1006 1007 func (vS *volumeStruct) GetXAttr(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, inodeNumber inode.InodeNumber, streamName string) (value []byte, err error) { 1008 startTime := time.Now() 1009 defer func() { 1010 globals.GetXAttrUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 1011 if err != nil { 1012 globals.GetXAttrErrors.Add(1) 1013 } 1014 }() 1015 1016 vS.jobRWMutex.RLock() 1017 defer vS.jobRWMutex.RUnlock() 1018 1019 inodeLock, err := vS.inodeVolumeHandle.InitInodeLock(inodeNumber, nil) 1020 if err != nil { 1021 return 1022 } 1023 err = inodeLock.ReadLock() 1024 if err != nil { 1025 return 1026 } 1027 defer inodeLock.Unlock() 1028 1029 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, 
otherGroupIDs, inode.F_OK, 1030 inode.NoOverride) { 1031 err = blunder.NewError(blunder.NotFoundError, "ENOENT") 1032 return 1033 } 1034 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.R_OK, 1035 inode.OwnerOverride) { 1036 err = blunder.NewError(blunder.PermDeniedError, "EACCES") 1037 return 1038 } 1039 1040 value, err = vS.inodeVolumeHandle.GetStream(inodeNumber, streamName) 1041 if err != nil { 1042 // Did not find the requested stream. However this isn't really an error since 1043 // samba will ask for acl-related streams and is fine with not finding them. 1044 logger.TracefWithError(err, "Failed to get XAttr %v of inode %v", streamName, inodeNumber) 1045 } 1046 1047 return 1048 } 1049 1050 func (vS *volumeStruct) IsDir(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, inodeNumber inode.InodeNumber) (inodeIsDir bool, err error) { 1051 startTime := time.Now() 1052 defer func() { 1053 globals.IsDirUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 1054 if err != nil { 1055 globals.IsDirErrors.Add(1) 1056 } 1057 }() 1058 1059 vS.jobRWMutex.RLock() 1060 defer vS.jobRWMutex.RUnlock() 1061 1062 inodeLock, err := vS.inodeVolumeHandle.InitInodeLock(inodeNumber, nil) 1063 if err != nil { 1064 return 1065 } 1066 err = inodeLock.ReadLock() 1067 if err != nil { 1068 return 1069 } 1070 defer inodeLock.Unlock() 1071 1072 lockID, err := vS.inodeVolumeHandle.MakeLockID(inodeNumber) 1073 if err != nil { 1074 return 1075 } 1076 if !dlm.IsLockHeld(lockID, inodeLock.GetCallerID(), dlm.ANYLOCK) { 1077 err = fmt.Errorf("%s: inode %v lock must be held before calling", utils.GetFnName(), inodeNumber) 1078 return false, blunder.AddError(err, blunder.NotFoundError) 1079 } 1080 1081 inodeType, err := vS.inodeVolumeHandle.GetType(inodeNumber) 1082 if err != nil { 1083 return false, err 1084 } 1085 return inodeType == inode.DirType, nil 1086 } 1087 1088 func (vS *volumeStruct) IsFile(userID 
inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, inodeNumber inode.InodeNumber) (inodeIsFile bool, err error) { 1089 startTime := time.Now() 1090 defer func() { 1091 globals.IsFileUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 1092 if err != nil { 1093 globals.IsFileErrors.Add(1) 1094 } 1095 }() 1096 1097 vS.jobRWMutex.RLock() 1098 defer vS.jobRWMutex.RUnlock() 1099 1100 inodeLock, err := vS.inodeVolumeHandle.InitInodeLock(inodeNumber, nil) 1101 if err != nil { 1102 return 1103 } 1104 err = inodeLock.ReadLock() 1105 if err != nil { 1106 return 1107 } 1108 defer inodeLock.Unlock() 1109 1110 inodeType, err := vS.inodeVolumeHandle.GetType(inodeNumber) 1111 if err != nil { 1112 return false, err 1113 } 1114 1115 return inodeType == inode.FileType, nil 1116 } 1117 1118 func (vS *volumeStruct) IsSymlink(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, inodeNumber inode.InodeNumber) (inodeIsSymlink bool, err error) { 1119 startTime := time.Now() 1120 defer func() { 1121 globals.IsSymlinkUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 1122 if err != nil { 1123 globals.IsSymlinkErrors.Add(1) 1124 } 1125 }() 1126 1127 vS.jobRWMutex.RLock() 1128 defer vS.jobRWMutex.RUnlock() 1129 1130 inodeLock, err := vS.inodeVolumeHandle.InitInodeLock(inodeNumber, nil) 1131 if err != nil { 1132 return 1133 } 1134 err = inodeLock.ReadLock() 1135 if err != nil { 1136 return 1137 } 1138 defer inodeLock.Unlock() 1139 1140 inodeType, err := vS.inodeVolumeHandle.GetType(inodeNumber) 1141 if err != nil { 1142 return false, err 1143 } 1144 1145 return inodeType == inode.SymlinkType, nil 1146 } 1147 1148 func (vS *volumeStruct) Link(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, dirInodeNumber inode.InodeNumber, basename string, targetInodeNumber inode.InodeNumber) (err error) { 1149 startTime := time.Now() 1150 defer func() { 1151 
globals.LinkUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 1152 if err != nil { 1153 globals.LinkErrors.Add(1) 1154 } 1155 }() 1156 1157 vS.jobRWMutex.RLock() 1158 defer vS.jobRWMutex.RUnlock() 1159 1160 var ( 1161 inodeType inode.InodeType 1162 ) 1163 1164 err = validateBaseName(basename) 1165 if err != nil { 1166 return 1167 } 1168 1169 // We need both dirInodelock and the targetInode lock to make sure they 1170 // don't go away and linkCount is updated correctly. 1171 callerID := dlm.GenerateCallerID() 1172 dirInodeLock, err := vS.inodeVolumeHandle.InitInodeLock(dirInodeNumber, callerID) 1173 if err != nil { 1174 return 1175 } 1176 1177 targetInodeLock, err := vS.inodeVolumeHandle.InitInodeLock(targetInodeNumber, callerID) 1178 if err != nil { 1179 return 1180 } 1181 1182 // Lock the target inode to check its type and insure its not a directory (if it is a 1183 // directory then locking it after the target directory could result in deadlock). 1184 err = targetInodeLock.WriteLock() 1185 if err != nil { 1186 return 1187 } 1188 1189 // make sure target inode is not a directory 1190 inodeType, err = vS.inodeVolumeHandle.GetType(targetInodeNumber) 1191 if err != nil { 1192 targetInodeLock.Unlock() 1193 // Because we know that GetType() has already "blunderized" the error, we just pass it on 1194 logger.ErrorfWithError(err, "%s: couldn't get type for inode %v", utils.GetFnName(), targetInodeNumber) 1195 return err 1196 } 1197 if inodeType == inode.DirType { 1198 targetInodeLock.Unlock() 1199 // no need to print an error when its a mistake by the client 1200 err = fmt.Errorf("%s: inode %v cannot be a dir inode", utils.GetFnName(), targetInodeNumber) 1201 return blunder.AddError(err, blunder.LinkDirError) 1202 } 1203 1204 // drop the target inode lock so we can get the directory lock then 1205 // reget the target inode lock 1206 targetInodeLock.Unlock() 1207 1208 err = dirInodeLock.WriteLock() 1209 if err != nil { 1210 return 1211 } 1212 defer 
dirInodeLock.Unlock() 1213 1214 err = targetInodeLock.WriteLock() 1215 if err != nil { 1216 return 1217 } 1218 defer targetInodeLock.Unlock() 1219 1220 if !vS.inodeVolumeHandle.Access(dirInodeNumber, userID, groupID, otherGroupIDs, inode.F_OK, 1221 inode.NoOverride) { 1222 err = blunder.NewError(blunder.NotFoundError, "ENOENT") 1223 return 1224 } 1225 if !vS.inodeVolumeHandle.Access(targetInodeNumber, userID, groupID, otherGroupIDs, inode.F_OK, 1226 inode.NoOverride) { 1227 err = blunder.NewError(blunder.NotFoundError, "ENOENT") 1228 return 1229 } 1230 if !vS.inodeVolumeHandle.Access(dirInodeNumber, userID, groupID, otherGroupIDs, inode.W_OK|inode.X_OK, 1231 inode.NoOverride) { 1232 err = blunder.NewError(blunder.PermDeniedError, "EACCES") 1233 return 1234 } 1235 1236 err = vS.inodeVolumeHandle.Link(dirInodeNumber, basename, targetInodeNumber, false) 1237 1238 // if the link was successful and this is a regular file then any 1239 // pending data was flushed 1240 if err == nil && inodeType == inode.FileType { 1241 vS.untrackInFlightFileInodeData(targetInodeNumber, false) 1242 } 1243 1244 return err 1245 } 1246 1247 func (vS *volumeStruct) ListXAttr(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, inodeNumber inode.InodeNumber) (streamNames []string, err error) { 1248 startTime := time.Now() 1249 defer func() { 1250 globals.ListXAttrUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 1251 if err != nil { 1252 globals.ListXAttrErrors.Add(1) 1253 } 1254 }() 1255 1256 vS.jobRWMutex.RLock() 1257 defer vS.jobRWMutex.RUnlock() 1258 1259 inodeLock, err := vS.inodeVolumeHandle.InitInodeLock(inodeNumber, nil) 1260 if err != nil { 1261 return 1262 } 1263 err = inodeLock.ReadLock() 1264 if err != nil { 1265 return 1266 } 1267 defer inodeLock.Unlock() 1268 1269 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.F_OK, 1270 inode.NoOverride) { 1271 err = blunder.NewError(blunder.NotFoundError, 
"ENOENT") 1272 return 1273 } 1274 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.R_OK, 1275 inode.OwnerOverride) { 1276 err = blunder.NewError(blunder.PermDeniedError, "EACCES") 1277 return 1278 } 1279 1280 metadata, err := vS.inodeVolumeHandle.GetMetadata(inodeNumber) 1281 if err != nil { 1282 // Did not find the requested stream. However this isn't really an error since 1283 // samba will ask for acl-related streams and is fine with not finding them. 1284 logger.TracefWithError(err, "Failed to list XAttrs of inode %v", inodeNumber) 1285 return 1286 } 1287 1288 streamNames = make([]string, len(metadata.InodeStreamNameSlice)) 1289 copy(streamNames, metadata.InodeStreamNameSlice) 1290 return 1291 } 1292 1293 func (vS *volumeStruct) Lookup(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, dirInodeNumber inode.InodeNumber, basename string) (inodeNumber inode.InodeNumber, err error) { 1294 startTime := time.Now() 1295 defer func() { 1296 globals.LookupUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 1297 if err != nil { 1298 globals.LookupErrors.Add(1) 1299 } 1300 }() 1301 1302 vS.jobRWMutex.RLock() 1303 defer vS.jobRWMutex.RUnlock() 1304 1305 dirInodeLock, err := vS.inodeVolumeHandle.InitInodeLock(dirInodeNumber, nil) 1306 if err != nil { 1307 return 1308 } 1309 dirInodeLock.ReadLock() 1310 defer dirInodeLock.Unlock() 1311 1312 if !vS.inodeVolumeHandle.Access(dirInodeNumber, userID, groupID, otherGroupIDs, inode.F_OK, 1313 inode.NoOverride) { 1314 err = blunder.NewError(blunder.NotFoundError, "ENOENT") 1315 return 1316 } 1317 if !vS.inodeVolumeHandle.Access(dirInodeNumber, userID, groupID, otherGroupIDs, inode.X_OK, 1318 inode.NoOverride) { 1319 err = blunder.NewError(blunder.PermDeniedError, "EACCES") 1320 return 1321 } 1322 1323 inodeNumber, err = vS.inodeVolumeHandle.Lookup(dirInodeNumber, basename) 1324 return inodeNumber, err 1325 } 1326 1327 func (vS *volumeStruct) 
LookupPath(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, fullpath string) (inodeNumber inode.InodeNumber, err error) {
	startTime := time.Now()
	defer func() {
		globals.LookupPathUsec.Add(uint64(time.Since(startTime) / time.Microsecond))
		if err != nil {
			globals.LookupPathErrors.Add(1)
		}
	}()

	vS.jobRWMutex.RLock()
	defer vS.jobRWMutex.RUnlock()

	// A fullpath beginning with "/" would leave a "/"-prefixed first segment
	// after the split below, which the underlying code cannot look up, so
	// strip any leading "/" first. (TrimPrefix is already a no-op when the
	// prefix is absent; the previous Compare-guarded reassignment was
	// redundant.)
	fullpath = strings.TrimPrefix(fullpath, "/")

	pathSegments := strings.Split(path.Clean(fullpath), "/")

	// Walk segment by segment from the root, holding a read lock on each
	// cursor directory only long enough to check X permission and do the
	// Lookup().
	cursorInodeNumber := inode.RootDirInodeNumber
	for _, segment := range pathSegments {
		cursorInodeLock, err1 := vS.inodeVolumeHandle.InitInodeLock(cursorInodeNumber, nil)
		if err = err1; err != nil {
			return
		}
		err = cursorInodeLock.ReadLock()
		if err != nil {
			return
		}

		if !vS.inodeVolumeHandle.Access(cursorInodeNumber, userID, groupID, otherGroupIDs, inode.X_OK,
			inode.NoOverride) {
			cursorInodeLock.Unlock()
			err = blunder.NewError(blunder.PermDeniedError, "EACCES")
			return
		}

		cursorInodeNumber, err = vS.inodeVolumeHandle.Lookup(cursorInodeNumber, segment)
		cursorInodeLock.Unlock()

		if err != nil {
			return cursorInodeNumber, err
		}
	}

	return cursorInodeNumber, nil
}

// MiddlewareCoalesce combines the contents of elementPaths (in order) into the
// file at destPath (created if missing, truncated first), attaching metaData
// as the MiddlewareStream. Elements are processed in chunks of
// globals.coalesceElementChunkSize, taking fresh exclusive locks per chunk.
func (vS *volumeStruct) MiddlewareCoalesce(destPath string, metaData []byte, elementPaths []string) (
	ino uint64, numWrites uint64, attrChangeTime uint64,
	modificationTime uint64, err error) {

	var (
		coalesceElementList          []*inode.CoalesceElement
		coalesceSize                 uint64
		ctime                        time.Time
		destFileInodeNumber          inode.InodeNumber
		dirEntryBasename             string
		dirEntryInodeNumber          inode.InodeNumber
		dirInodeNumber               inode.InodeNumber
		elementPathIndex             int
		elementPathIndexAtChunkEnd   int
		elementPathIndexAtChunkStart int
		heldLocks                    *heldLocksStruct
		mtime                        time.Time
		retryRequired                bool
		tryLockBackoffContext        *tryLockBackoffContextStruct
	)

	startTime := time.Now()
	defer func() {
		globals.MiddlewareCoalesceUsec.Add(uint64(time.Since(startTime) / time.Microsecond))
		globals.MiddlewareCoalesceBytes.Add(coalesceSize)
		if err != nil {
			globals.MiddlewareCoalesceErrors.Add(1)
		}
	}()

	vS.jobRWMutex.RLock()
	defer vS.jobRWMutex.RUnlock()

	// First create the destination file if necessary and ensure that it is empty

	tryLockBackoffContext = &tryLockBackoffContextStruct{}

RestartDestinationFileCreation:

	tryLockBackoffContext.backoff()

	heldLocks = newHeldLocks()

	_, destFileInodeNumber, _, _, retryRequired, err =
		vS.resolvePath(
			inode.RootDirInodeNumber,
			destPath,
			heldLocks,
			resolvePathFollowDirEntrySymlinks|
				resolvePathFollowDirSymlinks|
				resolvePathCreateMissingPathElements|
				resolvePathRequireExclusiveLockOnDirEntryInode)

	if nil != err {
		heldLocks.free()
		return
	}

	if retryRequired {
		heldLocks.free()
		goto RestartDestinationFileCreation
	}

	// Fix: the SetSize() error was previously discarded; a failed truncation
	// would leave stale destination data ahead of the coalesce, so fail fast.
	err = vS.inodeVolumeHandle.SetSize(destFileInodeNumber, 0)
	if nil != err {
		heldLocks.free()
		return
	}

	heldLocks.free()

	// Now setup for looping through elementPaths with fresh locks
	// every globals.coalesceElementChunkSize elements holding an
	// Exclusive Lock on each FileInode and their containing DirInode

	elementPathIndexAtChunkStart = 0

	for elementPathIndexAtChunkStart < len(elementPaths) {
		elementPathIndexAtChunkEnd = elementPathIndexAtChunkStart + int(globals.coalesceElementChunkSize)
		if elementPathIndexAtChunkEnd > len(elementPaths) {
			elementPathIndexAtChunkEnd = len(elementPaths)
		}

		// Coalesce elementPaths[elementPathIndexAtChunkStart:elementPathIndexAtChunkEnd)

		tryLockBackoffContext = &tryLockBackoffContextStruct{}

	RestartCoalesceChunk:

		tryLockBackoffContext.backoff()

		heldLocks = newHeldLocks()

		coalesceElementList = make([]*inode.CoalesceElement, 0, (elementPathIndexAtChunkEnd - elementPathIndexAtChunkStart))

		for elementPathIndex = elementPathIndexAtChunkStart; elementPathIndex < elementPathIndexAtChunkEnd; elementPathIndex++ {
			dirInodeNumber, dirEntryInodeNumber, dirEntryBasename, _, retryRequired, err =
				vS.resolvePath(
					inode.RootDirInodeNumber,
					elementPaths[elementPathIndex],
					heldLocks,
					resolvePathFollowDirSymlinks|
						resolvePathRequireExclusiveLockOnDirEntryInode|
						resolvePathRequireExclusiveLockOnDirInode)

			if nil != err {
				heldLocks.free()
				return
			}

			if retryRequired {
				heldLocks.free()
				goto RestartCoalesceChunk
			}

			coalesceElementList = append(coalesceElementList, &inode.CoalesceElement{
				ContainingDirectoryInodeNumber: dirInodeNumber,
				ElementInodeNumber:             dirEntryInodeNumber,
				ElementName:                    dirEntryBasename,
			})
		}

		// Re-resolve the destination under this chunk's lock set
		_, destFileInodeNumber, _, _, retryRequired, err =
			vS.resolvePath(
				inode.RootDirInodeNumber,
				destPath,
				heldLocks,
				resolvePathFollowDirEntrySymlinks|
					resolvePathFollowDirSymlinks|
					resolvePathRequireExclusiveLockOnDirEntryInode)

		if nil != err {
			heldLocks.free()
			return
		}

		if retryRequired {
			heldLocks.free()
			goto RestartCoalesceChunk
		}

		ctime, mtime, numWrites, coalesceSize, err = vS.inodeVolumeHandle.Coalesce(
			destFileInodeNumber, MiddlewareStream, metaData, coalesceElementList)

		heldLocks.free()

		if nil != err {
			return
		}

		elementPathIndexAtChunkStart = elementPathIndexAtChunkEnd
	}

	// Success path only (errors returned early above): fill in remaining return values

	ino = uint64(destFileInodeNumber)
	attrChangeTime = uint64(ctime.UnixNano())
	modificationTime = uint64(mtime.UnixNano())

	return
}

// MiddlewareDelete removes parentDir/basename, Destroy()ing the inode when it
// is an empty directory or a non-directory whose link count would drop to zero.
func (vS *volumeStruct) MiddlewareDelete(parentDir string, basename string) (err error) {
	var (
		dirEntryBasename      string
		dirEntryInodeNumber   inode.InodeNumber
		dirInodeNumber        inode.InodeNumber
		doDestroy             bool
		heldLocks             *heldLocksStruct
		inodeType             inode.InodeType
		inodeVolumeHandle     inode.VolumeHandle
		linkCount             uint64
		numDirEntries         uint64
		retryRequired         bool
		tryLockBackoffContext *tryLockBackoffContextStruct
	)

	startTime := time.Now()
	defer func() {
		globals.MiddlewareDeleteUsec.Add(uint64(time.Since(startTime) / time.Microsecond))
		if err != nil {
			globals.MiddlewareDeleteErrors.Add(1)
		}
	}()

	// Retry until done or failure (starting with ZERO backoff)

	tryLockBackoffContext = &tryLockBackoffContextStruct{}

Restart:

	// Perform backoff and update for each restart (starting with ZERO backoff of course)

	tryLockBackoffContext.backoff()

	// Construct fresh heldLocks for this restart

	heldLocks = newHeldLocks()

	dirInodeNumber, dirEntryInodeNumber, dirEntryBasename, _, retryRequired, err =
		vS.resolvePath(
			inode.RootDirInodeNumber,
			parentDir+"/"+basename,
			heldLocks,
			resolvePathFollowDirSymlinks|
				resolvePathRequireExclusiveLockOnDirEntryInode|
				resolvePathRequireExclusiveLockOnDirInode)

	if nil != err {
		heldLocks.free()
		return
	}

	if retryRequired {
heldLocks.free() 1590 goto Restart 1591 } 1592 1593 // Check if Unlink() and Destroy() are doable 1594 1595 inodeVolumeHandle = vS.inodeVolumeHandle 1596 1597 inodeType, err = inodeVolumeHandle.GetType(dirEntryInodeNumber) 1598 if nil != err { 1599 heldLocks.free() 1600 return 1601 } 1602 1603 if inode.DirType == inodeType { 1604 numDirEntries, err = inodeVolumeHandle.NumDirEntries(dirEntryInodeNumber) 1605 if nil != err { 1606 heldLocks.free() 1607 return 1608 } 1609 1610 if 2 != numDirEntries { 1611 heldLocks.free() 1612 err = blunder.NewError(blunder.NotEmptyError, "%s/%s not empty", parentDir, basename) 1613 return 1614 } 1615 1616 doDestroy = true 1617 } else { 1618 linkCount, err = inodeVolumeHandle.GetLinkCount(dirEntryInodeNumber) 1619 if nil != err { 1620 heldLocks.free() 1621 return 1622 } 1623 1624 doDestroy = (1 == linkCount) 1625 } 1626 1627 // Now perform the Unlink() and (potentially) Destroy() 1628 1629 err = inodeVolumeHandle.Unlink(dirInodeNumber, dirEntryBasename, false) 1630 if nil != err { 1631 heldLocks.free() 1632 return 1633 } 1634 1635 if doDestroy { 1636 err = inodeVolumeHandle.Destroy(dirEntryInodeNumber) 1637 if nil != err { 1638 logger.Errorf("fs.MiddlewareDelete() failed to Destroy dirEntryInodeNumber 0x%016X: %v", dirEntryInodeNumber, err) 1639 } 1640 } 1641 1642 // Release heldLocks and exit with success (even if Destroy() failed earlier) 1643 1644 heldLocks.free() 1645 1646 err = nil 1647 return 1648 } 1649 1650 func (vS *volumeStruct) middlewareReadDirHelper(path string, maxEntries uint64, prevBasename string) (pathDirInodeNumber inode.InodeNumber, dirEntrySlice []inode.DirEntry, moreEntries bool, err error) { 1651 var ( 1652 dirEntrySliceElement inode.DirEntry 1653 heldLocks *heldLocksStruct 1654 internalDirEntrySlice []inode.DirEntry 1655 retryRequired bool 1656 tryLockBackoffContext *tryLockBackoffContextStruct 1657 ) 1658 1659 // Retry until done or failure (starting with ZERO backoff) 1660 1661 tryLockBackoffContext = 
&tryLockBackoffContextStruct{} 1662 1663 Restart: 1664 1665 // Perform backoff and update for each restart (starting with ZERO backoff of course) 1666 1667 tryLockBackoffContext.backoff() 1668 1669 // Construct fresh heldLocks for this restart 1670 1671 heldLocks = newHeldLocks() 1672 1673 _, pathDirInodeNumber, _, _, retryRequired, err = 1674 vS.resolvePath( 1675 inode.RootDirInodeNumber, 1676 path, 1677 heldLocks, 1678 resolvePathFollowDirSymlinks) 1679 1680 if nil != err { 1681 heldLocks.free() 1682 return 1683 } 1684 1685 if retryRequired { 1686 heldLocks.free() 1687 goto Restart 1688 } 1689 1690 // Now assemble response 1691 1692 internalDirEntrySlice, moreEntries, err = vS.inodeVolumeHandle.ReadDir(pathDirInodeNumber, maxEntries, 0, prevBasename) 1693 if nil != err { 1694 heldLocks.free() 1695 return 1696 } 1697 1698 // No need to hold any locks now... directory contents should be allowed to change while enumerating 1699 heldLocks.free() 1700 1701 dirEntrySlice = make([]inode.DirEntry, 0, len(internalDirEntrySlice)) 1702 1703 for _, dirEntrySliceElement = range internalDirEntrySlice { 1704 if ("." == dirEntrySliceElement.Basename) || (".." == dirEntrySliceElement.Basename) { 1705 dirEntrySliceElement.Type = inode.DirType 1706 } else { 1707 dirEntrySliceElement.Type, err = vS.GetType(inode.InodeRootUserID, inode.InodeGroupID(0), nil, dirEntrySliceElement.InodeNumber) 1708 if nil != err { 1709 // It's ok to have an error here... 
it just means the directory we are iterating is changing 1710 continue 1711 } 1712 } 1713 dirEntrySlice = append(dirEntrySlice, dirEntrySliceElement) 1714 } 1715 1716 dirEntrySlice = dirEntrySlice[:len(dirEntrySlice)] 1717 1718 err = nil 1719 return 1720 } 1721 1722 func (vS *volumeStruct) MiddlewareGetAccount(maxEntries uint64, marker string, endmarker string) (accountEnts []AccountEntry, mtime uint64, ctime uint64, err error) { 1723 var ( 1724 dirEntrySlice []inode.DirEntry 1725 dirEntrySliceElement inode.DirEntry 1726 remainingMaxEntries uint64 1727 moreEntries bool 1728 statResult Stat 1729 ) 1730 1731 statResult, err = vS.Getstat(inode.InodeRootUserID, inode.InodeGroupID(0), nil, inode.RootDirInodeNumber) 1732 if nil != err { 1733 return 1734 } 1735 mtime = statResult[StatMTime] 1736 ctime = statResult[StatCTime] 1737 1738 if 0 != maxEntries { 1739 // Hard limit to number of DirInode Basenames to return 1740 accountEnts = make([]AccountEntry, 0, maxEntries) 1741 } 1742 1743 remainingMaxEntries = maxEntries 1744 1745 moreEntries = true 1746 1747 for moreEntries { 1748 _, dirEntrySlice, moreEntries, err = vS.middlewareReadDirHelper("/", remainingMaxEntries, marker) 1749 if nil != err { 1750 return 1751 } 1752 1753 if 0 == maxEntries { 1754 // No limit to number of DirInode Basenames to return... so it must be <= len(dirEntrySlice) 1755 accountEnts = make([]AccountEntry, 0, len(dirEntrySlice)) 1756 // Note: moreEntries should be false so the "for moreEntries" loop should exit after 1st iteration 1757 } 1758 1759 for _, dirEntrySliceElement = range dirEntrySlice { 1760 if ("" != endmarker) && (0 <= strings.Compare(dirEntrySliceElement.Basename, endmarker)) { 1761 moreEntries = false 1762 break 1763 } 1764 if ("." != dirEntrySliceElement.Basename) && (".." != dirEntrySliceElement.Basename) { 1765 // So we've skipped "." & ".." 
- now also skip non-DirInodes 1766 if inode.DirType == dirEntrySliceElement.Type { 1767 statResult, err = vS.Getstat(inode.InodeRootUserID, inode.InodeGroupID(0), nil, dirEntrySliceElement.InodeNumber) 1768 if nil != err { 1769 return 1770 } 1771 accountEnts = append(accountEnts, AccountEntry{ 1772 Basename: dirEntrySliceElement.Basename, 1773 ModificationTime: statResult[StatMTime], 1774 AttrChangeTime: statResult[StatCTime], 1775 }) 1776 } 1777 } 1778 } 1779 1780 if moreEntries && (0 != maxEntries) { 1781 remainingMaxEntries = maxEntries - uint64(len(accountEnts)) 1782 if 0 == remainingMaxEntries { 1783 moreEntries = false 1784 } 1785 } 1786 1787 if moreEntries { 1788 // Adjust marker to fetch next dirEntrySlice 1789 marker = dirEntrySlice[len(dirEntrySlice)-1].Basename 1790 } 1791 } 1792 1793 accountEnts = accountEnts[:len(accountEnts)] 1794 1795 return 1796 } 1797 1798 type dirEntrySliceStackElementStruct struct { 1799 dirPath string 1800 dirEntrySlice []inode.DirEntry 1801 numConsumed int 1802 moreEntries bool 1803 } 1804 1805 func (vS *volumeStruct) MiddlewareGetContainer(vContainerName string, maxEntries uint64, marker string, endmarker string, prefix string, delimiter string) (containerEnts []ContainerEntry, err error) { 1806 var ( 1807 containerEntry ContainerEntry 1808 containerEntryBasename string // Misnamed... 
this is actually everything after ContainerName 1809 containerEntryPath string 1810 containerEntryPathSplit []string // Split on only the first '/' (to remove ContainerName from it) 1811 doSingleDirectory bool 1812 dirEntryInodeLock *dlm.RWLockStruct 1813 dirEntryInodeNumber inode.InodeNumber 1814 dirEntryInodeType inode.InodeType 1815 dirEntryMetadata *inode.MetadataStruct 1816 dirEntryPath string 1817 dirEntrySlice []inode.DirEntry 1818 dirEntrySliceElement inode.DirEntry 1819 dirEntrySliceElementIndex int 1820 dirEntrySliceElementToPrepend *inode.DirEntry 1821 dirEntrySliceStack []*dirEntrySliceStackElementStruct 1822 dirEntrySliceStackElement *dirEntrySliceStackElementStruct 1823 dirEntrySliceToAppend []inode.DirEntry 1824 dirInodeNumber inode.InodeNumber 1825 dirPath string 1826 dirPathSplit []string 1827 dlmCallerID dlm.CallerID 1828 endmarkerCanonicalized string 1829 endmarkerPath []string 1830 heldLocks *heldLocksStruct 1831 initialDirEntryToMatch string // == "" if no initial path should be returned (i.e. 
in marker starting point case) 1832 inodeVolumeHandle inode.VolumeHandle 1833 markerCanonicalized string 1834 markerPath []string 1835 markerPathDirInodeIndex int 1836 moreEntries bool 1837 pathIndex int 1838 prefixCanonicalized string 1839 prefixPath []string 1840 prefixPathDirInodeIndex int 1841 prevReturned string 1842 remainingMaxEntries uint64 1843 retryRequired bool 1844 tryLockBackoffContext *tryLockBackoffContextStruct 1845 ) 1846 1847 // Validate marker, endmarker, and prefix 1848 1849 if "" == marker { 1850 markerPath = []string{} 1851 markerPathDirInodeIndex = -1 // Must be special cased below to ensure we don't look in markerPath 1852 markerCanonicalized = "" // Actually never accessed 1853 } else { 1854 markerPath, markerPathDirInodeIndex, err = vS.canonicalizePathAndLocateLeafDirInode(vContainerName + "/" + marker) 1855 if nil != err { 1856 err = blunder.AddError(err, blunder.InvalidArgError) 1857 return 1858 } 1859 1860 markerCanonicalized = strings.Join(markerPath, "/") 1861 if strings.HasSuffix(marker, "/") { 1862 markerCanonicalized += "/" 1863 } 1864 1865 if vContainerName+"/"+marker != markerCanonicalized { 1866 err = blunder.NewError(blunder.InvalidArgError, "MiddlewareGetContainer() only supports a canonicalized marker") 1867 return 1868 } 1869 } 1870 1871 if "" == endmarker { 1872 endmarkerPath = []string{} 1873 endmarkerCanonicalized = "" // Actually never accessed 1874 } else { 1875 endmarkerPath, _, err = vS.canonicalizePathAndLocateLeafDirInode(vContainerName + "/" + endmarker) 1876 if nil != err { 1877 err = blunder.AddError(err, blunder.InvalidArgError) 1878 return 1879 } 1880 1881 endmarkerCanonicalized = strings.Join(endmarkerPath, "/") 1882 if strings.HasSuffix(endmarker, "/") { 1883 endmarkerCanonicalized += "/" 1884 } 1885 1886 if vContainerName+"/"+endmarker != endmarkerCanonicalized { 1887 err = blunder.NewError(blunder.InvalidArgError, "MiddlewareGetContainer() only supports a canonicalized endmarker") 1888 return 1889 } 1890 } 
1891 1892 prefixPath, prefixPathDirInodeIndex, err = vS.canonicalizePathAndLocateLeafDirInode(vContainerName + "/" + prefix) 1893 if nil != err { 1894 err = blunder.AddError(err, blunder.InvalidArgError) 1895 return 1896 } 1897 if prefixPathDirInodeIndex < 0 { 1898 err = blunder.NewError(blunder.NotFoundError, "MiddlewareGetContainer() only supports querying an existing Container") 1899 return 1900 } 1901 1902 prefixCanonicalized = strings.Join(prefixPath, "/") 1903 if strings.HasSuffix(prefix, "/") { 1904 prefixCanonicalized += "/" 1905 } 1906 1907 if (prefix != "") && (vContainerName+"/"+prefix != prefixCanonicalized) { 1908 err = blunder.NewError(blunder.InvalidArgError, "MiddlewareGetContainer() only supports a canonicalized prefix") 1909 return 1910 } 1911 1912 // Validate delimiter 1913 1914 switch delimiter { 1915 case "": 1916 doSingleDirectory = false 1917 case "/": 1918 doSingleDirectory = true 1919 default: 1920 err = blunder.NewError(blunder.InvalidArgError, "MiddlewareGetContainer() only supports a delimiter of \"/\"") 1921 return 1922 } 1923 1924 // Determine what DirInode from which to begin our enumeration 1925 1926 pathIndex = 0 1927 1928 for { 1929 if (pathIndex > markerPathDirInodeIndex) && (pathIndex > prefixPathDirInodeIndex) { 1930 // Special (though probably typical) case where marker lands in prefix-indicated directory 1931 1932 dirPath = strings.Join(prefixPath[:prefixPathDirInodeIndex+1], "/") 1933 1934 if (1 == len(prefixPath)) || strings.HasSuffix(prefix, "/") { 1935 if (markerPathDirInodeIndex + 1) == len(markerPath) { 1936 prevReturned = "" 1937 } else { 1938 prevReturned = markerPath[markerPathDirInodeIndex+1] 1939 } 1940 initialDirEntryToMatch = "" 1941 } else { 1942 // Handle four remaining cases: 1943 // marker & prefix both specified directories 1944 // marker specified a directory, prefix did not 1945 // prefix specified a directory, marker did not 1946 // neither marker nor prefix specified a directory 1947 1948 if 
(markerPathDirInodeIndex + 1) == len(markerPath) { 1949 if (prefixPathDirInodeIndex + 1) == len(prefixPath) { 1950 // Case where marker & prefix both specified directories 1951 1952 prevReturned = "" 1953 } else { 1954 // Case where marker specified a directory, prefix did not 1955 1956 prevReturned = prefixPath[prefixPathDirInodeIndex+1] 1957 } 1958 initialDirEntryToMatch = prevReturned 1959 } else { // (markerPathDirInodeIndex + 1) != len(markerPath) 1960 if (prefixPathDirInodeIndex + 1) == len(prefixPath) { 1961 // Case where prefix specified a directory, marker did not 1962 1963 prevReturned = markerPath[markerPathDirInodeIndex+1] 1964 initialDirEntryToMatch = "" 1965 } else { 1966 // Case where neither marker nor prefix specified a directory 1967 1968 if strings.Compare(prefixPath[prefixPathDirInodeIndex+1], markerPath[markerPathDirInodeIndex+1]) <= 0 { 1969 prevReturned = markerPath[markerPathDirInodeIndex+1] 1970 initialDirEntryToMatch = "" 1971 } else { 1972 prevReturned = prefixPath[prefixPathDirInodeIndex+1] 1973 initialDirEntryToMatch = prevReturned 1974 } 1975 } 1976 } 1977 } 1978 break 1979 } 1980 1981 if pathIndex > markerPathDirInodeIndex { 1982 // Handle case where prefix is more constraining than marker 1983 1984 if prefixPathDirInodeIndex == (len(prefixPath) - 1) { 1985 if (1 == len(prefixPath)) || strings.HasSuffix(prefix, "/") { 1986 dirPath = strings.Join(prefixPath[:prefixPathDirInodeIndex+1], "/") 1987 prevReturned = "" 1988 } else { 1989 dirPath = strings.Join(prefixPath[:prefixPathDirInodeIndex], "/") 1990 prevReturned = prefixPath[len(prefixPath)-1] 1991 } 1992 } else { 1993 dirPath = strings.Join(prefixPath[:prefixPathDirInodeIndex+1], "/") 1994 prevReturned = prefixPath[len(prefixPath)-1] 1995 } 1996 initialDirEntryToMatch = prevReturned 1997 break 1998 } 1999 2000 if pathIndex > prefixPathDirInodeIndex { 2001 // Handle case where marker is more constraining than prefix 2002 2003 dirPath = 
strings.Join(markerPath[:markerPathDirInodeIndex+1], "/") 2004 if markerPathDirInodeIndex == (len(markerPath) - 1) { 2005 prevReturned = "" 2006 } else { 2007 prevReturned = markerPath[len(markerPath)-1] 2008 } 2009 initialDirEntryToMatch = "" 2010 break 2011 } 2012 2013 switch strings.Compare(prefixPath[pathIndex], markerPath[pathIndex]) { 2014 case -1: 2015 dirPath = strings.Join(markerPath[:markerPathDirInodeIndex+1], "/") 2016 if markerPathDirInodeIndex == (len(markerPath) - 1) { 2017 prevReturned = "" 2018 } else { 2019 prevReturned = markerPath[len(markerPath)-1] 2020 } 2021 initialDirEntryToMatch = "" 2022 break 2023 case 0: 2024 pathIndex++ 2025 case 1: 2026 if prefixPathDirInodeIndex == (len(prefixPath) - 1) { 2027 if (1 == len(prefixPath)) || strings.HasSuffix(prefix, "/") { 2028 dirPath = strings.Join(prefixPath[:prefixPathDirInodeIndex+1], "/") 2029 prevReturned = "" 2030 } else { 2031 dirPath = strings.Join(prefixPath[:prefixPathDirInodeIndex], "/") 2032 prevReturned = prefixPath[len(prefixPath)-1] 2033 } 2034 } else { 2035 dirPath = strings.Join(prefixPath[:prefixPathDirInodeIndex+1], "/") 2036 prevReturned = prefixPath[len(prefixPath)-1] 2037 } 2038 initialDirEntryToMatch = prevReturned 2039 break 2040 } 2041 } 2042 2043 // Setup shortcuts/contants 2044 2045 dlmCallerID = dlm.GenerateCallerID() 2046 inodeVolumeHandle = vS.inodeVolumeHandle 2047 2048 // Compute initial response 2049 2050 tryLockBackoffContext = &tryLockBackoffContextStruct{} 2051 2052 Restart: 2053 2054 tryLockBackoffContext.backoff() 2055 2056 heldLocks = newHeldLocks() 2057 2058 _, dirInodeNumber, _, _, retryRequired, err = 2059 vS.resolvePath( 2060 inode.RootDirInodeNumber, 2061 dirPath, 2062 heldLocks, 2063 resolvePathDirEntryInodeMustBeDirectory) 2064 if nil != err { 2065 heldLocks.free() 2066 return 2067 } 2068 if retryRequired { 2069 heldLocks.free() 2070 goto Restart 2071 } 2072 2073 containerEnts = make([]ContainerEntry, 0, maxEntries) 2074 2075 if 0 == maxEntries { 2076 
heldLocks.free() 2077 err = nil 2078 return 2079 } 2080 2081 if "" == initialDirEntryToMatch { 2082 dirEntrySliceElementToPrepend = nil 2083 } else { 2084 if "" == dirPath { 2085 dirEntryPath = initialDirEntryToMatch 2086 } else { 2087 dirEntryPath = dirPath + "/" + initialDirEntryToMatch 2088 } 2089 if ("" != endmarker) && (strings.Compare(dirEntryPath, endmarkerCanonicalized) >= 0) { 2090 heldLocks.free() 2091 err = nil 2092 return 2093 } 2094 dirEntryInodeNumber, err = inodeVolumeHandle.Lookup(dirInodeNumber, initialDirEntryToMatch) 2095 if nil == err { 2096 retryRequired = heldLocks.attemptSharedLock(inodeVolumeHandle, dlmCallerID, dirEntryInodeNumber) 2097 if retryRequired { 2098 heldLocks.free() 2099 goto Restart 2100 } 2101 dirEntryInodeType, err = inodeVolumeHandle.GetType(dirEntryInodeNumber) 2102 if nil == err { 2103 dirEntrySliceElementToPrepend = &inode.DirEntry{ 2104 InodeNumber: dirEntryInodeNumber, 2105 Basename: initialDirEntryToMatch, 2106 Type: dirEntryInodeType, 2107 } 2108 } else { 2109 dirEntrySliceElementToPrepend = nil 2110 } 2111 heldLocks.unlock(dirEntryInodeNumber) 2112 } else { 2113 dirEntrySliceElementToPrepend = nil 2114 } 2115 } 2116 2117 heldLocks.free() 2118 2119 if 0 == maxEntries { 2120 remainingMaxEntries = 0 2121 } else { 2122 if nil == dirEntrySliceElementToPrepend { 2123 remainingMaxEntries = maxEntries 2124 } else { 2125 remainingMaxEntries = maxEntries - 1 2126 } 2127 } 2128 2129 // At this point: 2130 // no heldLocks 2131 // containerEnts has been declared 2132 // doSingleDirectory is set based on supplied delimiter 2133 // if {marker,endmarker,prefix} asked to include an exact matched path that existed, it's in dirEntrySliceElementToPrepend 2134 // prefixCanonicalized & endmarkerCanonicalized are set to terminate the ensuing treewalk 2135 // remainingMaxEntries indicates how many more DirEntry's will fit in containerEnts (if capped) 2136 // dirPath is pointing to the initial DirInode to read 2137 // prevReturned indicates 
from where in the DirInode to start reading 2138 2139 // Perform initial ReadDir and place in dirEntrySliceStack 2140 2141 if nil == dirEntrySliceElementToPrepend { 2142 _, dirEntrySlice, moreEntries, err = vS.middlewareReadDirHelper(dirPath, remainingMaxEntries, prevReturned) 2143 if nil != err { 2144 return 2145 } 2146 } else { 2147 if 0 == remainingMaxEntries { 2148 dirEntrySlice = []inode.DirEntry{*dirEntrySliceElementToPrepend} 2149 moreEntries = false 2150 } else { 2151 _, dirEntrySliceToAppend, moreEntries, err = vS.middlewareReadDirHelper(dirPath, remainingMaxEntries, prevReturned) 2152 if nil == err { 2153 dirEntrySlice = make([]inode.DirEntry, 1, 1+len(dirEntrySliceToAppend)) 2154 dirEntrySlice[0] = *dirEntrySliceElementToPrepend 2155 dirEntrySlice = append(dirEntrySlice, dirEntrySliceToAppend...) 2156 } else { 2157 return 2158 } 2159 } 2160 } 2161 2162 dirEntrySliceStackElement = &dirEntrySliceStackElementStruct{ 2163 dirPath: dirPath, 2164 dirEntrySlice: dirEntrySlice, 2165 numConsumed: 0, 2166 moreEntries: moreEntries, 2167 } 2168 2169 dirEntrySliceStack = []*dirEntrySliceStackElementStruct{dirEntrySliceStackElement} 2170 2171 containerEnts = make([]ContainerEntry, 0, len(dirEntrySlice)) 2172 2173 // Now append appropriate ContainerEntry's until exit criteria is reached 2174 2175 for uint64(len(containerEnts)) < maxEntries { 2176 dirEntrySliceStackElement = dirEntrySliceStack[len(dirEntrySliceStack)-1] 2177 2178 if dirEntrySliceStackElement.numConsumed == len(dirEntrySliceStackElement.dirEntrySlice) { 2179 if dirEntrySliceStackElement.moreEntries { 2180 dirPath = dirEntrySliceStackElement.dirPath 2181 dirEntrySlice = dirEntrySliceStackElement.dirEntrySlice 2182 dirEntrySliceElementIndex = len(dirEntrySlice) - 1 2183 dirEntrySliceElement = dirEntrySlice[dirEntrySliceElementIndex] 2184 prevReturned = dirEntrySliceElement.Basename 2185 2186 _, dirEntrySlice, moreEntries, err = vS.middlewareReadDirHelper(dirPath, remainingMaxEntries, prevReturned) 2187 if 
(nil != err) || (0 == len(dirEntrySlice)) { 2188 // Even though we thought there were moreEntries, there now are not for some reason 2189 2190 if doSingleDirectory { 2191 // Regardless of remaining contents of dirEntrySliceStack, we must be done 2192 2193 err = nil 2194 return 2195 } 2196 2197 // Navigate to parent directory 2198 2199 dirEntrySliceStack = dirEntrySliceStack[:len(dirEntrySliceStack)-1] 2200 continue 2201 } 2202 2203 // Restart this loop on current dirEntrySliceStackElement with new middlewareReadDirHelper() results 2204 2205 dirEntrySliceStackElement.dirEntrySlice = dirEntrySlice 2206 dirEntrySliceStackElement.numConsumed = 0 2207 dirEntrySliceStackElement.moreEntries = moreEntries 2208 2209 continue 2210 } else { 2211 // We've reached the end of this DirInode 2212 2213 if doSingleDirectory { 2214 // Regardless of remaining contents of dirEntrySliceStack, we must be done 2215 2216 err = nil 2217 return 2218 } 2219 2220 // Navigate to parent directory (staying within this Container) 2221 2222 if 1 == len(dirEntrySliceStack) { 2223 // We are at the starting directory 2224 2225 dirPathSplit = strings.Split(dirEntrySliceStackElement.dirPath, "/") 2226 2227 if 1 == len(dirPathSplit) { 2228 // We just finished Container-level directory, so we are done 2229 2230 err = nil 2231 return 2232 } 2233 2234 // Modify dirEntrySliceStackElement to point to parent directory as if we'd just processed the dirEntry of this directory 2235 2236 dirPath = strings.Join(dirPathSplit[:len(dirPathSplit)-1], "/") 2237 2238 if 0 == maxEntries { 2239 remainingMaxEntries = 0 2240 } else { 2241 remainingMaxEntries = maxEntries - uint64(len(containerEnts)) 2242 } 2243 2244 prevReturned = dirPathSplit[len(dirPathSplit)-1] 2245 2246 _, dirEntrySlice, moreEntries, err = vS.middlewareReadDirHelper(dirPath, remainingMaxEntries, prevReturned) 2247 if nil != err { 2248 return 2249 } 2250 2251 dirEntrySliceStackElement.dirPath = dirPath 2252 dirEntrySliceStackElement.dirEntrySlice = 
dirEntrySlice 2253 dirEntrySliceStackElement.numConsumed = 0 2254 dirEntrySliceStackElement.moreEntries = moreEntries 2255 } else { 2256 // Parent directory already in dirEntrySliceStack... so just pop current ...Element 2257 2258 dirEntrySliceStack = dirEntrySliceStack[:len(dirEntrySliceStack)-1] 2259 } 2260 2261 continue 2262 } 2263 } 2264 2265 // Consume next dirEntrySliceElement 2266 // ...skipping "." and ".." 2267 // ...skipping if <dirPath>/<Basename> <= marker 2268 // ...recursing when encountering DirInode's if !doSingleDirectory 2269 // ...terminating early if either: 2270 // len(*containerEnts) reaches maxEntries 2271 // <dirPath>/<Basename> >= endmarker 2272 // <dirPath>/<Basename> does not start with prefix 2273 2274 dirEntrySlice = dirEntrySliceStackElement.dirEntrySlice 2275 dirEntrySliceElementIndex = dirEntrySliceStackElement.numConsumed 2276 dirEntrySliceElement = dirEntrySlice[dirEntrySliceElementIndex] 2277 2278 dirEntrySliceStackElement.numConsumed++ 2279 2280 if ("." == dirEntrySliceElement.Basename) || (".." == dirEntrySliceElement.Basename) { 2281 continue 2282 } 2283 2284 containerEntryPath = dirEntrySliceStackElement.dirPath + "/" + dirEntrySliceElement.Basename 2285 2286 if ("" != marker) && (strings.Compare(containerEntryPath, markerCanonicalized) <= 0) { 2287 err = nil 2288 return 2289 } 2290 if ("" != endmarker) && (strings.Compare(containerEntryPath, endmarkerCanonicalized) >= 0) { 2291 err = nil 2292 return 2293 } 2294 if ("" != prefix) && !strings.HasPrefix(containerEntryPath, prefixCanonicalized) { 2295 err = nil 2296 return 2297 } 2298 2299 // Ok... 
so we actually want to append this entry to containerEnts 2300 2301 tryLockBackoffContext = &tryLockBackoffContextStruct{} 2302 2303 Retry: 2304 2305 tryLockBackoffContext.backoff() 2306 2307 dirEntryInodeLock, err = inodeVolumeHandle.AttemptReadLock(dirEntrySliceElement.InodeNumber, dlmCallerID) 2308 if nil != err { 2309 goto Retry 2310 } 2311 2312 dirEntryMetadata, err = inodeVolumeHandle.GetMetadata(dirEntrySliceElement.InodeNumber) 2313 if nil != err { 2314 // Ok... so it must have disappeared... just skip it 2315 2316 err = dirEntryInodeLock.Unlock() 2317 if nil != err { 2318 logger.Fatalf("Failure unlocking a held LockID %s: %v", dirEntryInodeLock.LockID, err) 2319 } 2320 2321 continue 2322 } 2323 2324 containerEntryPathSplit = strings.SplitN(containerEntryPath, "/", 2) 2325 containerEntryBasename = containerEntryPathSplit[1] 2326 2327 containerEntry = ContainerEntry{ 2328 Basename: containerEntryBasename, 2329 FileSize: dirEntryMetadata.Size, 2330 ModificationTime: uint64(dirEntryMetadata.ModificationTime.UnixNano()), 2331 AttrChangeTime: uint64(dirEntryMetadata.AttrChangeTime.UnixNano()), 2332 IsDir: (dirEntrySliceElement.Type == inode.DirType), 2333 NumWrites: dirEntryMetadata.NumWrites, 2334 InodeNumber: uint64(dirEntrySliceElement.InodeNumber), 2335 } 2336 2337 containerEntry.Metadata, err = inodeVolumeHandle.GetStream(dirEntrySliceElement.InodeNumber, MiddlewareStream) 2338 if nil != err { 2339 if blunder.Is(err, blunder.StreamNotFound) { 2340 // No MiddlewareStream... just make it appear empty 2341 2342 containerEntry.Metadata = []byte{} 2343 err = nil 2344 } else { 2345 // Ok... so it must have disappeared... 
				// just skip it

				err = dirEntryInodeLock.Unlock()
				if nil != err {
					logger.Fatalf("Failure unlocking a held LockID %s: %v", dirEntryInodeLock.LockID, err)
				}

				continue
			}
		}

		// We can finally Unlock() this dirEntryInodeLock

		err = dirEntryInodeLock.Unlock()
		if nil != err {
			logger.Fatalf("Failure unlocking a held LockID %s: %v", dirEntryInodeLock.LockID, err)
		}

		// If we reach here, we get to append this containerEntry to containerEnts

		containerEnts = append(containerEnts, containerEntry)

		// We must now descend into dirEntryInode if it's a DirInode and !doSingleDirectory

		if !doSingleDirectory && (dirEntrySliceElement.Type == inode.DirType) {
			dirPath = dirEntrySliceStackElement.dirPath + "/" + dirEntrySliceElement.Basename

			if 0 == maxEntries {
				remainingMaxEntries = 0
			} else {
				remainingMaxEntries = maxEntries - uint64(len(containerEnts))
			}

			prevReturned = ""

			_, dirEntrySlice, moreEntries, err = vS.middlewareReadDirHelper(dirPath, remainingMaxEntries, prevReturned)
			if nil != err {
				return
			}

			// Push the subdirectory's first slice of DirEntry's so the next loop
			// iteration consumes it (depth-first descent)

			dirEntrySliceStackElement = &dirEntrySliceStackElementStruct{
				dirPath:       dirPath,
				dirEntrySlice: dirEntrySlice,
				numConsumed:   0,
				moreEntries:   moreEntries,
			}

			dirEntrySliceStack = append(dirEntrySliceStack, dirEntrySliceStackElement)
		}
	}

	// We will only reach here if we exhausted maxEntries before exhausting the tree/list of containerEntry's to append

	err = nil
	return
}

// MiddlewareGetObject locates containerObjectPath (following both directory and
// directory-entry symlinks), fills in a HeadResponse describing the resolved
// inode (size, times, IsDir, NumWrites, MiddlewareStream metadata), and - for
// regular files only - appends a read plan for the requested ranges (or the
// whole file when readRangeIn is empty) to *readRangeOut.
//
// The path resolution is retried (with backoff) whenever resolvePath() reports
// retryRequired; all locks acquired via heldLocks are released before return.
func (vS *volumeStruct) MiddlewareGetObject(containerObjectPath string,
	readRangeIn []ReadRangeIn, readRangeOut *[]inode.ReadPlanStep) (
	response HeadResponse, err error) {

	var (
		dirEntryInodeNumber   inode.InodeNumber
		fileOffset            uint64
		heldLocks             *heldLocksStruct
		inodeVolumeHandle     inode.VolumeHandle
		readPlan              []inode.ReadPlanStep
		readRangeInIndex      int
		retryRequired         bool
		stat                  Stat
		tryLockBackoffContext *tryLockBackoffContextStruct
	)

	startTime := time.Now()
	defer func() {
		// Stats account for every byte the computed read plan would deliver
		var totalReadBytes uint64
		for _, step := range *readRangeOut {
			totalReadBytes += step.Length
		}

		globals.MiddlewareGetObjectUsec.Add(uint64(time.Since(startTime) / time.Microsecond))
		globals.MiddlewareGetObjectBytes.Add(totalReadBytes)
		if err != nil {
			globals.MiddlewareGetObjectErrors.Add(1)
		}
	}()

	// Retry until done or failure (starting with ZERO backoff)

	tryLockBackoffContext = &tryLockBackoffContextStruct{}

Restart:

	// Perform backoff and update for each restart (starting with ZERO backoff of course)

	tryLockBackoffContext.backoff()

	// Construct fresh heldLocks for this restart

	heldLocks = newHeldLocks()

	_, dirEntryInodeNumber, _, _, retryRequired, err =
		vS.resolvePath(
			inode.RootDirInodeNumber,
			containerObjectPath,
			heldLocks,
			resolvePathFollowDirEntrySymlinks|
				resolvePathFollowDirSymlinks)

	if nil != err {
		heldLocks.free()
		return
	}

	if retryRequired {
		heldLocks.free()
		goto Restart
	}

	// Now assemble response

	stat, err = vS.getstatHelperWhileLocked(dirEntryInodeNumber)
	if nil != err {
		heldLocks.free()
		return
	}

	response.FileSize = stat[StatSize]
	response.ModificationTime = stat[StatMTime]
	response.AttrChangeTime = stat[StatCTime]
	response.IsDir = (stat[StatFType] == uint64(inode.DirType))
	response.InodeNumber = dirEntryInodeNumber
	response.NumWrites = stat[StatNumWrites]

	// Swift thinks all directories have a size of 0 (and symlinks as well)
	if stat[StatFType] != uint64(inode.FileType) {
		response.FileSize = 0
	}

	// A missing MiddlewareStream is not an error - report empty metadata
	response.Metadata, err =
		vS.inodeVolumeHandle.GetStream(dirEntryInodeNumber, MiddlewareStream)
	if nil != err {
		if blunder.Is(err, blunder.StreamNotFound) {
			response.Metadata = []byte{}
			err = nil
		} else {
			heldLocks.free()
			return
		}
	}

	// The only thing left is to construct a read plan and only regular
	// files have read plans. If this is not a regular file then we're
	// done.
	if stat[StatFType] != uint64(inode.FileType) {
		heldLocks.free()
		return
	}

	inodeVolumeHandle = vS.inodeVolumeHandle
	if len(readRangeIn) == 0 {
		// Get ReadPlan for entire file

		fileOffset = 0

		readPlan, err = inodeVolumeHandle.GetReadPlan(dirEntryInodeNumber, &fileOffset, &response.FileSize)
		if nil != err {
			heldLocks.free()
			return
		}

		_ = appendReadPlanEntries(readPlan, readRangeOut)
	} else { // len(readRangeIn) > 0
		// Append each computed range

		for readRangeInIndex = range readRangeIn {
			readPlan, err = inodeVolumeHandle.GetReadPlan(dirEntryInodeNumber, readRangeIn[readRangeInIndex].Offset, readRangeIn[readRangeInIndex].Len)
			if nil != err {
				heldLocks.free()
				return
			}

			_ = appendReadPlanEntries(readPlan, readRangeOut)
		}
	}

	heldLocks.free()

	err = nil
	return
}

// MiddlewareHeadResponse resolves entityPath (following both directory and
// directory-entry symlinks) and returns a HeadResponse describing the resolved
// inode: times, size (0 for anything that is not a regular file, matching
// Swift semantics), IsDir, InodeNumber, NumWrites, and the MiddlewareStream
// metadata (empty when the stream does not exist).
func (vS *volumeStruct) MiddlewareHeadResponse(entityPath string) (response HeadResponse, err error) {
	var (
		dirEntryInodeNumber   inode.InodeNumber
		heldLocks             *heldLocksStruct
		retryRequired         bool
		stat                  Stat
		tryLockBackoffContext *tryLockBackoffContextStruct
	)

	startTime := time.Now()
	defer func() {
		globals.MiddlewareHeadResponseUsec.Add(uint64(time.Since(startTime) / time.Microsecond))
		if err != nil {
			globals.MiddlewareHeadResponseErrors.Add(1)
		}
	}()

	// Retry until done or failure (starting with ZERO backoff)

	tryLockBackoffContext = &tryLockBackoffContextStruct{}

Restart:

	// Perform backoff and update for each restart (starting with ZERO backoff of course)

	tryLockBackoffContext.backoff()

	// Construct fresh heldLocks for this restart

	heldLocks = newHeldLocks()

	_, dirEntryInodeNumber, _, _, retryRequired, err =
		vS.resolvePath(
			inode.RootDirInodeNumber,
			entityPath,
			heldLocks,
			resolvePathFollowDirEntrySymlinks|
				resolvePathFollowDirSymlinks)

	if nil != err {
		heldLocks.free()
		return
	}

	if retryRequired {
		heldLocks.free()
		goto Restart
	}

	// Now assemble response

	stat, err = vS.getstatHelperWhileLocked(dirEntryInodeNumber)
	if nil != err {
		heldLocks.free()
		return
	}

	// since resolvePathFollowDirEntrySymlinks is set on the call to
	// resolvePath(), above, we'll never see a symlink returned
	response.ModificationTime = stat[StatMTime]
	response.AttrChangeTime = stat[StatCTime]
	response.FileSize = stat[StatSize]
	response.IsDir = (stat[StatFType] == uint64(inode.DirType))
	response.InodeNumber = dirEntryInodeNumber
	response.NumWrites = stat[StatNumWrites]

	// Swift thinks all directories have a size of 0 (and symlinks as well)
	if stat[StatFType] != uint64(inode.FileType) {
		response.FileSize = 0
	}

	response.Metadata, err = vS.inodeVolumeHandle.GetStream(dirEntryInodeNumber, MiddlewareStream)
	if nil != err {
		heldLocks.free()
		response.Metadata = []byte{}
		// If someone makes a directory or file via SMB/FUSE and then
		// HEADs it via HTTP, we'll see this error. We treat it as
		// though there is no metadata. The middleware is equipped to
		// handle this case.
		if blunder.Is(err, blunder.StreamNotFound) {
			err = nil
		}
		return
	}

	heldLocks.free()
	return
}

// MiddlewarePost updates the MiddlewareStream (HTTP metadata) of
// parentDir/baseName to newMetaData, but only if the currently-stored
// metadata matches oldMetaData (compare-and-swap semantics); on a mismatch a
// TryAgainError is returned for the middleware to resolve. Missing path
// elements are created during resolution, and the target is locked exclusive.
func (vS *volumeStruct) MiddlewarePost(parentDir string, baseName string, newMetaData []byte, oldMetaData []byte) (err error) {
	var (
		dirEntryInodeNumber   inode.InodeNumber
		existingStreamData    []byte
		heldLocks             *heldLocksStruct
		retryRequired         bool
		tryLockBackoffContext *tryLockBackoffContextStruct
	)

	startTime := time.Now()
	defer func() {
		globals.MiddlewarePostUsec.Add(uint64(time.Since(startTime) / time.Microsecond))
		globals.MiddlewarePostBytes.Add(uint64(len(newMetaData)))
		if err != nil {
			globals.MiddlewarePostErrors.Add(1)
		}
	}()

	// Retry until done or failure (starting with ZERO backoff)

	tryLockBackoffContext = &tryLockBackoffContextStruct{}

Restart:

	// Perform backoff and update for each restart (starting with ZERO backoff of course)

	tryLockBackoffContext.backoff()

	// Construct fresh heldLocks for this restart

	heldLocks = newHeldLocks()

	_, dirEntryInodeNumber, _, _, retryRequired, err =
		vS.resolvePath(
			inode.RootDirInodeNumber,
			parentDir+"/"+baseName,
			heldLocks,
			resolvePathFollowDirEntrySymlinks|
				resolvePathFollowDirSymlinks|
				resolvePathCreateMissingPathElements|
				resolvePathRequireExclusiveLockOnDirEntryInode)

	if nil != err {
		heldLocks.free()
		return
	}

	if retryRequired {
		heldLocks.free()
		goto Restart
	}

	// Now apply MiddlewareStream update

	// Compare oldMetaData to existing existingStreamData to make sure that the HTTP metadata has not changed.
	// If it has changed, then return an error since middleware has to handle it.
2681 2682 existingStreamData, err = vS.inodeVolumeHandle.GetStream(dirEntryInodeNumber, MiddlewareStream) 2683 if nil != err { 2684 if blunder.Is(err, blunder.StreamNotFound) { 2685 err = nil 2686 existingStreamData = make([]byte, 0) 2687 } else { 2688 heldLocks.free() 2689 return 2690 } 2691 } 2692 2693 // Verify that the oldMetaData is the same as the one we think we are changing. 2694 2695 if !bytes.Equal(existingStreamData, oldMetaData) { 2696 heldLocks.free() 2697 err = blunder.NewError(blunder.TryAgainError, "MiddlewarePost(): MetaData different - existingStreamData: %v OldMetaData: %v", existingStreamData, oldMetaData) 2698 return 2699 } 2700 2701 // Change looks okay so make it. 2702 2703 err = vS.inodeVolumeHandle.PutStream(dirEntryInodeNumber, MiddlewareStream, newMetaData) 2704 if nil != err { 2705 heldLocks.free() 2706 return 2707 } 2708 2709 // PutStream() implicitly flushed... so, if it was a FileInode, we don't need to track it anymore 2710 2711 vS.untrackInFlightFileInodeData(dirEntryInodeNumber, false) 2712 2713 heldLocks.free() 2714 return 2715 } 2716 2717 func (vS *volumeStruct) MiddlewarePutComplete(vContainerName string, vObjectPath string, pObjectPaths []string, pObjectLengths []uint64, pObjectMetadata []byte) (mtime uint64, ctime uint64, fileInodeNumber inode.InodeNumber, numWrites uint64, err error) { 2718 var ( 2719 containerName string 2720 dirInodeNumber inode.InodeNumber 2721 dirEntryInodeNumber inode.InodeNumber 2722 dirEntryBasename string 2723 dirEntryInodeType inode.InodeType 2724 fileOffset uint64 2725 heldLocks *heldLocksStruct 2726 inodeVolumeHandle inode.VolumeHandle = vS.inodeVolumeHandle 2727 numPObjects int 2728 objectName string 2729 pObjectIndex int 2730 retryRequired bool 2731 stat Stat 2732 tryLockBackoffContext *tryLockBackoffContextStruct 2733 ) 2734 2735 startTime := time.Now() 2736 defer func() { 2737 globals.MiddlewarePutCompleteUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 2738 if err != nil { 2739 
globals.MiddlewarePutCompleteErrors.Add(1) 2740 } 2741 }() 2742 2743 // Validate (pObjectPaths,pObjectLengths) args 2744 2745 numPObjects = len(pObjectPaths) 2746 2747 if numPObjects != len(pObjectLengths) { 2748 blunder.NewError(blunder.InvalidArgError, "MiddlewarePutComplete() expects len(pObjectPaths) == len(pObjectLengths)") 2749 return 2750 } 2751 2752 // Retry until done or failure (starting with ZERO backoff) 2753 2754 tryLockBackoffContext = &tryLockBackoffContextStruct{} 2755 2756 Restart: 2757 2758 // Perform backoff and update for each restart (starting with ZERO backoff of course) 2759 2760 tryLockBackoffContext.backoff() 2761 2762 // Construct fresh heldLocks for this restart 2763 2764 heldLocks = newHeldLocks() 2765 2766 dirInodeNumber, dirEntryInodeNumber, dirEntryBasename, dirEntryInodeType, retryRequired, err = 2767 vS.resolvePath( 2768 inode.RootDirInodeNumber, 2769 vContainerName+"/"+vObjectPath, 2770 heldLocks, 2771 resolvePathFollowDirEntrySymlinks| 2772 resolvePathFollowDirSymlinks| 2773 resolvePathCreateMissingPathElements| 2774 resolvePathRequireExclusiveLockOnDirInode| 2775 resolvePathRequireExclusiveLockOnDirEntryInode) 2776 if nil != err { 2777 heldLocks.free() 2778 return 2779 } 2780 if retryRequired { 2781 heldLocks.free() 2782 goto Restart 2783 } 2784 2785 // The semantics of PUT mean that the existing object is discarded; with 2786 // a file we can just overwrite it, but symlinks or directories must be 2787 // removed (if possible). 
2788 if dirEntryInodeType != inode.FileType { 2789 2790 if dirEntryInodeType == inode.DirType { 2791 2792 // try to unlink the directory (rmdir flushes the inodes) 2793 err = vS.rmdirActual(dirInodeNumber, dirEntryBasename, dirEntryInodeNumber) 2794 if err != nil { 2795 // the directory was probably not empty 2796 heldLocks.free() 2797 return 2798 2799 } 2800 2801 } else { 2802 // unlink the symlink (unlink flushes the inodes) 2803 err = vS.unlinkActual(dirInodeNumber, dirEntryBasename, dirEntryInodeNumber) 2804 if err != nil { 2805 2806 // ReadOnlyError is my best guess for the failure 2807 err = blunder.NewError(blunder.ReadOnlyError, 2808 "MiddlewareMkdir(): vol '%s' failed to unlink '%s': %v", 2809 vS.volumeName, vContainerName+"/"+vObjectPath, err) 2810 heldLocks.free() 2811 return 2812 } 2813 } 2814 2815 // let resolvePath() create the file 2816 dirInodeNumber, dirEntryInodeNumber, dirEntryBasename, dirEntryInodeType, retryRequired, err = 2817 vS.resolvePath( 2818 inode.RootDirInodeNumber, 2819 vContainerName+"/"+vObjectPath, 2820 heldLocks, 2821 resolvePathFollowDirSymlinks| 2822 resolvePathCreateMissingPathElements| 2823 resolvePathDirEntryInodeMustBeFile| 2824 resolvePathRequireExclusiveLockOnDirInode| 2825 resolvePathRequireExclusiveLockOnDirEntryInode) 2826 if nil != err { 2827 heldLocks.free() 2828 return 2829 } 2830 if retryRequired { 2831 heldLocks.free() 2832 goto Restart 2833 } 2834 } 2835 2836 // Apply (pObjectPaths,pObjectLengths) to (erased) FileInode 2837 2838 fileOffset = 0 2839 2840 for pObjectIndex = 0; pObjectIndex < numPObjects; pObjectIndex++ { 2841 _, containerName, objectName, err = utils.PathToAcctContObj(pObjectPaths[pObjectIndex]) 2842 if nil != err { 2843 heldLocks.free() 2844 logger.DebugfIDWithError(internalDebug, err, "MiddlewarePutComplete(): failed utils.PathToAcctContObj(\"%s\") for dirEntryInodeNumber 0x%016X", pObjectPaths[pObjectIndex], dirEntryInodeNumber) 2845 return 2846 } 2847 2848 err = inodeVolumeHandle.Wrote( 2849 
dirEntryInodeNumber, 2850 containerName, 2851 objectName, 2852 []uint64{fileOffset}, 2853 []uint64{0}, 2854 []uint64{pObjectLengths[pObjectIndex]}, 2855 pObjectIndex > 0) // Initial pObjectIndex == 0 case will implicitly SetSize(,0) 2856 if nil != err { 2857 heldLocks.free() 2858 logger.DebugfIDWithError(internalDebug, err, "MiddlewarePutComplete(): failed inode.Wrote() for dirEntryInodeNumber 0x%016X", dirEntryInodeNumber) 2859 return 2860 } 2861 2862 fileOffset += pObjectLengths[pObjectIndex] 2863 } 2864 2865 // Apply pObjectMetadata to FileInode (this will flush it as well) 2866 2867 err = inodeVolumeHandle.PutStream(dirEntryInodeNumber, MiddlewareStream, pObjectMetadata) 2868 if err != nil { 2869 heldLocks.free() 2870 logger.DebugfIDWithError(internalDebug, err, "MiddlewarePutComplete(): failed PutStream() for dirEntryInodeNumber 0x%016X (pObjectMetadata: %v)", dirEntryInodeNumber, pObjectMetadata) 2871 return 2872 } 2873 2874 stat, err = vS.getstatHelperWhileLocked(dirEntryInodeNumber) 2875 if nil != err { 2876 heldLocks.free() 2877 return 2878 } 2879 2880 mtime = stat[StatMTime] 2881 ctime = stat[StatCTime] 2882 fileInodeNumber = dirEntryInodeNumber 2883 numWrites = stat[StatNumWrites] 2884 2885 heldLocks.free() 2886 return 2887 } 2888 2889 func (vS *volumeStruct) MiddlewareMkdir(vContainerName string, vObjectPath string, metadata []byte) (mtime uint64, ctime uint64, inodeNumber inode.InodeNumber, numWrites uint64, err error) { 2890 var ( 2891 dirInodeNumber inode.InodeNumber 2892 dirEntryInodeNumber inode.InodeNumber 2893 dirEntryBasename string 2894 dirEntryInodeType inode.InodeType 2895 heldLocks *heldLocksStruct 2896 retryRequired bool 2897 stat Stat 2898 tryLockBackoffContext *tryLockBackoffContextStruct 2899 ) 2900 2901 startTime := time.Now() 2902 defer func() { 2903 globals.MiddlewareMkdirUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 2904 if err != nil { 2905 globals.MiddlewareMkdirErrors.Add(1) 2906 } 2907 }() 2908 2909 // Retry until 
done or failure (starting with ZERO backoff) 2910 2911 tryLockBackoffContext = &tryLockBackoffContextStruct{} 2912 2913 Restart: 2914 2915 // Perform backoff and update for each restart (starting with ZERO backoff of course) 2916 2917 tryLockBackoffContext.backoff() 2918 2919 // Construct fresh heldLocks for this restart 2920 2921 heldLocks = newHeldLocks() 2922 2923 // Resolve the object, locking it and its parent directory exclusive 2924 dirInodeNumber, dirEntryInodeNumber, dirEntryBasename, dirEntryInodeType, retryRequired, err = 2925 vS.resolvePath( 2926 inode.RootDirInodeNumber, 2927 vContainerName+"/"+vObjectPath, 2928 heldLocks, 2929 resolvePathFollowDirSymlinks| 2930 resolvePathCreateMissingPathElements| 2931 resolvePathRequireExclusiveLockOnDirInode| 2932 resolvePathRequireExclusiveLockOnDirEntryInode) 2933 if nil != err { 2934 heldLocks.free() 2935 return 2936 } 2937 if retryRequired { 2938 heldLocks.free() 2939 goto Restart 2940 } 2941 2942 // The semantics of PUT for a directory object require that an existing 2943 // file or symlink be discarded and be replaced with a directory (an 2944 // existing directory is fine; it just has its headers overwritten). 
2945 if dirEntryInodeType != inode.DirType { 2946 2947 // unlink the file or symlink (unlink flushes the inodes) 2948 err = vS.unlinkActual(dirInodeNumber, dirEntryBasename, dirEntryInodeNumber) 2949 if err != nil { 2950 2951 // ReadOnlyError is my best guess for the failure 2952 err = blunder.NewError(blunder.ReadOnlyError, 2953 "MiddlewareMkdir(): vol '%s' failed to unlink '%s': %v", 2954 vS.volumeName, vContainerName+"/"+vObjectPath, err) 2955 heldLocks.free() 2956 return 2957 } 2958 2959 // let resolvePath() make the directory 2960 dirInodeNumber, dirEntryInodeNumber, dirEntryBasename, dirEntryInodeType, retryRequired, err = 2961 vS.resolvePath( 2962 inode.RootDirInodeNumber, 2963 vContainerName+"/"+vObjectPath, 2964 heldLocks, 2965 resolvePathFollowDirSymlinks| 2966 resolvePathCreateMissingPathElements| 2967 resolvePathDirEntryInodeMustBeDirectory| 2968 resolvePathRequireExclusiveLockOnDirInode| 2969 resolvePathRequireExclusiveLockOnDirEntryInode) 2970 if nil != err { 2971 heldLocks.free() 2972 return 2973 } 2974 if retryRequired { 2975 heldLocks.free() 2976 goto Restart 2977 } 2978 } 2979 2980 err = vS.inodeVolumeHandle.PutStream(dirEntryInodeNumber, MiddlewareStream, metadata) 2981 if err != nil { 2982 heldLocks.free() 2983 logger.DebugfIDWithError(internalDebug, err, "MiddlewareHeadResponse(): failed PutStream() for for dirEntryInodeNumber 0x%016X (pObjectMetadata: %v)", dirEntryInodeNumber, metadata) 2984 return 2985 } 2986 2987 stat, err = vS.getstatHelperWhileLocked(dirEntryInodeNumber) 2988 if nil != err { 2989 heldLocks.free() 2990 return 2991 } 2992 2993 mtime = stat[StatMTime] 2994 ctime = stat[StatCTime] 2995 inodeNumber = dirEntryInodeNumber 2996 numWrites = stat[StatNumWrites] 2997 2998 heldLocks.free() 2999 return 3000 } 3001 3002 func (vS *volumeStruct) MiddlewarePutContainer(containerName string, oldMetadata []byte, newMetadata []byte) (err error) { 3003 var ( 3004 containerInodeLock *dlm.RWLockStruct 3005 containerInodeNumber inode.InodeNumber 
3006 existingMetadata []byte 3007 newDirInodeLock *dlm.RWLockStruct 3008 newDirInodeNumber inode.InodeNumber 3009 ) 3010 3011 startTime := time.Now() 3012 defer func() { 3013 globals.MiddlewarePutContainerUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 3014 globals.MiddlewarePutContainerBytes.Add(uint64(len(newMetadata))) 3015 if err != nil { 3016 globals.MiddlewarePutContainerErrors.Add(1) 3017 } 3018 }() 3019 3020 vS.jobRWMutex.RLock() 3021 defer vS.jobRWMutex.RUnlock() 3022 3023 // Yes, it's a heavy lock to hold on the root inode. However, we 3024 // might need to add a new directory entry there, so there's not 3025 // much else we can do. 3026 rootInodeLock, err := vS.inodeVolumeHandle.GetWriteLock(inode.RootDirInodeNumber, nil) 3027 if nil != err { 3028 return 3029 } 3030 defer rootInodeLock.Unlock() 3031 3032 containerInodeNumber, err = vS.inodeVolumeHandle.Lookup(inode.RootDirInodeNumber, containerName) 3033 if err != nil && blunder.IsNot(err, blunder.NotFoundError) { 3034 return 3035 } else if err != nil { 3036 // No such container, so we create it 3037 err = validateBaseName(containerName) 3038 if err != nil { 3039 return 3040 } 3041 3042 newDirInodeNumber, err = vS.inodeVolumeHandle.CreateDir(inode.PosixModePerm, 0, 0) 3043 if err != nil { 3044 logger.ErrorWithError(err) 3045 return 3046 } 3047 3048 newDirInodeLock, err = vS.inodeVolumeHandle.GetWriteLock(newDirInodeNumber, nil) 3049 defer newDirInodeLock.Unlock() 3050 3051 err = vS.inodeVolumeHandle.PutStream(newDirInodeNumber, MiddlewareStream, newMetadata) 3052 if err != nil { 3053 logger.ErrorWithError(err) 3054 return 3055 } 3056 3057 err = vS.inodeVolumeHandle.Link(inode.RootDirInodeNumber, containerName, newDirInodeNumber, false) 3058 3059 return 3060 } 3061 3062 containerInodeLock, err = vS.inodeVolumeHandle.GetWriteLock(containerInodeNumber, nil) 3063 if err != nil { 3064 return 3065 } 3066 defer containerInodeLock.Unlock() 3067 3068 // Existing container: just update the metadata 3069 
existingMetadata, err = vS.inodeVolumeHandle.GetStream(containerInodeNumber, MiddlewareStream) 3070 3071 // GetStream() will return an error if there is no "middleware" stream 3072 if err != nil && blunder.IsNot(err, blunder.StreamNotFound) { 3073 return 3074 } else if err != nil { 3075 existingMetadata = []byte{} 3076 } 3077 3078 // Only change it if the caller sent the current value 3079 if !bytes.Equal(existingMetadata, oldMetadata) { 3080 err = blunder.NewError(blunder.TryAgainError, "Metadata differs - actual: %v request: %v", existingMetadata, oldMetadata) 3081 return 3082 } 3083 err = vS.inodeVolumeHandle.PutStream(containerInodeNumber, MiddlewareStream, newMetadata) 3084 3085 return 3086 } 3087 3088 func (vS *volumeStruct) Mkdir(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, inodeNumber inode.InodeNumber, basename string, filePerm inode.InodeMode) (newDirInodeNumber inode.InodeNumber, err error) { 3089 startTime := time.Now() 3090 defer func() { 3091 globals.MkdirUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 3092 if err != nil { 3093 globals.MkdirErrors.Add(1) 3094 } 3095 }() 3096 3097 vS.jobRWMutex.RLock() 3098 defer vS.jobRWMutex.RUnlock() 3099 3100 // Make sure the file basename is not too long 3101 err = validateBaseName(basename) 3102 if err != nil { 3103 return 0, err 3104 } 3105 3106 newDirInodeNumber, err = vS.inodeVolumeHandle.CreateDir(filePerm, userID, groupID) 3107 if err != nil { 3108 logger.ErrorWithError(err) 3109 return 0, err 3110 } 3111 3112 inodeLock, err := vS.inodeVolumeHandle.InitInodeLock(inodeNumber, nil) 3113 if err != nil { 3114 return 3115 } 3116 err = inodeLock.WriteLock() 3117 if err != nil { 3118 return 3119 } 3120 defer inodeLock.Unlock() 3121 3122 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.F_OK, 3123 inode.NoOverride) { 3124 3125 destroyErr := vS.inodeVolumeHandle.Destroy(newDirInodeNumber) 3126 if destroyErr != nil { 3127 
logger.WarnfWithError(destroyErr, "couldn't destroy inode %v after failed Access(F_OK) in fs.Mkdir", newDirInodeNumber) 3128 } 3129 err = blunder.NewError(blunder.NotFoundError, "ENOENT") 3130 return 0, err 3131 } 3132 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.W_OK|inode.X_OK, 3133 inode.NoOverride) { 3134 3135 destroyErr := vS.inodeVolumeHandle.Destroy(newDirInodeNumber) 3136 if destroyErr != nil { 3137 logger.WarnfWithError(destroyErr, "couldn't destroy inode %v after failed Access(W_OK|X_OK) in fs.Mkdir", newDirInodeNumber) 3138 } 3139 err = blunder.NewError(blunder.PermDeniedError, "EACCES") 3140 return 0, err 3141 } 3142 3143 err = vS.inodeVolumeHandle.Link(inodeNumber, basename, newDirInodeNumber, false) 3144 if err != nil { 3145 destroyErr := vS.inodeVolumeHandle.Destroy(newDirInodeNumber) 3146 if destroyErr != nil { 3147 logger.WarnfWithError(destroyErr, "couldn't destroy inode %v after failed Link() in fs.Mkdir", newDirInodeNumber) 3148 } 3149 return 0, err 3150 } 3151 3152 return newDirInodeNumber, nil 3153 } 3154 3155 func (vS *volumeStruct) RemoveXAttr(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, inodeNumber inode.InodeNumber, streamName string) (err error) { 3156 startTime := time.Now() 3157 defer func() { 3158 globals.RemoveXAttrUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 3159 if err != nil { 3160 globals.RemoveXAttrErrors.Add(1) 3161 } 3162 }() 3163 3164 vS.jobRWMutex.RLock() 3165 defer vS.jobRWMutex.RUnlock() 3166 3167 inodeLock, err := vS.inodeVolumeHandle.InitInodeLock(inodeNumber, nil) 3168 if err != nil { 3169 return 3170 } 3171 err = inodeLock.WriteLock() 3172 if err != nil { 3173 return 3174 } 3175 defer inodeLock.Unlock() 3176 3177 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.F_OK, 3178 inode.NoOverride) { 3179 err = blunder.NewError(blunder.NotFoundError, "ENOENT") 3180 return 3181 } 3182 if 
// Rename atomically moves srcDirInodeNumber/srcBasename to
// dstDirInodeNumber/dstBasename via inode.Move().
//
// Locking: uses the resolvePath()/heldLocksStruct machinery. All needed
// exclusive locks are collected under one heldLocks set; whenever
// resolvePath() reports retryRequired (lock contention), every held lock is
// released and the whole sequence restarts after a backoff (goto Restart).
// The statement order here is load-bearing — locks must all be taken before
// the Move() and freed exactly once on every exit path.
func (vS *volumeStruct) Rename(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, srcDirInodeNumber inode.InodeNumber, srcBasename string, dstDirInodeNumber inode.InodeNumber, dstBasename string) (err error) {
	var (
		dirEntryBasename      string
		dirEntryInodeNumber   inode.InodeNumber
		dirInodeNumber        inode.InodeNumber
		heldLocks             *heldLocksStruct
		retryRequired         bool
		tryLockBackoffContext *tryLockBackoffContextStruct
	)

	startTime := time.Now()
	defer func() {
		globals.RenameUsec.Add(uint64(time.Since(startTime) / time.Microsecond))
		if err != nil {
			globals.RenameErrors.Add(1)
		}
	}()

	vS.jobRWMutex.RLock()
	defer vS.jobRWMutex.RUnlock()

	err = validateBaseName(srcBasename)
	if nil != err {
		return
	}

	err = validateBaseName(dstBasename)
	if nil != err {
		return
	}

	// Retry until done or failure (starting with ZERO backoff)

	tryLockBackoffContext = &tryLockBackoffContextStruct{}

Restart:

	// Perform backoff and update for each restart (starting with ZERO backoff of course)

	tryLockBackoffContext.backoff()

	// Construct fresh heldLocks for this restart

	heldLocks = newHeldLocks()

	// Acquire WriteLock on {srcDirInodeNumber,srcBasename} & perform Access Check

	dirInodeNumber, _, dirEntryBasename, _, retryRequired, err =
		vS.resolvePath(
			srcDirInodeNumber,
			srcBasename,
			heldLocks,
			resolvePathRequireExclusiveLockOnDirEntryInode|
				resolvePathRequireExclusiveLockOnDirInode)

	if nil != err {
		heldLocks.free()
		err = blunder.AddError(err, blunder.NotFoundError)
		return
	}

	if retryRequired {
		heldLocks.free()
		goto Restart
	}

	// resolvePath() must have landed exactly on the requested entry —
	// anything else (e.g. a symlink was followed) is rejected
	if (dirInodeNumber != srcDirInodeNumber) || (dirEntryBasename != srcBasename) {
		heldLocks.free()
		err = blunder.NewError(blunder.InvalidArgError, "EINVAL")
		return
	}

	if !vS.inodeVolumeHandle.Access(srcDirInodeNumber, userID, groupID, otherGroupIDs, inode.W_OK|inode.X_OK, inode.NoOverride) {
		heldLocks.free()
		err = blunder.NewError(blunder.PermDeniedError, "EACCES")
		return
	}
	// Acquire WriteLock on dstDirInodeNumber & perform Access Check

	_, dirEntryInodeNumber, _, _, retryRequired, err =
		vS.resolvePath(
			dstDirInodeNumber,
			".",
			heldLocks,
			resolvePathDirEntryInodeMustBeDirectory|
				resolvePathRequireExclusiveLockOnDirEntryInode)

	if nil != err {
		heldLocks.free()
		err = blunder.AddError(err, blunder.NotFoundError)
		return
	}

	if retryRequired {
		heldLocks.free()
		goto Restart
	}

	if dirEntryInodeNumber != dstDirInodeNumber {
		heldLocks.free()
		err = blunder.NewError(blunder.InvalidArgError, "EINVAL")
		return
	}

	if !vS.inodeVolumeHandle.Access(dstDirInodeNumber, userID, groupID, otherGroupIDs, inode.W_OK|inode.X_OK, inode.NoOverride) {
		heldLocks.free()
		err = blunder.NewError(blunder.PermDeniedError, "EACCES")
		return
	}

	// Acquire WriteLock on dstBasename if it exists

	dirInodeNumber, _, dirEntryBasename, _, retryRequired, err =
		vS.resolvePath(
			dstDirInodeNumber,
			dstBasename,
			heldLocks,
			resolvePathRequireExclusiveLockOnDirEntryInode)

	if nil == err {
		if retryRequired {
			heldLocks.free()
			goto Restart
		}

		if (dirInodeNumber != dstDirInodeNumber) || (dirEntryBasename != dstBasename) {
			heldLocks.free()
			err = blunder.NewError(blunder.InvalidArgError, "EINVAL")
			return
		}
	} else {
		// This is actually OK... it means the target path of the Rename() isn't being potentially replaced
	}

	// Locks held & Access Checks succeeded... time to do the Move

	err = vS.inodeVolumeHandle.Move(srcDirInodeNumber, srcBasename, dstDirInodeNumber, dstBasename)

	heldLocks.free()

	return // err returned from inode.Move() suffices here
}
3319 heldLocks.free() 3320 goto Restart 3321 } 3322 3323 if (dirInodeNumber != dstDirInodeNumber) || (dirEntryBasename != dstBasename) { 3324 heldLocks.free() 3325 err = blunder.NewError(blunder.InvalidArgError, "EINVAL") 3326 return 3327 } 3328 } else { 3329 // This is actually OK... it means the target path of the Rename() isn't being potentially replaced 3330 } 3331 3332 // Locks held & Access Checks succeeded... time to do the Move 3333 3334 err = vS.inodeVolumeHandle.Move(srcDirInodeNumber, srcBasename, dstDirInodeNumber, dstBasename) 3335 3336 heldLocks.free() 3337 3338 return // err returned from inode.Move() suffices here 3339 } 3340 3341 func (vS *volumeStruct) Read(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, inodeNumber inode.InodeNumber, offset uint64, length uint64, profiler *utils.Profiler) (buf []byte, err error) { 3342 startTime := time.Now() 3343 defer func() { 3344 globals.ReadUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 3345 globals.ReadBytes.Add(uint64(len(buf))) 3346 if err != nil { 3347 globals.ReadErrors.Add(1) 3348 } 3349 }() 3350 3351 vS.jobRWMutex.RLock() 3352 defer vS.jobRWMutex.RUnlock() 3353 3354 inodeLock, err := vS.inodeVolumeHandle.InitInodeLock(inodeNumber, nil) 3355 if err != nil { 3356 return 3357 } 3358 err = inodeLock.ReadLock() 3359 if err != nil { 3360 return 3361 } 3362 defer inodeLock.Unlock() 3363 3364 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.F_OK, 3365 inode.NoOverride) { 3366 err = blunder.NewError(blunder.NotFoundError, "ENOENT") 3367 return 3368 } 3369 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.R_OK, 3370 inode.OwnerOverride) { 3371 err = blunder.NewError(blunder.PermDeniedError, "EACCES") 3372 return 3373 } 3374 3375 inodeType, err := vS.inodeVolumeHandle.GetType(inodeNumber) 3376 if err != nil { 3377 logger.ErrorfWithError(err, "couldn't get type for inode %v", inodeNumber) 3378 
// readdirHelper is the shared implementation behind Readdir() and
// ReaddirPlus(): it reads up to maxEntries directory entries from
// inodeNumber (continuing after prevReturned) and then stats each entry to
// fill in statEntries and each DirEntry.Type.
//
// Locking: uses AttemptReadLock() (non-blocking) everywhere. ANY lock
// acquisition failure — including one deep inside the per-entry stat loop —
// jumps back to Restart after a backoff, which re-reads the directory from
// scratch. Per-entry stat failures do NOT restart: the entry's Type is
// defaulted to inode.DirType and the error is swallowed (logged).
func (vS *volumeStruct) readdirHelper(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, inodeNumber inode.InodeNumber, maxEntries uint64, prevReturned ...interface{}) (dirEntries []inode.DirEntry, statEntries []Stat, numEntries uint64, areMoreEntries bool, err error) {
	var (
		dirEntryIndex         uint64
		dlmCallerID           dlm.CallerID
		inodeLock             *dlm.RWLockStruct
		inodeVolumeHandle     inode.VolumeHandle
		internalErr           error
		tryLockBackoffContext *tryLockBackoffContextStruct
	)

	vS.jobRWMutex.RLock()
	defer vS.jobRWMutex.RUnlock()

	dlmCallerID = dlm.GenerateCallerID()
	inodeVolumeHandle = vS.inodeVolumeHandle

	tryLockBackoffContext = &tryLockBackoffContextStruct{}

Restart:

	tryLockBackoffContext.backoff()

	inodeLock, err = inodeVolumeHandle.AttemptReadLock(inodeNumber, dlmCallerID)
	if nil != err {
		goto Restart
	}

	// Existence (F_OK) then readability (R_OK, owner override); the lock is
	// explicitly dropped on each failure path since there is no defer here
	if !inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.F_OK, inode.NoOverride) {
		internalErr = inodeLock.Unlock()
		if nil != internalErr {
			logger.Fatalf("Failure unlocking a held LockID %s: %v", inodeLock.LockID, internalErr)
		}
		err = blunder.NewError(blunder.NotFoundError, "ENOENT")
		return
	}
	if !inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.R_OK, inode.OwnerOverride) {
		internalErr = inodeLock.Unlock()
		if nil != internalErr {
			logger.Fatalf("Failure unlocking a held LockID %s: %v", inodeLock.LockID, internalErr)
		}
		err = blunder.NewError(blunder.PermDeniedError, "EACCES")
		return
	}

	dirEntries, areMoreEntries, err = inodeVolumeHandle.ReadDir(inodeNumber, maxEntries, 0, prevReturned...)

	internalErr = inodeLock.Unlock()
	if nil != internalErr {
		logger.Fatalf("Failure unlocking a held LockID %s: %v", inodeLock.LockID, internalErr)
	}

	if nil != err {
		return
	}

	// Now go back and fill in (dirEntries.Type and) statEntries

	numEntries = uint64(len(dirEntries))

	statEntries = make([]Stat, numEntries, numEntries)

	for dirEntryIndex = 0; dirEntryIndex < numEntries; dirEntryIndex++ {
		// Lock each entry individually; contention restarts the whole scan
		inodeLock, err = inodeVolumeHandle.AttemptReadLock(dirEntries[dirEntryIndex].InodeNumber, dlmCallerID)
		if nil != err {
			goto Restart
		}

		statEntries[dirEntryIndex], err = vS.getstatHelperWhileLocked(dirEntries[dirEntryIndex].InodeNumber)

		internalErr = inodeLock.Unlock()
		if nil != internalErr {
			logger.Fatalf("Failure unlocking a held LockID %s: %v", inodeLock.LockID, internalErr)
		}

		if nil == err {
			dirEntries[dirEntryIndex].Type = inode.InodeType(statEntries[dirEntryIndex][StatFType])
		} else {
			// Best-effort: a failed stat is logged and the entry is kept
			// with a defaulted Type rather than failing the whole readdir
			logger.ErrorfWithError(err, "fs.readdirHelper(,,,inodeNumber:0x%016X,,...) couldn't `stat` %s:0x%016X... defaulting .Type to inode.DirType", inodeNumber, dirEntries[dirEntryIndex].Basename, dirEntries[dirEntryIndex].InodeNumber)
			dirEntries[dirEntryIndex].Type = inode.DirType
			err = nil
		}
	}

	return
}
defaulting .Type to inode.DirType", inodeNumber, dirEntries[dirEntryIndex].Basename, dirEntries[dirEntryIndex].InodeNumber) 3477 dirEntries[dirEntryIndex].Type = inode.DirType 3478 err = nil 3479 } 3480 } 3481 3482 return 3483 } 3484 3485 func (vS *volumeStruct) Readdir(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, inodeNumber inode.InodeNumber, maxEntries uint64, prevReturned ...interface{}) (entries []inode.DirEntry, numEntries uint64, areMoreEntries bool, err error) { 3486 startTime := time.Now() 3487 defer func() { 3488 globals.ReaddirUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 3489 globals.ReaddirEntries.Add(uint64(len(entries))) 3490 if err != nil { 3491 globals.ReaddirErrors.Add(1) 3492 } 3493 }() 3494 3495 entries, _, numEntries, areMoreEntries, err = vS.readdirHelper(userID, groupID, otherGroupIDs, inodeNumber, maxEntries, prevReturned...) 3496 3497 return 3498 } 3499 3500 func (vS *volumeStruct) ReaddirPlus(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, inodeNumber inode.InodeNumber, maxEntries uint64, prevReturned ...interface{}) (dirEntries []inode.DirEntry, statEntries []Stat, numEntries uint64, areMoreEntries bool, err error) { 3501 startTime := time.Now() 3502 defer func() { 3503 globals.ReaddirPlusUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 3504 globals.ReaddirPlusBytes.Add(uint64(len(dirEntries))) 3505 if err != nil { 3506 globals.ReaddirPlusErrors.Add(1) 3507 } 3508 }() 3509 3510 dirEntries, statEntries, numEntries, areMoreEntries, err = vS.readdirHelper(userID, groupID, otherGroupIDs, inodeNumber, maxEntries, prevReturned...) 
3511 3512 return 3513 } 3514 3515 func (vS *volumeStruct) Readsymlink(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, inodeNumber inode.InodeNumber) (target string, err error) { 3516 startTime := time.Now() 3517 defer func() { 3518 globals.ReadsymlinkUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 3519 if err != nil { 3520 globals.ReadsymlinkErrors.Add(1) 3521 } 3522 }() 3523 3524 vS.jobRWMutex.RLock() 3525 defer vS.jobRWMutex.RUnlock() 3526 3527 inodeLock, err := vS.inodeVolumeHandle.InitInodeLock(inodeNumber, nil) 3528 if err != nil { 3529 return 3530 } 3531 err = inodeLock.ReadLock() 3532 if err != nil { 3533 return 3534 } 3535 defer inodeLock.Unlock() 3536 3537 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.F_OK, 3538 inode.NoOverride) { 3539 3540 err = blunder.NewError(blunder.NotFoundError, "ENOENT") 3541 return 3542 } 3543 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.R_OK, 3544 inode.NoOverride) { 3545 3546 err = blunder.NewError(blunder.PermDeniedError, "EACCES") 3547 return 3548 } 3549 3550 target, err = vS.inodeVolumeHandle.GetSymlink(inodeNumber) 3551 3552 return target, err 3553 } 3554 3555 func (vS *volumeStruct) Resize(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, inodeNumber inode.InodeNumber, newSize uint64) (err error) { 3556 startTime := time.Now() 3557 defer func() { 3558 globals.ResizeUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 3559 if err != nil { 3560 globals.ResizeErrors.Add(1) 3561 } 3562 }() 3563 3564 vS.jobRWMutex.RLock() 3565 defer vS.jobRWMutex.RUnlock() 3566 3567 inodeLock, err := vS.inodeVolumeHandle.InitInodeLock(inodeNumber, nil) 3568 if err != nil { 3569 return 3570 } 3571 err = inodeLock.WriteLock() 3572 if err != nil { 3573 return 3574 } 3575 defer inodeLock.Unlock() 3576 3577 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, 
otherGroupIDs, inode.F_OK, 3578 inode.NoOverride) { 3579 3580 err = blunder.NewError(blunder.NotFoundError, "ENOENT") 3581 return 3582 } 3583 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.W_OK, 3584 inode.OwnerOverride) { 3585 3586 err = blunder.NewError(blunder.PermDeniedError, "EACCES") 3587 return 3588 } 3589 3590 err = vS.inodeVolumeHandle.SetSize(inodeNumber, newSize) 3591 vS.untrackInFlightFileInodeData(inodeNumber, false) 3592 3593 return err 3594 } 3595 3596 func (vS *volumeStruct) Rmdir(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, inodeNumber inode.InodeNumber, basename string) (err error) { 3597 startTime := time.Now() 3598 defer func() { 3599 globals.RmdirUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 3600 if err != nil { 3601 globals.RmdirErrors.Add(1) 3602 } 3603 }() 3604 3605 vS.jobRWMutex.RLock() 3606 defer vS.jobRWMutex.RUnlock() 3607 3608 callerID := dlm.GenerateCallerID() 3609 inodeLock, err := vS.inodeVolumeHandle.InitInodeLock(inodeNumber, callerID) 3610 if err != nil { 3611 return 3612 } 3613 err = inodeLock.WriteLock() 3614 if err != nil { 3615 return 3616 } 3617 defer inodeLock.Unlock() 3618 3619 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.F_OK, 3620 inode.NoOverride) { 3621 err = blunder.NewError(blunder.NotFoundError, "ENOENT") 3622 return 3623 } 3624 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.W_OK|inode.X_OK, 3625 inode.NoOverride) { 3626 err = blunder.NewError(blunder.PermDeniedError, "EACCES") 3627 return 3628 } 3629 3630 basenameInodeNumber, err := vS.inodeVolumeHandle.Lookup(inodeNumber, basename) 3631 if nil != err { 3632 return 3633 } 3634 3635 basenameInodeLock, err := vS.inodeVolumeHandle.InitInodeLock(basenameInodeNumber, callerID) 3636 if err != nil { 3637 return 3638 } 3639 err = basenameInodeLock.WriteLock() 3640 if err != nil { 3641 return 3642 } 3643 defer 
// rmdirActual unlinks and destroys directory basenameInodeNumber (entry
// basename of directory inodeNumber). Caller must hold write locks on both
// inodes. Fails with NotDirError for non-directories and NotEmptyError for
// non-empty directories.
func (vS *volumeStruct) rmdirActual(inodeNumber inode.InodeNumber,
	basename string, basenameInodeNumber inode.InodeNumber) (err error) {

	basenameInodeType, err := vS.inodeVolumeHandle.GetType(basenameInodeNumber)
	if nil != err {
		return
	}

	if inode.DirType != basenameInodeType {
		err = fmt.Errorf("Rmdir() called on non-Directory")
		err = blunder.AddError(err, blunder.NotDirError)
		return
	}

	dirEntries, err := vS.inodeVolumeHandle.NumDirEntries(basenameInodeNumber)
	if nil != err {
		return
	}

	// 2 entries == "." and ".." only, i.e. the directory is empty
	if 2 != dirEntries {
		err = fmt.Errorf("Directory not empty")
		err = blunder.AddError(err, blunder.NotEmptyError)
		return
	}

	err = vS.inodeVolumeHandle.Unlink(inodeNumber, basename, false)
	if nil != err {
		return
	}

	err = vS.inodeVolumeHandle.Destroy(basenameInodeNumber)
	if nil != err {
		return
	}

	return
}

// Setstat applies the attributes present in stat (size, mode, uid, gid,
// crtime/mtime/atime) to inodeNumber. ALL permission and range checks are
// performed before the first mutation, so a rejected request leaves the
// inode untouched; once mutations begin, a mid-sequence failure returns
// with earlier changes already applied. Attempts to set ctime are
// deliberately ignored (logged only).
func (vS *volumeStruct) Setstat(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, inodeNumber inode.InodeNumber, stat Stat) (err error) {
	startTime := time.Now()
	defer func() {
		globals.SetstatUsec.Add(uint64(time.Since(startTime) / time.Microsecond))
		if err != nil {
			globals.SetstatErrors.Add(1)
		}
	}()

	vS.jobRWMutex.RLock()
	defer vS.jobRWMutex.RUnlock()

	inodeLock, err := vS.inodeVolumeHandle.InitInodeLock(inodeNumber, nil)
	if err != nil {
		return
	}
	err = inodeLock.WriteLock()
	if err != nil {
		return
	}
	defer inodeLock.Unlock()

	// P_OK: caller must be owner (or root) to touch attributes at all
	if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.P_OK,
		inode.NoOverride) {
		err = blunder.NewError(blunder.NotPermError, "EPERM")
		return
	}

	// perform all permissions checks before making any changes
	//
	// changing the filesize requires write permission
	_, ok := stat[StatSize]
	if ok {
		if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.W_OK,
			inode.OwnerOverride) {
			err = blunder.NewError(blunder.NotPermError, "EPERM")
			return
		}
	}

	// most other attributes can only be changed by the owner of the file
	// (NOTE: this P_OK/NoOverride check repeats the one above — redundant
	// but harmless, preserved as-is)
	ownerOnly := []StatKey{StatCTime, StatCRTime, StatMTime, StatATime, StatMode, StatUserID, StatGroupID}
	for _, key := range ownerOnly {
		_, ok := stat[key]
		if ok {
			if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.P_OK,
				inode.NoOverride) {
				err = blunder.NewError(blunder.NotPermError, "EPERM")
				return
			}
			break
		}
	}

	// the superuser (root) is the only one that can change the owner of the file to a
	// different user, but the owner of the file can perform a no-op "change" in
	// ownership
	newUserID, settingUserID := stat[StatUserID]
	if settingUserID && userID != inode.InodeRootUserID {
		if userID != inode.InodeUserID(newUserID) {
			err = blunder.NewError(blunder.NotPermError, "EPERM")
			return
		}
	}

	// the group can only be changed to the current group or another group the owner
	// is in (unless its the superuser asking)
	newGroupID, settingGroupID := stat[StatGroupID]
	if settingGroupID && groupID != inode.InodeGroupID(newGroupID) && userID != inode.InodeRootUserID {

		// err doubles as the "not a member" flag; cleared if a match is found
		err = blunder.NewError(blunder.NotPermError, "EPERM")
		for _, otherID := range otherGroupIDs {
			if inode.InodeGroupID(newGroupID) == otherID {
				err = nil
				break
			}
		}
		if err != nil {
			return
		}
	}

	// sanity checks for invalid/illegal values
	if settingUserID {
		// Since we are using a uint64 to convey a uint32 value, make sure we didn't get something too big
		if newUserID > uint64(math.MaxUint32) {
			err = fmt.Errorf("%s: userID is too large - value is %v, max is %v.", utils.GetFnName(), newUserID, uint64(math.MaxUint32))
			err = blunder.AddError(err, blunder.InvalidUserIDError)
			return
		}
	}

	if settingGroupID {
		// Since we are using a uint64 to convey a uint32 value, make sure we didn't get something too big
		if newGroupID > uint64(math.MaxUint32) {
			err = fmt.Errorf("%s: groupID is too large - value is %v, max is %v.", utils.GetFnName(), newGroupID, uint64(math.MaxUint32))
			err = blunder.AddError(err, blunder.InvalidGroupIDError)
			return
		}
	}

	filePerm, settingFilePerm := stat[StatMode]
	if settingFilePerm {
		// Since we are using a uint64 to convey a 12 bit value, make sure we didn't get something too big
		if filePerm >= 1<<12 {
			err = fmt.Errorf("%s: filePerm is too large - value is %v, max is %v.", utils.GetFnName(),
				filePerm, 1<<12)
			err = blunder.AddError(err, blunder.InvalidFileModeError)
			return
		}
	}

	// get to work setting things
	//
	// Set permissions, if present in the map
	if settingFilePerm {
		err = vS.inodeVolumeHandle.SetPermMode(inodeNumber, inode.InodeMode(filePerm))
		if err != nil {
			logger.ErrorWithError(err)
			return err
		}
	}

	// set owner and/or group owner, if present in the map
	err = nil
	if settingUserID && settingGroupID {
		err = vS.inodeVolumeHandle.SetOwnerUserIDGroupID(inodeNumber, inode.InodeUserID(newUserID),
			inode.InodeGroupID(newGroupID))
	} else if settingUserID {
		err = vS.inodeVolumeHandle.SetOwnerUserID(inodeNumber, inode.InodeUserID(newUserID))
	} else if settingGroupID {
		err = vS.inodeVolumeHandle.SetOwnerGroupID(inodeNumber, inode.InodeGroupID(newGroupID))
	}
	if err != nil {
		logger.ErrorWithError(err)
		return
	}

	// Set crtime, if present in the map
	crtime, ok := stat[StatCRTime]
	if ok {
		newCreationTime := time.Unix(0, int64(crtime))
		err = vS.inodeVolumeHandle.SetCreationTime(inodeNumber, newCreationTime)
		if err != nil {
			logger.ErrorWithError(err)
			return err
		}
	}

	// Set mtime, if present in the map
	mtime, ok := stat[StatMTime]
	if ok {
		newModificationTime := time.Unix(0, int64(mtime))
		err = vS.inodeVolumeHandle.SetModificationTime(inodeNumber, newModificationTime)
		if err != nil {
			logger.ErrorWithError(err)
			return err
		}
	}

	// Set atime, if present in the map
	atime, ok := stat[StatATime]
	if ok {
		newAccessTime := time.Unix(0, int64(atime))
		err = vS.inodeVolumeHandle.SetAccessTime(inodeNumber, newAccessTime)
		if err != nil {
			logger.ErrorWithError(err)
			return err
		}
	}

	// ctime is used to reliably determine whether the contents of a file
	// have changed so it cannot be altered by a client (some security
	// software depends on this)
	// (the local is misnamed newAccessTime; it holds the requested ctime)
	ctime, ok := stat[StatCTime]
	if ok {
		newAccessTime := time.Unix(0, int64(ctime))
		logger.Infof("%s: ignoring attempt to change ctime to %v on volume '%s' inode %v",
			utils.GetFnName(), newAccessTime, vS.volumeName, inodeNumber)
	}

	// Set size, if present in the map
	size, ok := stat[StatSize]
	if ok {
		err = vS.inodeVolumeHandle.SetSize(inodeNumber, size)
		if err != nil {
			logger.ErrorWithError(err)
			return err
		}
	}

	return
}
3823 logger.ErrorWithError(err) 3824 return 3825 } 3826 3827 // Set crtime, if present in the map 3828 crtime, ok := stat[StatCRTime] 3829 if ok { 3830 newCreationTime := time.Unix(0, int64(crtime)) 3831 err = vS.inodeVolumeHandle.SetCreationTime(inodeNumber, newCreationTime) 3832 if err != nil { 3833 logger.ErrorWithError(err) 3834 return err 3835 } 3836 } 3837 3838 // Set mtime, if present in the map 3839 mtime, ok := stat[StatMTime] 3840 if ok { 3841 newModificationTime := time.Unix(0, int64(mtime)) 3842 err = vS.inodeVolumeHandle.SetModificationTime(inodeNumber, newModificationTime) 3843 if err != nil { 3844 logger.ErrorWithError(err) 3845 return err 3846 } 3847 } 3848 3849 // Set atime, if present in the map 3850 atime, ok := stat[StatATime] 3851 if ok { 3852 newAccessTime := time.Unix(0, int64(atime)) 3853 err = vS.inodeVolumeHandle.SetAccessTime(inodeNumber, newAccessTime) 3854 if err != nil { 3855 logger.ErrorWithError(err) 3856 return err 3857 } 3858 } 3859 3860 // ctime is used to reliably determine whether the contents of a file 3861 // have changed so it cannot be altered by a client (some security 3862 // software depends on this) 3863 ctime, ok := stat[StatCTime] 3864 if ok { 3865 newAccessTime := time.Unix(0, int64(ctime)) 3866 logger.Infof("%s: ignoring attempt to change ctime to %v on volume '%s' inode %v", 3867 utils.GetFnName(), newAccessTime, vS.volumeName, inodeNumber) 3868 } 3869 3870 // Set size, if present in the map 3871 size, ok := stat[StatSize] 3872 if ok { 3873 err = vS.inodeVolumeHandle.SetSize(inodeNumber, size) 3874 if err != nil { 3875 logger.ErrorWithError(err) 3876 return err 3877 } 3878 } 3879 3880 return 3881 } 3882 3883 func (vS *volumeStruct) SetXAttr(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, inodeNumber inode.InodeNumber, streamName string, value []byte, flags int) (err error) { 3884 startTime := time.Now() 3885 defer func() { 3886 
globals.SetXAttrUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 3887 if err != nil { 3888 globals.SetXAttrErrors.Add(1) 3889 } 3890 }() 3891 3892 vS.jobRWMutex.RLock() 3893 defer vS.jobRWMutex.RUnlock() 3894 3895 inodeLock, err := vS.inodeVolumeHandle.InitInodeLock(inodeNumber, nil) 3896 if err != nil { 3897 return 3898 } 3899 err = inodeLock.WriteLock() 3900 if err != nil { 3901 return 3902 } 3903 defer inodeLock.Unlock() 3904 3905 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.F_OK, 3906 inode.NoOverride) { 3907 err = blunder.NewError(blunder.NotFoundError, "ENOENT") 3908 return 3909 } 3910 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.W_OK, 3911 inode.OwnerOverride) { 3912 err = blunder.NewError(blunder.PermDeniedError, "EACCES") 3913 return 3914 } 3915 3916 switch flags { 3917 case SetXAttrCreateOrReplace: 3918 break 3919 case SetXAttrCreate: 3920 _, err = vS.GetXAttr(userID, groupID, otherGroupIDs, inodeNumber, streamName) 3921 if err == nil { 3922 return blunder.AddError(err, blunder.FileExistsError) 3923 } 3924 case SetXAttrReplace: 3925 _, err = vS.GetXAttr(userID, groupID, otherGroupIDs, inodeNumber, streamName) 3926 if err != nil { 3927 return blunder.AddError(err, blunder.StreamNotFound) 3928 } 3929 default: 3930 return blunder.AddError(err, blunder.InvalidArgError) 3931 } 3932 3933 err = vS.inodeVolumeHandle.PutStream(inodeNumber, streamName, value) 3934 if err != nil { 3935 logger.ErrorfWithError(err, "Failed to set XAttr %v to inode %v", streamName, inodeNumber) 3936 } 3937 3938 vS.untrackInFlightFileInodeData(inodeNumber, false) 3939 3940 return 3941 } 3942 3943 func (vS *volumeStruct) StatVfs() (statVFS StatVFS, err error) { 3944 startTime := time.Now() 3945 defer func() { 3946 globals.StatVfsUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 3947 if err != nil { 3948 globals.StatVfsErrors.Add(1) 3949 } 3950 }() 3951 3952 vS.jobRWMutex.RLock() 3953 defer 
vS.jobRWMutex.RUnlock() 3954 3955 statVFS = make(map[StatVFSKey]uint64) 3956 3957 statVFS[StatVFSFilesystemID] = vS.inodeVolumeHandle.GetFSID() 3958 statVFS[StatVFSBlockSize] = vS.reportedBlockSize 3959 statVFS[StatVFSFragmentSize] = vS.reportedFragmentSize 3960 statVFS[StatVFSTotalBlocks] = vS.reportedNumBlocks 3961 statVFS[StatVFSFreeBlocks] = vS.reportedNumBlocks 3962 statVFS[StatVFSAvailBlocks] = vS.reportedNumBlocks 3963 statVFS[StatVFSTotalInodes] = vS.reportedNumInodes 3964 statVFS[StatVFSFreeInodes] = vS.reportedNumInodes 3965 statVFS[StatVFSAvailInodes] = vS.reportedNumInodes 3966 statVFS[StatVFSMountFlags] = 0 3967 statVFS[StatVFSMaxFilenameLen] = FileNameMax 3968 3969 return statVFS, nil 3970 } 3971 3972 func (vS *volumeStruct) Symlink(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, inodeNumber inode.InodeNumber, basename string, target string) (symlinkInodeNumber inode.InodeNumber, err error) { 3973 startTime := time.Now() 3974 defer func() { 3975 globals.SymlinkUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 3976 if err != nil { 3977 globals.SymlinkErrors.Add(1) 3978 } 3979 }() 3980 3981 vS.jobRWMutex.RLock() 3982 defer vS.jobRWMutex.RUnlock() 3983 3984 err = validateBaseName(basename) 3985 if err != nil { 3986 return 3987 } 3988 3989 err = validateFullPath(target) 3990 if err != nil { 3991 return 3992 } 3993 3994 // Mode for symlinks defaults to rwxrwxrwx, i.e. 
inode.PosixModePerm 3995 symlinkInodeNumber, err = vS.inodeVolumeHandle.CreateSymlink(target, inode.PosixModePerm, userID, groupID) 3996 if err != nil { 3997 return 3998 } 3999 4000 inodeLock, err := vS.inodeVolumeHandle.InitInodeLock(inodeNumber, nil) 4001 if err != nil { 4002 return 4003 } 4004 err = inodeLock.WriteLock() 4005 if err != nil { 4006 return 4007 } 4008 defer inodeLock.Unlock() 4009 4010 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.F_OK, 4011 inode.NoOverride) { 4012 4013 destroyErr := vS.inodeVolumeHandle.Destroy(symlinkInodeNumber) 4014 if destroyErr != nil { 4015 logger.WarnfWithError(destroyErr, "couldn't destroy inode %v after failed Access(F_OK) in fs.Symlink", symlinkInodeNumber) 4016 } 4017 err = blunder.NewError(blunder.NotFoundError, "ENOENT") 4018 return 4019 } 4020 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.W_OK|inode.X_OK, 4021 inode.NoOverride) { 4022 4023 destroyErr := vS.inodeVolumeHandle.Destroy(symlinkInodeNumber) 4024 if destroyErr != nil { 4025 logger.WarnfWithError(destroyErr, "couldn't destroy inode %v after failed Access(W_OK|X_OK) in fs.Symlink", symlinkInodeNumber) 4026 } 4027 err = blunder.NewError(blunder.PermDeniedError, "EACCES") 4028 return 4029 } 4030 4031 err = vS.inodeVolumeHandle.Link(inodeNumber, basename, symlinkInodeNumber, false) 4032 if err != nil { 4033 destroyErr := vS.inodeVolumeHandle.Destroy(symlinkInodeNumber) 4034 if destroyErr != nil { 4035 logger.WarnfWithError(destroyErr, "couldn't destroy inode %v after failed Link() in fs.Symlink", symlinkInodeNumber) 4036 } 4037 return 4038 } 4039 4040 return 4041 } 4042 4043 func (vS *volumeStruct) Unlink(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, inodeNumber inode.InodeNumber, basename string) (err error) { 4044 startTime := time.Now() 4045 defer func() { 4046 globals.UnlinkUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 4047 if 
err != nil { 4048 globals.UnlinkErrors.Add(1) 4049 } 4050 }() 4051 4052 vS.jobRWMutex.RLock() 4053 defer vS.jobRWMutex.RUnlock() 4054 4055 callerID := dlm.GenerateCallerID() 4056 inodeLock, err := vS.inodeVolumeHandle.InitInodeLock(inodeNumber, callerID) 4057 if err != nil { 4058 return 4059 } 4060 err = inodeLock.WriteLock() 4061 if err != nil { 4062 return 4063 } 4064 defer inodeLock.Unlock() 4065 4066 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.F_OK, 4067 inode.NoOverride) { 4068 err = blunder.NewError(blunder.NotFoundError, "ENOENT") 4069 return 4070 } 4071 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.W_OK|inode.X_OK, 4072 inode.NoOverride) { 4073 err = blunder.NewError(blunder.PermDeniedError, "EACCES") 4074 return 4075 } 4076 4077 basenameInodeNumber, err := vS.inodeVolumeHandle.Lookup(inodeNumber, basename) 4078 if nil != err { 4079 return 4080 } 4081 4082 basenameInodeLock, err := vS.inodeVolumeHandle.InitInodeLock(basenameInodeNumber, callerID) 4083 if err != nil { 4084 return 4085 } 4086 err = basenameInodeLock.WriteLock() 4087 if err != nil { 4088 return 4089 } 4090 defer basenameInodeLock.Unlock() 4091 4092 err = vS.unlinkActual(inodeNumber, basename, basenameInodeNumber) 4093 return 4094 } 4095 4096 func (vS *volumeStruct) unlinkActual(inodeNumber inode.InodeNumber, 4097 basename string, basenameInodeNumber inode.InodeNumber) (err error) { 4098 4099 basenameInodeType, err := vS.inodeVolumeHandle.GetType(basenameInodeNumber) 4100 if nil != err { 4101 return 4102 } 4103 4104 if inode.DirType == basenameInodeType { 4105 err = fmt.Errorf("Unlink() called on a Directory") 4106 err = blunder.AddError(err, blunder.IsDirError) 4107 return 4108 } 4109 4110 err = vS.inodeVolumeHandle.Unlink(inodeNumber, basename, false) 4111 if nil != err { 4112 return 4113 } 4114 4115 basenameLinkCount, err := vS.inodeVolumeHandle.GetLinkCount(basenameInodeNumber) 4116 if nil != err { 4117 return 4118 
} 4119 4120 if 0 == basenameLinkCount { 4121 vS.untrackInFlightFileInodeData(basenameInodeNumber, false) 4122 err = vS.inodeVolumeHandle.Destroy(basenameInodeNumber) 4123 if nil != err { 4124 return 4125 } 4126 } 4127 4128 return 4129 } 4130 4131 func (vS *volumeStruct) VolumeName() (volumeName string) { 4132 startTime := time.Now() 4133 4134 volumeName = vS.volumeName 4135 globals.VolumeNameUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 4136 return 4137 } 4138 4139 func (vS *volumeStruct) Write(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, inodeNumber inode.InodeNumber, offset uint64, buf []byte, profiler *utils.Profiler) (size uint64, err error) { 4140 startTime := time.Now() 4141 defer func() { 4142 globals.WriteUsec.Add(uint64(time.Since(startTime) / time.Microsecond)) 4143 globals.WriteBytes.Add(size) 4144 if err != nil { 4145 globals.WriteErrors.Add(1) 4146 } 4147 }() 4148 4149 vS.jobRWMutex.RLock() 4150 defer vS.jobRWMutex.RUnlock() 4151 4152 logger.Tracef("fs.Write(): starting volume '%s' inode %v offset %v len %v", 4153 vS.volumeName, inodeNumber, offset, len(buf)) 4154 4155 inodeLock, err := vS.inodeVolumeHandle.InitInodeLock(inodeNumber, nil) 4156 if err != nil { 4157 return 4158 } 4159 err = inodeLock.WriteLock() 4160 if err != nil { 4161 return 4162 } 4163 defer inodeLock.Unlock() 4164 4165 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.F_OK, 4166 inode.NoOverride) { 4167 err = blunder.NewError(blunder.NotFoundError, "ENOENT") 4168 return 4169 } 4170 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.W_OK, 4171 inode.OwnerOverride) { 4172 err = blunder.NewError(blunder.PermDeniedError, "EACCES") 4173 return 4174 } 4175 4176 profiler.AddEventNow("before inode.Write()") 4177 err = vS.inodeVolumeHandle.Write(inodeNumber, offset, buf, profiler) 4178 profiler.AddEventNow("after inode.Write()") 4179 // write to Swift presumably succeeds 
or fails as a whole 4180 if err != nil { 4181 return 0, err 4182 } 4183 4184 logger.Tracef("fs.Write(): tracking write volume '%s' inode %v", vS.volumeName, inodeNumber) 4185 vS.trackInFlightFileInodeData(inodeNumber) 4186 size = uint64(len(buf)) 4187 4188 return 4189 } 4190 4191 func (vS *volumeStruct) Wrote(userID inode.InodeUserID, groupID inode.InodeGroupID, otherGroupIDs []inode.InodeGroupID, inodeNumber inode.InodeNumber, containerName string, objectName string, fileOffset []uint64, objectOffset []uint64, length []uint64) (err error) { 4192 vS.jobRWMutex.RLock() 4193 defer vS.jobRWMutex.RUnlock() 4194 4195 inodeLock, err := vS.inodeVolumeHandle.InitInodeLock(inodeNumber, nil) 4196 if err != nil { 4197 return 4198 } 4199 err = inodeLock.WriteLock() 4200 if err != nil { 4201 return 4202 } 4203 defer inodeLock.Unlock() 4204 4205 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.F_OK, 4206 inode.NoOverride) { 4207 err = blunder.NewError(blunder.NotFoundError, "ENOENT") 4208 return 4209 } 4210 if !vS.inodeVolumeHandle.Access(inodeNumber, userID, groupID, otherGroupIDs, inode.W_OK, 4211 inode.OwnerOverride) { 4212 err = blunder.NewError(blunder.PermDeniedError, "EACCES") 4213 return 4214 } 4215 4216 err = vS.inodeVolumeHandle.Flush(inodeNumber, false) 4217 vS.untrackInFlightFileInodeData(inodeNumber, false) 4218 4219 err = vS.inodeVolumeHandle.Wrote(inodeNumber, containerName, objectName, fileOffset, objectOffset, length, true) 4220 4221 return // err, as set by inode.Wrote(), is sufficient 4222 } 4223 4224 func validateBaseName(baseName string) (err error) { 4225 // Make sure the file baseName is not too long 4226 baseLen := len(baseName) 4227 if baseLen > FileNameMax { 4228 err = fmt.Errorf("%s: basename is too long. 
Length %v, max %v", utils.GetFnName(), baseLen, FileNameMax) 4229 logger.ErrorWithError(err) 4230 return blunder.AddError(err, blunder.NameTooLongError) 4231 } 4232 return 4233 } 4234 4235 func validateFullPath(fullPath string) (err error) { 4236 pathLen := len(fullPath) 4237 if pathLen > FilePathMax { 4238 err = fmt.Errorf("%s: fullpath is too long. Length %v, max %v", utils.GetFnName(), pathLen, FilePathMax) 4239 logger.ErrorWithError(err) 4240 return blunder.AddError(err, blunder.NameTooLongError) 4241 } 4242 return 4243 } 4244 4245 func revSplitPath(fullpath string) []string { 4246 // TrimPrefix avoids empty [0] element in pathSegments 4247 trimmed := strings.TrimPrefix(fullpath, "/") 4248 if trimmed == "" { 4249 // path.Clean("") = ".", which is not useful 4250 return []string{} 4251 } 4252 4253 segments := strings.Split(path.Clean(trimmed), "/") 4254 slen := len(segments) 4255 for i := 0; i < slen/2; i++ { 4256 segments[i], segments[slen-i-1] = segments[slen-i-1], segments[i] 4257 } 4258 return segments 4259 } 4260 4261 // Utility function to unlink, but not destroy, a particular file or empty subdirectory. 4262 // 4263 // This function checks that the directory is empty. 4264 // 4265 // The caller of this function must hold appropriate locks. 4266 // 4267 // obstacleInodeNumber must refer to an existing file or directory 4268 // that is (a) already part of the directory tree and (b) not the root 4269 // directory. 
4270 func (vS *volumeStruct) removeObstacleToObjectPut(callerID dlm.CallerID, dirInodeNumber inode.InodeNumber, obstacleName string, obstacleInodeNumber inode.InodeNumber) error { 4271 statResult, err := vS.getstatHelper(obstacleInodeNumber, callerID) 4272 if err != nil { 4273 return err 4274 } 4275 4276 fileType := inode.InodeType(statResult[StatFType]) 4277 if fileType == inode.FileType || fileType == inode.SymlinkType { 4278 // Files and symlinks can always, barring errors, be unlinked 4279 err = vS.inodeVolumeHandle.Unlink(dirInodeNumber, obstacleName, false) 4280 if err != nil { 4281 return err 4282 } 4283 } else if fileType == inode.DirType { 4284 numEntries, err := vS.inodeVolumeHandle.NumDirEntries(obstacleInodeNumber) 4285 if err != nil { 4286 return err 4287 } 4288 if numEntries >= 3 { 4289 // We're looking at a pre-existing, user-visible directory 4290 // that's linked into the directory structure, so we've 4291 // got at least two entries, namely "." and ".." 4292 // 4293 // If there's a third, then the directory is non-empty. 4294 return blunder.NewError(blunder.NotEmptyError, "%s is a non-empty directory", obstacleName) 4295 4296 } else { 4297 // We don't want to call Rmdir() here since 4298 // that function (a) grabs locks, (b) checks 4299 // that it's a directory and is empty, then 4300 // (c) calls Unlink() and Destroy(). 4301 // 4302 // We already have the locks and we've already 4303 // checked that it's empty, so let's just get 4304 // down to it. 
4305 err = vS.inodeVolumeHandle.Unlink(dirInodeNumber, obstacleName, false) 4306 if err != nil { 4307 return err 4308 } 4309 } 4310 } 4311 return nil 4312 } 4313 4314 // Utility function to append entries to reply 4315 func appendReadPlanEntries(readPlan []inode.ReadPlanStep, readRangeOut *[]inode.ReadPlanStep) (numEntries uint64) { 4316 for i := range readPlan { 4317 entry := inode.ReadPlanStep{ObjectPath: readPlan[i].ObjectPath, Offset: readPlan[i].Offset, Length: readPlan[i].Length} 4318 *readRangeOut = append(*readRangeOut, entry) 4319 numEntries++ 4320 } 4321 return 4322 }