github.com/scaleoutsean/fusego@v0.0.0-20220224074057-4a6429e46bb8/connection.go (about) 1 // Copyright 2015 Google Inc. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package fuse 16 17 import ( 18 "context" 19 "fmt" 20 "io" 21 "log" 22 "os" 23 "path" 24 "runtime" 25 "sync" 26 "syscall" 27 28 "github.com/scaleoutsean/fusego/fuseops" 29 "github.com/scaleoutsean/fusego/internal/buffer" 30 "github.com/scaleoutsean/fusego/internal/freelist" 31 "github.com/scaleoutsean/fusego/internal/fusekernel" 32 ) 33 34 type contextKeyType uint64 35 36 var contextKey interface{} = contextKeyType(0) 37 38 // Ask the Linux kernel for larger read requests. 39 // 40 // As of 2015-03-26, the behavior in the kernel is: 41 // 42 // * (http://goo.gl/bQ1f1i, http://goo.gl/HwBrR6) Set the local variable 43 // ra_pages to be init_response->max_readahead divided by the page size. 44 // 45 // * (http://goo.gl/gcIsSh, http://goo.gl/LKV2vA) Set 46 // backing_dev_info::ra_pages to the min of that value and what was sent 47 // in the request's max_readahead field. 48 // 49 // * (http://goo.gl/u2SqzH) Use backing_dev_info::ra_pages when deciding 50 // how much to read ahead. 51 // 52 // * (http://goo.gl/JnhbdL) Don't read ahead at all if that field is zero. 53 // 54 // Reading a page at a time is a drag. Ask for a larger size. 55 const maxReadahead = 1 << 20 56 57 // Connection represents a connection to the fuse kernel process. It is used to 58 // receive and reply to requests from the kernel. 59 type Connection struct { 60 cfg MountConfig 61 debugLogger *log.Logger 62 errorLogger *log.Logger 63 64 // The device through which we're talking to the kernel, and the protocol 65 // version that we're using to talk to it. 66 dev *os.File 67 protocol fusekernel.Protocol 68 69 mu sync.Mutex 70 71 // A map from fuse "unique" request ID (*not* the op ID for logging used 72 // above) to a function that cancel's its associated context. 73 // 74 // GUARDED_BY(mu) 75 cancelFuncs map[uint64]func() 76 77 // Freelists, serviced by freelists.go. 78 inMessages freelist.Freelist // GUARDED_BY(mu) 79 outMessages freelist.Freelist // GUARDED_BY(mu) 80 } 81 82 // State that is maintained for each in-flight op. This is stuffed into the 83 // context that the user uses to reply to the op. 84 type opState struct { 85 inMsg *buffer.InMessage 86 outMsg *buffer.OutMessage 87 op interface{} 88 } 89 90 // Create a connection wrapping the supplied file descriptor connected to the 91 // kernel. You must eventually call c.close(). 92 // 93 // The loggers may be nil. 94 func newConnection( 95 cfg MountConfig, 96 debugLogger *log.Logger, 97 errorLogger *log.Logger, 98 dev *os.File) (*Connection, error) { 99 c := &Connection{ 100 cfg: cfg, 101 debugLogger: debugLogger, 102 errorLogger: errorLogger, 103 dev: dev, 104 cancelFuncs: make(map[uint64]func()), 105 } 106 107 // Initialize. 108 if err := c.Init(); err != nil { 109 c.close() 110 return nil, fmt.Errorf("Init: %v", err) 111 } 112 113 return c, nil 114 } 115 116 // Init performs the work necessary to cause the mount process to complete. 117 func (c *Connection) Init() error { 118 // Read the init op. 119 ctx, op, err := c.ReadOp() 120 if err != nil { 121 return fmt.Errorf("Reading init op: %v", err) 122 } 123 124 initOp, ok := op.(*initOp) 125 if !ok { 126 c.Reply(ctx, syscall.EPROTO) 127 return fmt.Errorf("Expected *initOp, got %T", op) 128 } 129 130 // Make sure the protocol version spoken by the kernel is new enough. 131 min := fusekernel.Protocol{ 132 fusekernel.ProtoVersionMinMajor, 133 fusekernel.ProtoVersionMinMinor, 134 } 135 136 if initOp.Kernel.LT(min) { 137 c.Reply(ctx, syscall.EPROTO) 138 return fmt.Errorf("Version too old: %v", initOp.Kernel) 139 } 140 141 // Downgrade our protocol if necessary. 142 c.protocol = fusekernel.Protocol{ 143 fusekernel.ProtoVersionMaxMajor, 144 fusekernel.ProtoVersionMaxMinor, 145 } 146 147 if initOp.Kernel.LT(c.protocol) { 148 c.protocol = initOp.Kernel 149 } 150 151 cacheSymlinks := initOp.Flags&fusekernel.InitCacheSymlinks > 0 152 noOpenSupport := initOp.Flags&fusekernel.InitNoOpenSupport > 0 153 noOpendirSupport := initOp.Flags&fusekernel.InitNoOpendirSupport > 0 154 155 // Respond to the init op. 156 initOp.Library = c.protocol 157 initOp.MaxReadahead = maxReadahead 158 initOp.MaxWrite = buffer.MaxWriteSize 159 160 initOp.Flags = 0 161 162 // Tell the kernel not to use pitifully small 4 KiB writes. 163 initOp.Flags |= fusekernel.InitBigWrites 164 // kernel 4.20 increases the max from 32 -> 256 165 initOp.Flags |= fusekernel.InitMaxPages 166 initOp.MaxPages = 256 167 168 // Enable writeback caching if the user hasn't asked us not to. 169 if !c.cfg.DisableWritebackCaching { 170 initOp.Flags |= fusekernel.InitWritebackCache 171 } 172 173 // Enable caching symlink targets in the kernel page cache if the user opted 174 // into it (might require fixing the size field of inode attributes first): 175 if c.cfg.EnableSymlinkCaching && cacheSymlinks { 176 initOp.Flags |= fusekernel.InitCacheSymlinks 177 } 178 179 // Tell the kernel to treat returning -ENOSYS on OpenFile as not needing 180 // OpenFile calls at all (Linux >= 3.16): 181 if c.cfg.EnableNoOpenSupport && noOpenSupport { 182 initOp.Flags |= fusekernel.InitNoOpenSupport 183 } 184 185 // Tell the kernel to treat returning -ENOSYS on OpenDir as not needing 186 // OpenDir calls at all (Linux >= 5.1): 187 if c.cfg.EnableNoOpendirSupport && noOpendirSupport { 188 initOp.Flags |= fusekernel.InitNoOpendirSupport 189 } 190 191 c.Reply(ctx, nil) 192 return nil 193 } 194 195 // Log information for an operation with the given ID. calldepth is the depth 196 // to use when recovering file:line information with runtime.Caller. 197 func (c *Connection) debugLog( 198 fuseID uint64, 199 calldepth int, 200 format string, 201 v ...interface{}) { 202 if c.debugLogger == nil { 203 return 204 } 205 206 // Get file:line info. 207 var file string 208 var line int 209 var ok bool 210 211 _, file, line, ok = runtime.Caller(calldepth) 212 if !ok { 213 file = "???" 214 } 215 216 fileLine := fmt.Sprintf("%v:%v", path.Base(file), line) 217 218 // Format the actual message to be printed. 219 msg := fmt.Sprintf( 220 "Op 0x%08x %24s] %v", 221 fuseID, 222 fileLine, 223 fmt.Sprintf(format, v...)) 224 225 // Print it. 226 c.debugLogger.Println(msg) 227 } 228 229 // LOCKS_EXCLUDED(c.mu) 230 func (c *Connection) recordCancelFunc( 231 fuseID uint64, 232 f func()) { 233 c.mu.Lock() 234 defer c.mu.Unlock() 235 236 if _, ok := c.cancelFuncs[fuseID]; ok { 237 panic(fmt.Sprintf("Already have cancel func for request %v", fuseID)) 238 } 239 240 c.cancelFuncs[fuseID] = f 241 } 242 243 // Set up state for an op that is about to be returned to the user, given its 244 // underlying fuse opcode and request ID. 245 // 246 // Return a context that should be used for the op. 247 // 248 // LOCKS_EXCLUDED(c.mu) 249 func (c *Connection) beginOp( 250 opCode uint32, 251 fuseID uint64) context.Context { 252 // Start with the parent context. 253 ctx := c.cfg.OpContext 254 255 // Set up a cancellation function. 256 // 257 // Special case: On Darwin, osxfuse aggressively reuses "unique" request IDs. 258 // This matters for Forget requests, which have no reply associated and 259 // therefore have IDs that are immediately eligible for reuse. For these, we 260 // should not record any state keyed on their ID. 261 // 262 // Cf. https://github.com/osxfuse/osxfuse/issues/208 263 if opCode != fusekernel.OpForget { 264 var cancel func() 265 ctx, cancel = context.WithCancel(ctx) 266 c.recordCancelFunc(fuseID, cancel) 267 } 268 269 return ctx 270 } 271 272 // Clean up all state associated with an op to which the user has responded, 273 // given its underlying fuse opcode and request ID. This must be called before 274 // a response is sent to the kernel, to avoid a race where the request's ID 275 // might be reused by osxfuse. 276 // 277 // LOCKS_EXCLUDED(c.mu) 278 func (c *Connection) finishOp( 279 opCode uint32, 280 fuseID uint64) { 281 c.mu.Lock() 282 defer c.mu.Unlock() 283 284 // Even though the op is finished, context.WithCancel requires us to arrange 285 // for the cancellation function to be invoked. We also must remove it from 286 // our map. 287 // 288 // Special case: we don't do this for Forget requests. See the note in 289 // beginOp above. 290 if opCode != fusekernel.OpForget { 291 cancel, ok := c.cancelFuncs[fuseID] 292 if !ok { 293 panic(fmt.Sprintf("Unknown request ID in finishOp: %v", fuseID)) 294 } 295 296 cancel() 297 delete(c.cancelFuncs, fuseID) 298 } 299 } 300 301 // LOCKS_EXCLUDED(c.mu) 302 func (c *Connection) handleInterrupt(fuseID uint64) { 303 c.mu.Lock() 304 defer c.mu.Unlock() 305 306 // NOTE(jacobsa): fuse.txt in the Linux kernel documentation 307 // (https://goo.gl/H55Dnr) defines the kernel <-> userspace protocol for 308 // interrupts. 309 // 310 // In particular, my reading of it is that an interrupt request cannot be 311 // delivered to userspace before the original request. The part about the 312 // race and EAGAIN appears to be aimed at userspace programs that 313 // concurrently process requests (cf. http://goo.gl/BES2rs). 314 // 315 // So in this method if we can't find the ID to be interrupted, it means that 316 // the request has already been replied to. 317 // 318 // Cf. https://github.com/osxfuse/osxfuse/issues/208 319 // Cf. http://comments.gmane.org/gmane.comp.file-systems.fuse.devel/14675 320 cancel, ok := c.cancelFuncs[fuseID] 321 if !ok { 322 return 323 } 324 325 cancel() 326 } 327 328 // Read the next message from the kernel. The message must later be destroyed 329 // using destroyInMessage. 330 func (c *Connection) readMessage() (*buffer.InMessage, error) { 331 // Allocate a message. 332 m := c.getInMessage() 333 334 // Loop past transient errors. 335 for { 336 // Attempt a reaed. 337 err := m.Init(c.dev) 338 339 // Special cases: 340 // 341 // * ENODEV means fuse has hung up. 342 // 343 // * EINTR means we should try again. (This seems to happen often on 344 // OS X, cf. http://golang.org/issue/11180) 345 // 346 if pe, ok := err.(*os.PathError); ok { 347 switch pe.Err { 348 case syscall.ENODEV: 349 err = io.EOF 350 351 case syscall.EINTR: 352 err = nil 353 continue 354 } 355 } 356 357 if err != nil { 358 c.putInMessage(m) 359 return nil, err 360 } 361 362 return m, nil 363 } 364 } 365 366 // Write the supplied message to the kernel. 367 func (c *Connection) writeMessage(msg []byte) error { 368 // Avoid the retry loop in os.File.Write. 369 n, err := syscall.Write(int(c.dev.Fd()), msg) 370 if err != nil { 371 return err 372 } 373 374 if n != len(msg) { 375 return fmt.Errorf("Wrote %d bytes; expected %d", n, len(msg)) 376 } 377 378 return nil 379 } 380 381 // ReadOp consumes the next op from the kernel process, returning the op and a 382 // context that should be used for work related to the op. It returns io.EOF if 383 // the kernel has closed the connection. 384 // 385 // If err != nil, the user is responsible for later calling c.Reply with the 386 // returned context. 387 // 388 // This function delivers ops in exactly the order they are received from 389 // /dev/fuse. It must not be called multiple times concurrently. 390 // 391 // LOCKS_EXCLUDED(c.mu) 392 func (c *Connection) ReadOp() (_ context.Context, op interface{}, _ error) { 393 // Keep going until we find a request we know how to convert. 394 for { 395 // Read the next message from the kernel. 396 inMsg, err := c.readMessage() 397 if err != nil { 398 return nil, nil, err 399 } 400 401 // Convert the message to an op. 402 outMsg := c.getOutMessage() 403 op, err = convertInMessage(inMsg, outMsg, c.protocol) 404 if err != nil { 405 c.putOutMessage(outMsg) 406 return nil, nil, fmt.Errorf("convertInMessage: %v", err) 407 } 408 409 // Choose an ID for this operation for the purposes of logging, and log it. 410 if c.debugLogger != nil { 411 c.debugLog(inMsg.Header().Unique, 1, "<- %s", describeRequest(op)) 412 } 413 414 // Special case: handle interrupt requests inline. 415 if interruptOp, ok := op.(*interruptOp); ok { 416 c.handleInterrupt(interruptOp.FuseID) 417 continue 418 } 419 420 // Set up a context that remembers information about this op. 421 ctx := c.beginOp(inMsg.Header().Opcode, inMsg.Header().Unique) 422 ctx = context.WithValue(ctx, contextKey, opState{inMsg, outMsg, op}) 423 424 // Return the op to the user. 425 return ctx, op, nil 426 } 427 } 428 429 // Skip errors that happen as a matter of course, since they spook users. 430 func (c *Connection) shouldLogError( 431 op interface{}, 432 err error) bool { 433 // We don't log non-errors. 434 if err == nil { 435 return false 436 } 437 438 // We can't log if there's nothing to log to. 439 if c.errorLogger == nil { 440 return false 441 } 442 443 switch op.(type) { 444 case *fuseops.LookUpInodeOp: 445 // It is totally normal for the kernel to ask to look up an inode by name 446 // and find the name doesn't exist. For example, this happens when linking 447 // a new file. 448 if err == syscall.ENOENT { 449 return false 450 } 451 452 case *fuseops.GetXattrOp: 453 if err == syscall.ENODATA || err == syscall.ERANGE { 454 return false 455 } 456 case *unknownOp: 457 // Don't bother the user with methods we intentionally don't support. 458 if err == syscall.ENOSYS { 459 return false 460 } 461 } 462 463 return true 464 } 465 466 // Reply replies to an op previously read using ReadOp, with the supplied error 467 // (or nil if successful). The context must be the context returned by ReadOp. 468 // 469 // LOCKS_EXCLUDED(c.mu) 470 func (c *Connection) Reply(ctx context.Context, opErr error) { 471 // Extract the state we stuffed in earlier. 472 var key interface{} = contextKey 473 foo := ctx.Value(key) 474 state, ok := foo.(opState) 475 if !ok { 476 panic(fmt.Sprintf("Reply called with invalid context: %#v", ctx)) 477 } 478 479 op := state.op 480 inMsg := state.inMsg 481 outMsg := state.outMsg 482 fuseID := inMsg.Header().Unique 483 484 // Make sure we destroy the messages when we're done. 485 defer c.putInMessage(inMsg) 486 defer c.putOutMessage(outMsg) 487 488 // Clean up state for this op. 489 c.finishOp(inMsg.Header().Opcode, inMsg.Header().Unique) 490 491 // Debug logging 492 if c.debugLogger != nil { 493 if opErr == nil { 494 c.debugLog(fuseID, 1, "-> OK (%s)", describeResponse(op)) 495 } else { 496 c.debugLog(fuseID, 1, "-> Error: %q", opErr.Error()) 497 } 498 } 499 500 // Error logging 501 if c.shouldLogError(op, opErr) { 502 c.errorLogger.Printf("%T error: %v", op, opErr) 503 } 504 505 // Send the reply to the kernel, if one is required. 506 noResponse := c.kernelResponse(outMsg, inMsg.Header().Unique, op, opErr) 507 508 if !noResponse { 509 err := c.writeMessage(outMsg.Bytes()) 510 if err != nil && c.errorLogger != nil { 511 c.errorLogger.Printf("writeMessage: %v %v", err, outMsg.Bytes()) 512 } 513 } 514 } 515 516 // Close the connection. Must not be called until operations that were read 517 // from the connection have been responded to. 518 func (c *Connection) close() error { 519 // Posix doesn't say that close can be called concurrently with read or 520 // write, but luckily we exclude the possibility of a race by requiring the 521 // user to respond to all ops first. 522 return c.dev.Close() 523 }