github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/fuse/dev.go (about) 1 // Copyright 2020 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package fuse 16 17 import ( 18 "golang.org/x/sys/unix" 19 "github.com/metacubex/gvisor/pkg/abi/linux" 20 "github.com/metacubex/gvisor/pkg/context" 21 "github.com/metacubex/gvisor/pkg/errors/linuxerr" 22 "github.com/metacubex/gvisor/pkg/sentry/kernel" 23 "github.com/metacubex/gvisor/pkg/sentry/kernel/auth" 24 "github.com/metacubex/gvisor/pkg/sentry/vfs" 25 "github.com/metacubex/gvisor/pkg/sync" 26 "github.com/metacubex/gvisor/pkg/usermem" 27 "github.com/metacubex/gvisor/pkg/waiter" 28 ) 29 30 const fuseDevMinor = 229 31 32 // This is equivalent to linux.SizeOfFUSEHeaderIn 33 const fuseHeaderOutSize = 16 34 35 // fuseDevice implements vfs.Device for /dev/fuse. 36 // 37 // +stateify savable 38 type fuseDevice struct{} 39 40 // Open implements vfs.Device.Open. 41 func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { 42 var fd DeviceFD 43 if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ 44 UseDentryMetadata: true, 45 }); err != nil { 46 return nil, err 47 } 48 return &fd.vfsfd, nil 49 } 50 51 // DeviceFD implements vfs.FileDescriptionImpl for /dev/fuse. 
//
// A DeviceFD is the endpoint the userspace FUSE server reads requests from
// and writes replies to. All request bookkeeping below is guarded by mu.
//
// +stateify savable
type DeviceFD struct {
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.DentryMetadataFileDescriptionImpl
	vfs.NoLockFD

	// waitQueue is used to notify interested parties when the device becomes
	// readable or writable.
	waitQueue waiter.Queue

	// fullQueueCh is a channel used to synchronize the readers with the writers.
	// Writers (inbound requests to the filesystem) block if there are too many
	// unprocessed in-flight requests. sendResponse performs a non-blocking
	// send on it each time a response is delivered.
	// NOTE(review): the state tag suggests the channel is saved as an int by
	// custom save/load hooks defined elsewhere — confirm.
	fullQueueCh chan struct{} `state:".(int)"`

	// mu protects all the queues, maps, buffers and cursors and nextOpID.
	mu sync.Mutex `state:"nosave"`

	// nextOpID is used to create new requests.
	// +checklocks:mu
	nextOpID linux.FUSEOpID

	// queue is the list of requests that need to be processed by the FUSE server.
	// Read hands out the request at the front of this list.
	// +checklocks:mu
	queue requestList

	// numActiveRequests is the number of requests made by the Sentry that have
	// yet to be responded to. It is decremented when a response is delivered
	// and when Read hands out a request that expects no reply.
	// +checklocks:mu
	numActiveRequests uint64

	// completions is used to map a request to its response. A Writer will use this
	// to notify the caller of a completed response. Entries are keyed by the
	// request's unique ID and are removed once a reply (or error) is matched.
	// +checklocks:mu
	completions map[linux.FUSEOpID]*futureResponse

	// writeBuf is the memory buffer used to copy in the FUSE out header from
	// userspace.
	// +checklocks:mu
	writeBuf [fuseHeaderOutSize]byte

	// conn is the FUSE connection that this FD is being used for. It is nil
	// when no connection is attached; Release resets it to nil.
	// +checklocks:mu
	conn *connection
}

// Release implements vfs.FileDescriptionImpl.Release.
101 func (fd *DeviceFD) Release(ctx context.Context) { 102 fd.mu.Lock() 103 defer fd.mu.Unlock() 104 if fd.conn != nil { 105 fd.conn.mu.Lock() 106 fd.conn.connected = false 107 fd.conn.mu.Unlock() 108 109 fd.conn.Abort(ctx) // +checklocksforce: fd.conn.fd.mu=fd.mu 110 fd.waitQueue.Notify(waiter.ReadableEvents) 111 fd.conn = nil 112 } 113 } 114 115 // connected returns true if fd.conn is set and the connection has not been 116 // aborted. 117 // +checklocks:fd.mu 118 func (fd *DeviceFD) connected() bool { 119 if fd.conn != nil { 120 fd.conn.mu.Lock() 121 defer fd.conn.mu.Unlock() 122 return fd.conn.connected 123 } 124 return false 125 } 126 127 // PRead implements vfs.FileDescriptionImpl.PRead. 128 func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { 129 // Operations on /dev/fuse don't make sense until a FUSE filesystem is 130 // mounted. If there is an active connection we know there is at least one 131 // filesystem mounted. 132 fd.mu.Lock() 133 defer fd.mu.Unlock() 134 if !fd.connected() { 135 return 0, linuxerr.EPERM 136 } 137 138 return 0, linuxerr.ENOSYS 139 } 140 141 // Read implements vfs.FileDescriptionImpl.Read. 142 func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { 143 fd.mu.Lock() 144 defer fd.mu.Unlock() 145 if !fd.connected() { 146 return 0, linuxerr.EPERM 147 } 148 // We require that any Read done on this filesystem have a sane minimum 149 // read buffer. It must have the capacity for the fixed parts of any request 150 // header (Linux uses the request header and the FUSEWriteIn header for this 151 // calculation) + the negotiated MaxWrite room for the data. 
152 minBuffSize := linux.FUSE_MIN_READ_BUFFER 153 fd.conn.mu.Lock() 154 negotiatedMinBuffSize := linux.SizeOfFUSEHeaderIn + linux.SizeOfFUSEHeaderOut + fd.conn.maxWrite 155 fd.conn.mu.Unlock() 156 if minBuffSize < negotiatedMinBuffSize { 157 minBuffSize = negotiatedMinBuffSize 158 } 159 160 // If the read buffer is too small, error out. 161 if dst.NumBytes() < int64(minBuffSize) { 162 return 0, linuxerr.EINVAL 163 } 164 // Find the first valid request. For the normal case this loop only executes 165 // once. 166 var req *Request 167 for req = fd.queue.Front(); !fd.queue.Empty(); req = fd.queue.Front() { 168 if int64(req.hdr.Len) <= dst.NumBytes() { 169 break 170 } 171 // The request is too large so we cannot process it. All requests must be 172 // smaller than the negotiated size as specified by Connection.MaxWrite set 173 // as part of the FUSE_INIT handshake. 174 errno := -int32(unix.EIO) 175 if req.hdr.Opcode == linux.FUSE_SETXATTR { 176 errno = -int32(unix.E2BIG) 177 } 178 179 if err := fd.sendError(ctx, errno, req.hdr.Unique); err != nil { 180 return 0, err 181 } 182 fd.queue.Remove(req) 183 req = nil 184 } 185 if req == nil { 186 return 0, linuxerr.ErrWouldBlock 187 } 188 189 // We already checked the size: dst must be able to fit the whole request. 190 n, err := dst.CopyOut(ctx, req.data) 191 if err != nil { 192 return 0, err 193 } 194 if n != len(req.data) { 195 return 0, linuxerr.EIO 196 } 197 fd.queue.Remove(req) 198 // Remove noReply ones from the map of requests expecting a reply. 199 if req.noReply { 200 fd.numActiveRequests-- 201 delete(fd.completions, req.hdr.Unique) 202 } 203 return int64(n), nil 204 } 205 206 // PWrite implements vfs.FileDescriptionImpl.PWrite. 207 func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { 208 // Operations on /dev/fuse don't make sense until a FUSE filesystem is 209 // mounted. 
If there is an active connection we know there is at least one 210 // filesystem mounted. 211 fd.mu.Lock() 212 defer fd.mu.Unlock() 213 if !fd.connected() { 214 return 0, linuxerr.EPERM 215 } 216 217 return 0, linuxerr.ENOSYS 218 } 219 220 // Write implements vfs.FileDescriptionImpl.Write. 221 func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { 222 fd.mu.Lock() 223 defer fd.mu.Unlock() 224 if !fd.connected() { 225 return 0, linuxerr.EPERM 226 } 227 228 n, err := src.CopyIn(ctx, fd.writeBuf[:]) 229 if err != nil { 230 return 0, err 231 } 232 var hdr linux.FUSEHeaderOut 233 hdr.UnmarshalBytes(fd.writeBuf[:]) 234 235 fut, ok := fd.completions[hdr.Unique] 236 if !ok { 237 // Server sent us a response for a request we never sent, or for which we 238 // already received a reply (e.g. aborted), an unlikely event. 239 return 0, linuxerr.EINVAL 240 } 241 delete(fd.completions, hdr.Unique) 242 243 // Copy over the header into the future response. The rest of the payload 244 // will be copied over to the FR's data in the next iteration. 245 fut.hdr = &hdr 246 fut.data = make([]byte, fut.hdr.Len) 247 copy(fut.data, fd.writeBuf[:]) 248 if fut.hdr.Len > uint32(len(fd.writeBuf)) { 249 src = src.DropFirst(len(fd.writeBuf)) 250 n2, err := src.CopyIn(ctx, fut.data[len(fd.writeBuf):]) 251 if err != nil { 252 return 0, err 253 } 254 n += n2 255 } 256 if err := fd.sendResponse(ctx, fut); err != nil { 257 return 0, err 258 } 259 return int64(n), nil 260 } 261 262 // Readiness implements vfs.FileDescriptionImpl.Readiness. 263 func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask { 264 fd.mu.Lock() 265 defer fd.mu.Unlock() 266 var ready waiter.EventMask 267 268 if !fd.connected() { 269 ready |= waiter.EventErr 270 return ready & mask 271 } 272 273 // FD is always writable. 274 ready |= waiter.WritableEvents 275 if !fd.queue.Empty() { 276 // Have reqs available, FD is readable. 
277 ready |= waiter.ReadableEvents 278 } 279 280 return ready & mask 281 } 282 283 // EventRegister implements waiter.Waitable.EventRegister. 284 func (fd *DeviceFD) EventRegister(e *waiter.Entry) error { 285 fd.mu.Lock() 286 defer fd.mu.Unlock() 287 fd.waitQueue.EventRegister(e) 288 return nil 289 } 290 291 // EventUnregister implements waiter.Waitable.EventUnregister. 292 func (fd *DeviceFD) EventUnregister(e *waiter.Entry) { 293 fd.mu.Lock() 294 defer fd.mu.Unlock() 295 fd.waitQueue.EventUnregister(e) 296 } 297 298 // Epollable implements FileDescriptionImpl.Epollable. 299 func (fd *DeviceFD) Epollable() bool { 300 return true 301 } 302 303 // Seek implements vfs.FileDescriptionImpl.Seek. 304 func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { 305 // Operations on /dev/fuse don't make sense until a FUSE filesystem is 306 // mounted. If there is an active connection we know there is at least one 307 // filesystem mounted. 308 fd.mu.Lock() 309 defer fd.mu.Unlock() 310 if !fd.connected() { 311 return 0, linuxerr.EPERM 312 } 313 314 return 0, linuxerr.ENOSYS 315 } 316 317 // sendResponse sends a response to the waiting task (if any). 318 // 319 // +checklocks:fd.mu 320 func (fd *DeviceFD) sendResponse(ctx context.Context, fut *futureResponse) error { 321 // Signal the task waiting on a response if any. 322 defer close(fut.ch) 323 324 // Signal that the queue is no longer full. 325 select { 326 case fd.fullQueueCh <- struct{}{}: 327 default: 328 } 329 fd.numActiveRequests-- 330 331 if fut.async { 332 return fd.asyncCallBack(ctx, fut.getResponse()) 333 } 334 335 return nil 336 } 337 338 // sendError sends an error response to the waiting task (if any) by calling sendResponse(). 339 // 340 // +checklocks:fd.mu 341 func (fd *DeviceFD) sendError(ctx context.Context, errno int32, unique linux.FUSEOpID) error { 342 // Return the error to the calling task. 
343 respHdr := linux.FUSEHeaderOut{ 344 Len: linux.SizeOfFUSEHeaderOut, 345 Error: errno, 346 Unique: unique, 347 } 348 349 fut, ok := fd.completions[respHdr.Unique] 350 if !ok { 351 // A response for a request we never sent, 352 // or for which we already received a reply (e.g. aborted). 353 return linuxerr.EINVAL 354 } 355 delete(fd.completions, respHdr.Unique) 356 357 fut.hdr = &respHdr 358 return fd.sendResponse(ctx, fut) 359 } 360 361 // asyncCallBack executes pre-defined callback function for async requests. 362 // Currently used by: FUSE_INIT. 363 // +checklocks:fd.mu 364 func (fd *DeviceFD) asyncCallBack(ctx context.Context, r *Response) error { 365 switch r.opcode { 366 case linux.FUSE_INIT: 367 creds := auth.CredentialsFromContext(ctx) 368 rootUserNs := kernel.KernelFromContext(ctx).RootUserNamespace() 369 return fd.conn.InitRecv(r, creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, rootUserNs)) 370 // TODO(gvisor.dev/issue/3247): support async read: correctly process the response. 371 } 372 373 return nil 374 }