github.com/pawelgaczynski/giouring@v0.0.0-20230826085535-69588b89acb9/setup.go (about) 1 // MIT License 2 // 3 // Copyright (c) 2023 Paweł Gaczyński 4 // 5 // Permission is hereby granted, free of charge, to any person obtaining a 6 // copy of this software and associated documentation files (the 7 // "Software"), to deal in the Software without restriction, including 8 // without limitation the rights to use, copy, modify, merge, publish, 9 // distribute, sublicense, and/or sell copies of the Software, and to 10 // permit persons to whom the Software is furnished to do so, subject to 11 // the following conditions: 12 // 13 // The above copyright notice and this permission notice shall be included 14 // in all copies or substantial portions of the Software. 15 // 16 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 17 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 // IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 // CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 // TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 // SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 24 package giouring 25 26 import ( 27 "math/bits" 28 "os" 29 "syscall" 30 "unsafe" 31 ) 32 33 const ( 34 kernMaxEntries = 32768 35 kernMaxCQEntries = 2 * kernMaxEntries 36 ) 37 38 func fls(x int) int { 39 if x == 0 { 40 return 0 41 } 42 43 return 8*int(unsafe.Sizeof(x)) - bits.LeadingZeros32(uint32(x)) 44 } 45 46 func roundupPow2(depth uint32) uint32 { 47 return 1 << uint32(fls(int(depth-1))) 48 } 49 50 const cqEntriesMultiplier = 2 51 52 // liburing: get_sq_cq_entries 53 func getSqCqEntries(entries uint32, p *Params, sq, cq *uint32) error { 54 var cqEntries uint32 55 56 if entries == 0 { 57 return syscall.EINVAL 58 } 59 if entries > kernMaxEntries { 60 if p.flags&SetupClamp == 0 { 61 return syscall.EINVAL 62 } 63 entries = kernMaxEntries 64 } 65 66 entries = roundupPow2(entries) 67 if p.flags&SetupCQSize != 0 { 68 if p.cqEntries == 0 { 69 return syscall.EINVAL 70 } 71 cqEntries = p.cqEntries 72 if cqEntries > kernMaxCQEntries { 73 if p.flags&SetupClamp == 0 { 74 return syscall.EINVAL 75 } 76 cqEntries = kernMaxCQEntries 77 } 78 cqEntries = roundupPow2(cqEntries) 79 if cqEntries < entries { 80 return syscall.EINVAL 81 } 82 } else { 83 cqEntries = cqEntriesMultiplier * entries 84 } 85 *sq = entries 86 *cq = cqEntries 87 88 return nil 89 } 90 91 // liburing: io_uring_unmap_rings 92 func UnmapRings(sq *SubmissionQueue, cq *CompletionQueue) { 93 if sq.ringSize > 0 { 94 _ = sysMunmap(uintptr(sq.ringPtr), uintptr(sq.ringSize)) 95 } 96 97 if uintptr(cq.ringPtr) != 0 && cq.ringSize > 0 && cq.ringPtr != sq.ringPtr { 98 _ = sysMunmap(uintptr(cq.ringPtr), uintptr(cq.ringSize)) 99 } 100 } 101 102 // liburing: io_uring_setup_ring_pointers 103 func SetupRingPointers(p *Params, sq *SubmissionQueue, cq *CompletionQueue) { 104 sq.head = (*uint32)(unsafe.Pointer(uintptr(sq.ringPtr) + uintptr(p.sqOff.head))) 105 sq.tail = (*uint32)(unsafe.Pointer(uintptr(sq.ringPtr) + uintptr(p.sqOff.tail))) 106 sq.ringMask = (*uint32)(unsafe.Pointer(uintptr(sq.ringPtr) + uintptr(p.sqOff.ringMask))) 107 sq.ringEntries = (*uint32)(unsafe.Pointer(uintptr(sq.ringPtr) + uintptr(p.sqOff.ringEntries))) 108 sq.flags = (*uint32)(unsafe.Pointer(uintptr(sq.ringPtr) + uintptr(p.sqOff.flags))) 109 sq.dropped = (*uint32)(unsafe.Pointer(uintptr(sq.ringPtr) + uintptr(p.sqOff.dropped))) 110 sq.array = (*uint32)(unsafe.Pointer(uintptr(sq.ringPtr) + uintptr(p.sqOff.array))) 111 112 cq.head = (*uint32)(unsafe.Pointer(uintptr(cq.ringPtr) + uintptr(p.cqOff.head))) 113 cq.tail = (*uint32)(unsafe.Pointer(uintptr(cq.ringPtr) + uintptr(p.cqOff.tail))) 114 cq.ringMask = (*uint32)(unsafe.Pointer(uintptr(cq.ringPtr) + uintptr(p.cqOff.ringMask))) 115 cq.ringEntries = (*uint32)(unsafe.Pointer(uintptr(cq.ringPtr) + uintptr(p.cqOff.ringEntries))) 116 cq.overflow = (*uint32)(unsafe.Pointer(uintptr(cq.ringPtr) + uintptr(p.cqOff.overflow))) 117 cq.cqes = (*CompletionQueueEvent)(unsafe.Pointer(uintptr(cq.ringPtr) + uintptr(p.cqOff.cqes))) 118 if p.cqOff.flags != 0 { 119 cq.flags = (*uint32)(unsafe.Pointer(uintptr(cq.ringPtr) + uintptr(p.cqOff.flags))) 120 } 121 } 122 123 // liburing: io_uring_mmap 124 func Mmap(fd int, p *Params, sq *SubmissionQueue, cq *CompletionQueue) error { 125 var size uintptr 126 var err error 127 128 size = unsafe.Sizeof(CompletionQueueEvent{}) 129 if p.flags&SetupCQE32 != 0 { 130 size += unsafe.Sizeof(CompletionQueueEvent{}) 131 } 132 133 sq.ringSize = uint(uintptr(p.sqOff.array) + uintptr(p.sqEntries)*unsafe.Sizeof(uint32(0))) 134 cq.ringSize = uint(uintptr(p.cqOff.cqes) + uintptr(p.cqEntries)*size) 135 136 if p.features&FeatSingleMMap != 0 { 137 if cq.ringSize > sq.ringSize { 138 sq.ringSize = cq.ringSize 139 } 140 cq.ringSize = sq.ringSize 141 } 142 143 var ringPtr uintptr 144 ringPtr, err = mmap(0, uintptr(sq.ringSize), syscall.PROT_READ|syscall.PROT_WRITE, 145 syscall.MAP_SHARED|syscall.MAP_POPULATE, fd, 146 int64(offsqRing)) 147 if err != nil { 148 return err 149 } 150 sq.ringPtr = unsafe.Pointer(ringPtr) 151 152 if p.features&FeatSingleMMap != 0 { 153 cq.ringPtr = sq.ringPtr 154 } else { 155 ringPtr, err = mmap(0, uintptr(cq.ringSize), syscall.PROT_READ|syscall.PROT_WRITE, 156 syscall.MAP_SHARED|syscall.MAP_POPULATE, fd, 157 int64(offcqRing)) 158 if err != nil { 159 cq.ringPtr = nil 160 161 goto err 162 } 163 cq.ringPtr = unsafe.Pointer(ringPtr) 164 } 165 166 size = unsafe.Sizeof(SubmissionQueueEntry{}) 167 if p.flags&SetupSQE128 != 0 { 168 size += 64 169 } 170 ringPtr, err = mmap(0, size*uintptr(p.sqEntries), syscall.PROT_READ|syscall.PROT_WRITE, 171 syscall.MAP_SHARED|syscall.MAP_POPULATE, fd, int64(offSQEs)) 172 if err != nil { 173 goto err 174 } 175 sq.sqes = (*SubmissionQueueEntry)(unsafe.Pointer(ringPtr)) 176 SetupRingPointers(p, sq, cq) 177 178 return nil 179 180 err: 181 UnmapRings(sq, cq) 182 183 return err 184 } 185 186 // liburing: io_uring_queue_mmap 187 func (ring *Ring) QueueMmap(fd int, p *Params) error { 188 return Mmap(fd, p, ring.sqRing, ring.cqRing) 189 } 190 191 // liburing: io_uring_ring_dontfork 192 func (ring *Ring) RingDontFork() error { 193 var length uintptr 194 var err error 195 196 if ring.sqRing.ringPtr == nil || ring.sqRing.sqes == nil || ring.cqRing.ringPtr == nil { 197 return syscall.EINVAL 198 } 199 200 length = unsafe.Sizeof(SubmissionQueueEntry{}) 201 if ring.flags&SetupSQE128 != 0 { 202 length += 64 203 } 204 length *= uintptr(*ring.sqRing.ringEntries) 205 err = sysMadvise(uintptr(unsafe.Pointer(ring.sqRing.sqes)), length, syscall.MADV_DONTFORK) 206 if err != nil { 207 return err 208 } 209 210 length = uintptr(ring.sqRing.ringSize) 211 err = sysMadvise(uintptr(ring.sqRing.ringPtr), length, syscall.MADV_DONTFORK) 212 if err != nil { 213 return err 214 } 215 216 if ring.cqRing.ringPtr != ring.sqRing.ringPtr { 217 length = uintptr(ring.cqRing.ringSize) 218 err = sysMadvise(uintptr(ring.cqRing.ringPtr), length, syscall.MADV_DONTFORK) 219 if err != nil { 220 return err 221 } 222 } 223 224 return nil 225 } 226 227 /* FIXME */ 228 const hugePageSize uint64 = 2 * 1024 * 1024 229 230 // liburing: io_uring_alloc_huge 231 func allocHuge( 232 entries uint32, p *Params, sq *SubmissionQueue, cq *CompletionQueue, buf unsafe.Pointer, bufSize uint64, 233 ) (uint, error) { 234 pageSize := uint64(os.Getpagesize()) 235 var sqEntries, cqEntries uint32 236 var ringMem, sqesMem uint64 237 var memUsed uint64 238 var ptr unsafe.Pointer 239 240 errno := getSqCqEntries(entries, p, &sqEntries, &cqEntries) 241 if errno != nil { 242 return 0, errno 243 } 244 245 sqesMem = uint64(sqEntries) * uint64(unsafe.Sizeof(SubmissionQueue{})) 246 sqesMem = (sqesMem + pageSize - 1) &^ (pageSize - 1) 247 ringMem = uint64(cqEntries) * uint64(unsafe.Sizeof(CompletionQueue{})) 248 if p.flags&SetupCQE32 != 0 { 249 ringMem *= 2 250 } 251 ringMem += uint64(sqEntries) * uint64(unsafe.Sizeof(uint32(0))) 252 memUsed = sqesMem + ringMem 253 memUsed = (memUsed + pageSize - 1) &^ (pageSize - 1) 254 255 if buf == nil && (sqesMem > hugePageSize || ringMem > hugePageSize) { 256 return 0, syscall.ENOMEM 257 } 258 259 if buf != nil { 260 if memUsed > bufSize { 261 return 0, syscall.ENOMEM 262 } 263 ptr = buf 264 } else { 265 var mapHugetlb int 266 if sqesMem <= pageSize { 267 bufSize = pageSize 268 } else { 269 bufSize = hugePageSize 270 mapHugetlb = syscall.MAP_HUGETLB 271 } 272 var err error 273 ptr, err = sysMmap( 274 0, uintptr(bufSize), 275 syscall.PROT_READ|syscall.PROT_WRITE, 276 syscall.MAP_SHARED|syscall.MAP_ANONYMOUS|mapHugetlb, -1, 0) 277 if err != nil { 278 return 0, err 279 } 280 } 281 282 sq.sqes = (*SubmissionQueueEntry)(ptr) 283 if memUsed <= bufSize { 284 sq.ringPtr = unsafe.Pointer(uintptr(unsafe.Pointer(sq.sqes)) + uintptr(sqesMem)) 285 cq.ringSize = 0 286 sq.ringSize = 0 287 } else { 288 var mapHugetlb int 289 if ringMem <= pageSize { 290 bufSize = pageSize 291 } else { 292 bufSize = hugePageSize 293 mapHugetlb = syscall.MAP_HUGETLB 294 } 295 var err error 296 ptr, err = sysMmap( 297 0, uintptr(bufSize), 298 syscall.PROT_READ|syscall.PROT_WRITE, 299 syscall.MAP_SHARED|syscall.MAP_ANONYMOUS|mapHugetlb, -1, 0) 300 if err != nil { 301 _ = sysMunmap(uintptr(unsafe.Pointer(sq.sqes)), 1) 302 303 return 0, err 304 } 305 sq.ringPtr = ptr 306 sq.ringSize = uint(bufSize) 307 cq.ringSize = 0 308 } 309 310 cq.ringPtr = sq.ringPtr 311 p.sqOff.userAddr = uint64(uintptr(unsafe.Pointer(sq.sqes))) 312 p.cqOff.userAddr = uint64(uintptr(sq.ringPtr)) 313 314 return uint(memUsed), nil 315 } 316 317 // liburing: __io_uring_queue_init_params 318 func (ring *Ring) internalQueueInitParams(entries uint32, p *Params, buf unsafe.Pointer, bufSize uint64) error { 319 var fd int 320 var sqEntries, index uint32 321 var err error 322 323 if p.flags&SetupRegisteredFdOnly != 0 && p.flags&SetupNoMmap == 0 { 324 return syscall.EINVAL 325 } 326 327 if p.flags&SetupNoMmap != 0 { 328 _, err = allocHuge(entries, p, ring.sqRing, ring.cqRing, buf, bufSize) 329 if err != nil { 330 return err 331 } 332 if buf != nil { 333 ring.intFlags |= IntFlagAppMem 334 } 335 } 336 337 fdPtr, _, errno := syscall.Syscall(sysSetup, uintptr(entries), uintptr(unsafe.Pointer(p)), 0) 338 if errno != 0 { 339 if p.flags&SetupNoMmap != 0 && ring.intFlags&IntFlagAppMem == 0 { 340 _ = sysMunmap(uintptr(unsafe.Pointer(ring.sqRing.sqes)), 1) 341 UnmapRings(ring.sqRing, ring.cqRing) 342 } 343 344 return errno 345 } 346 fd = int(fdPtr) 347 348 if p.flags&SetupNoMmap == 0 { 349 err = ring.QueueMmap(fd, p) 350 if err != nil { 351 syscall.Close(fd) 352 353 return err 354 } 355 } else { 356 SetupRingPointers(p, ring.sqRing, ring.cqRing) 357 } 358 359 sqEntries = *ring.sqRing.ringEntries 360 for index = 0; index < sqEntries; index++ { 361 *(*uint32)( 362 unsafe.Add(unsafe.Pointer(ring.sqRing.array), 363 index*uint32(unsafe.Sizeof(uint32(0))))) = index 364 } 365 366 ring.features = p.features 367 ring.flags = p.flags 368 ring.enterRingFd = fd 369 if p.flags&SetupRegisteredFdOnly != 0 { 370 ring.ringFd = -1 371 ring.intFlags |= IntFlagRegRing | IntFlagRegRegRing 372 } else { 373 ring.ringFd = fd 374 } 375 376 return nil 377 } 378 379 // liburing: io_uring_queue_init_mem 380 func (ring *Ring) QueueInitMem(entries uint32, p *Params, buf unsafe.Pointer, bufSize uint64) error { 381 // should already be set... 382 p.flags |= SetupNoMmap 383 384 return ring.internalQueueInitParams(entries, p, buf, bufSize) 385 } 386 387 // liburing: io_uring_queue_init_params - https://manpages.debian.org/unstable/liburing-dev/io_uring_queue_init_params.3.en.html 388 func (ring *Ring) QueueInitParams(entries uint32, p *Params) error { 389 return ring.internalQueueInitParams(entries, p, nil, 0) 390 } 391 392 // liburing: io_uring_queue_init - https://manpages.debian.org/unstable/liburing-dev/io_uring_queue_init.3.en.html 393 func (ring *Ring) QueueInit(entries uint32, flags uint32) error { 394 params := &Params{ 395 flags: flags, 396 } 397 398 return ring.QueueInitParams(entries, params) 399 } 400 401 // liburing: io_uring_queue_exit - https://manpages.debian.org/unstable/liburing-dev/io_uring_queue_exit.3.en.html 402 func (ring *Ring) QueueExit() { 403 sq := ring.sqRing 404 cq := ring.cqRing 405 var sqeSize uintptr 406 407 if sq.ringSize == 0 { 408 sqeSize = unsafe.Sizeof(SubmissionQueueEntry{}) 409 if ring.flags&SetupSQE128 != 0 { 410 sqeSize += 64 411 } 412 _ = sysMunmap(uintptr(unsafe.Pointer(sq.sqes)), sqeSize*uintptr(*sq.ringEntries)) 413 UnmapRings(sq, cq) 414 } else if ring.intFlags&IntFlagAppMem == 0 { 415 _ = sysMunmap(uintptr(unsafe.Pointer(sq.sqes)), uintptr(*sq.ringEntries)*unsafe.Sizeof(SubmissionQueueEntry{})) 416 UnmapRings(sq, cq) 417 } 418 419 if ring.intFlags&IntFlagRegRing != 0 { 420 _, _ = ring.UnregisterRingFd() 421 } 422 if ring.ringFd != -1 { 423 syscall.Close(ring.ringFd) 424 } 425 } 426 427 const ringSize = 320 428 429 func npages(size uint64, pageSize uint64) uint64 { 430 size-- 431 size /= pageSize 432 433 return uint64(fls(int(size))) 434 } 435 436 const ( 437 not63ul = 18446744073709551552 438 ringSizeCQOff = 63 439 ) 440 441 // liburing: rings_size 442 func ringsSize(p *Params, entries uint32, cqEntries uint32, pageSize uint64) uint64 { 443 var pages, sqSize, cqSize uint64 444 445 cqSize = uint64(unsafe.Sizeof(CompletionQueueEvent{})) 446 if p.flags&SetupCQE32 != 0 { 447 cqSize += uint64(unsafe.Sizeof(CompletionQueueEvent{})) 448 } 449 cqSize *= uint64(cqEntries) 450 cqSize += ringSize 451 cqSize = (cqSize + ringSizeCQOff) & not63ul 452 pages = 1 << npages(cqSize, pageSize) 453 454 sqSize = uint64(unsafe.Sizeof(SubmissionQueueEntry{})) 455 if p.flags&SetupSQE128 != 0 { 456 sqSize += 64 457 } 458 sqSize *= uint64(entries) 459 pages += 1 << npages(sqSize, pageSize) 460 461 return pages * pageSize 462 } 463 464 // liburing: io_uring_mlock_size_params 465 func MlockSizeParams(entries uint32, p *Params) (uint64, error) { 466 lp := &Params{} 467 ring := NewRing() 468 var cqEntries, sq uint32 469 var pageSize uint64 470 var err error 471 472 err = ring.QueueInitParams(entries, lp) 473 if err != nil { 474 ring.QueueExit() 475 } 476 477 if lp.features&FeatNativeWorkers != 0 { 478 return 0, nil 479 } 480 481 if entries == 0 { 482 return 0, syscall.EINVAL 483 } 484 if entries > kernMaxEntries { 485 if p.flags&SetupClamp == 0 { 486 return 0, syscall.EINVAL 487 } 488 entries = kernMaxEntries 489 } 490 491 err = getSqCqEntries(entries, p, &sq, &cqEntries) 492 if err != nil { 493 return 0, err 494 } 495 496 pageSize = uint64(os.Getpagesize()) 497 498 return ringsSize(p, sq, cqEntries, pageSize), nil 499 } 500 501 // liburing: io_uring_mlock_size 502 func MlockSize(entries, flags uint32) (uint64, error) { 503 p := &Params{} 504 p.flags = flags 505 506 return MlockSizeParams(entries, p) 507 } 508 509 // liburing: br_setup 510 func (ring *Ring) brSetup(nentries uint32, bgid uint16, flags uint32) (*BufAndRing, error) { 511 var br *BufAndRing 512 var reg BufReg 513 var ringSize, brPtr uintptr 514 var err error 515 516 reg = BufReg{} 517 ringSize = uintptr(nentries) * unsafe.Sizeof(BufAndRing{}) 518 brPtr, err = mmap( 519 0, ringSize, syscall.PROT_READ|syscall.PROT_WRITE, 520 syscall.MAP_ANONYMOUS|syscall.MAP_PRIVATE, -1, 0) 521 if err != nil { 522 return nil, err 523 } 524 br = (*BufAndRing)(unsafe.Pointer(brPtr)) 525 526 reg.RingAddr = uint64(uintptr(unsafe.Pointer(br))) 527 reg.RingEntries = nentries 528 reg.Bgid = bgid 529 530 _, err = ring.RegisterBufferRing(®, flags) 531 if err != nil { 532 _ = sysMunmap(uintptr(unsafe.Pointer(br)), ringSize) 533 534 return nil, err 535 } 536 537 return br, nil 538 } 539 540 // liburing: io_uring_setup_buf_ring - https://manpages.debian.org/unstable/liburing-dev/io_uring_setup_buf_ring.3.en.html 541 func (ring *Ring) SetupBufRing(nentries uint32, bgid int, flags uint32) (*BufAndRing, error) { 542 br, err := ring.brSetup(nentries, uint16(bgid), flags) 543 if br != nil { 544 br.BufRingInit() 545 } 546 547 return br, err 548 } 549 550 // liburing: io_uring_free_buf_ring - https://manpages.debian.org/unstable/liburing-dev/io_uring_free_buf_ring.3.en.html 551 func (ring *Ring) FreeBufRing(bgid int) error { 552 _, err := ring.UnregisterBufferRing(bgid) 553 554 return err 555 } 556 557 func (ring *Ring) RingFd() int { 558 return ring.ringFd 559 }