github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/platform/kvm/address_space.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kvm

import (
	"sync/atomic"

	"github.com/SagerNet/gvisor/pkg/atomicbitops"
	"github.com/SagerNet/gvisor/pkg/hostarch"
	"github.com/SagerNet/gvisor/pkg/ring0/pagetables"
	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
	"github.com/SagerNet/gvisor/pkg/sentry/platform"
	"github.com/SagerNet/gvisor/pkg/sync"
)

// dirtySet tracks vCPUs for invalidation.
type dirtySet struct {
	vCPUMasks []uint64
}

// forEach iterates over all CPUs in the dirty set.
//
//go:nosplit
func (ds *dirtySet) forEach(m *machine, fn func(c *vCPU)) {
	for index := range ds.vCPUMasks {
		mask := atomic.SwapUint64(&ds.vCPUMasks[index], 0)
		if mask != 0 {
			for bit := 0; bit < 64; bit++ {
				if mask&(1<<uint64(bit)) == 0 {
					continue
				}
				id := 64*index + bit
				fn(m.vCPUsByID[id])
			}
		}
	}
}

// mark marks the given vCPU as dirty and returns whether it was previously
// clean. Being previously clean implies that a flush is needed on entry.
func (ds *dirtySet) mark(c *vCPU) bool {
	index := uint64(c.id) / 64
	bit := uint64(1) << uint(c.id%64)

	oldValue := atomic.LoadUint64(&ds.vCPUMasks[index])
	if oldValue&bit != 0 {
		return false // Not clean.
	}

	// Set the bit unilaterally, and ensure that a flush takes place. Note
	// that it's possible for races to occur here, but since the flush is
	// taking place long after these lines there's no race in practice.
	atomicbitops.OrUint64(&ds.vCPUMasks[index], bit)
	return true // Previously clean.
}

// addressSpace is a wrapper for PageTables.
type addressSpace struct {
	platform.NoAddressSpaceIO

	// mu is the lock for modifications to the address space.
	//
	// Note that the page tables themselves are not locked.
	mu sync.Mutex

	// machine is the underlying machine.
	machine *machine

	// pageTables are for this particular address space.
	pageTables *pagetables.PageTables

	// dirtySet is the set of dirty vCPUs.
	dirtySet *dirtySet
}

// Invalidate interrupts all dirty contexts.
func (as *addressSpace) Invalidate() {
	as.mu.Lock()
	defer as.mu.Unlock()
	as.invalidate()
}

// Touch adds the given vCPU to the dirty list.
//
// The return value indicates whether a flush is required.
func (as *addressSpace) Touch(c *vCPU) bool {
	return as.dirtySet.mark(c)
}

type hostMapEntry struct {
	addr   uintptr
	length uintptr
}
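
// dirtySetEncodingSketch is a hypothetical, standalone illustration and is
// not part of the original file. It demonstrates the word/bit encoding that
// mark and forEach above agree on, assuming a plain []uint64 mask slice: a
// vCPU id selects word id/64 and bit id%64, mark sets that bit with an atomic
// OR, and forEach later swaps the whole word to zero and reconstructs the id
// as 64*index + bit.
func dirtySetEncodingSketch(masks []uint64, id int) (previouslyClean bool) {
	index := id / 64
	bit := uint64(1) << uint(id%64)
	if atomic.LoadUint64(&masks[index])&bit != 0 {
		return false // Already dirty; the pending flush covers this vCPU.
	}
	atomicbitops.OrUint64(&masks[index], bit)
	return true // Previously clean; a flush is needed on the next entry.
}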

// mapLocked maps the given host entry.
//
// +checkescape:hard,stack
//
//go:nosplit
func (as *addressSpace) mapLocked(addr hostarch.Addr, m hostMapEntry, at hostarch.AccessType) (inv bool) {
	for m.length > 0 {
		physical, length, ok := translateToPhysical(m.addr)
		if !ok {
			panic("unable to translate segment")
		}
		if length > m.length {
			length = m.length
		}

		// Ensure that this map has physical mappings. If the page does
		// not have physical mappings, the KVM module may inject
		// spurious exceptions when emulation fails (i.e. it tries to
		// emulate because the RIP is pointed at those pages).
		as.machine.mapPhysical(physical, length, physicalRegions, _KVM_MEM_FLAGS_NONE)

		// Install the page table mappings. Note that the ordering is
		// important; if the pagetable mappings were installed before
		// ensuring the physical pages were available, then some other
		// thread could theoretically access them.
		inv = as.pageTables.Map(addr, length, pagetables.MapOpts{
			AccessType: at,
			User:       true,
		}, physical) || inv
		m.addr += length
		m.length -= length
		addr += hostarch.Addr(length)
	}

	return inv
}

// MapFile implements platform.AddressSpace.MapFile.
func (as *addressSpace) MapFile(addr hostarch.Addr, f memmap.File, fr memmap.FileRange, at hostarch.AccessType, precommit bool) error {
	as.mu.Lock()
	defer as.mu.Unlock()

	// Get mappings in the sentry's address space, which are guaranteed to be
	// valid as long as a reference is held on the mapped pages (which is in
	// turn required by the AddressSpace.MapFile precondition).
	//
	// If precommit is true, we will touch mappings to commit them, so ensure
	// that mappings are readable from sentry context.
	//
	// We don't execute from application file-mapped memory, and guest page
	// tables don't care if we have execute permission (but they do need pages
	// to be readable).
	bs, err := f.MapInternal(fr, hostarch.AccessType{
		Read:  at.Read || at.Execute || precommit,
		Write: at.Write,
	})
	if err != nil {
		return err
	}

	// See block in mapLocked.
	as.pageTables.Allocator.(*allocator).cpu = as.machine.Get()
	defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu)

	// Map the mappings in the sentry's address space (guest physical memory)
	// into the application's address space (guest virtual memory).
	inv := false
	for !bs.IsEmpty() {
		b := bs.Head()
		bs = bs.Tail()
		// Since fr was page-aligned, b should also be page-aligned. We do the
		// lookup in our host page tables for this translation.
		if precommit {
			s := b.ToSlice()
			for i := 0; i < len(s); i += hostarch.PageSize {
				_ = s[i] // Touch to commit.
			}
		}

		// See bluepill_allocator.go.
		bluepill(as.pageTables.Allocator.(*allocator).cpu)

		// Perform the mapping.
		prev := as.mapLocked(addr, hostMapEntry{
			addr:   b.Addr(),
			length: uintptr(b.Len()),
		}, at)
		inv = inv || prev
		addr += hostarch.Addr(b.Len())
	}
	if inv {
		as.invalidate()
	}

	return nil
}

// unmapLocked is an escape-checked wrapper around Unmap.
//
// +checkescape:hard,stack
//
//go:nosplit
func (as *addressSpace) unmapLocked(addr hostarch.Addr, length uint64) bool {
	return as.pageTables.Unmap(addr, uintptr(length))
}
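
// mapFileSketch is a hypothetical illustration and is not part of the
// original file. It sketches how a caller might drive MapFile above: a single
// file range is mapped read/write at a guest virtual address, with precommit
// set so that MapFile touches every page before the mapping is used. The
// helper name and parameters are assumptions for illustration only.
func mapFileSketch(as *addressSpace, f memmap.File, fr memmap.FileRange, addr hostarch.Addr) error {
	// Request read/write access; MapFile itself widens the sentry-side
	// mapping to readable when precommit is set.
	at := hostarch.AccessType{Read: true, Write: true}
	return as.MapFile(addr, f, fr, at, true /* precommit */)
}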

// Unmap unmaps the given range by calling pagetables.PageTables.Unmap.
func (as *addressSpace) Unmap(addr hostarch.Addr, length uint64) {
	as.mu.Lock()
	defer as.mu.Unlock()

	// See above & bluepill_allocator.go.
	as.pageTables.Allocator.(*allocator).cpu = as.machine.Get()
	defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu)
	bluepill(as.pageTables.Allocator.(*allocator).cpu)

	if prev := as.unmapLocked(addr, length); prev {
		// Invalidate all active vCPUs.
		as.invalidate()

		// Recycle any freed intermediate pages.
		as.pageTables.Allocator.Recycle()
	}
}

// Release releases the page tables.
func (as *addressSpace) Release() {
	as.Unmap(0, ^uint64(0))

	// Free all pages from the allocator.
	as.pageTables.Allocator.(*allocator).base.Drain()

	// Drop all cached machine references.
	as.machine.dropPageTables(as.pageTables)
}

// PreFork implements platform.AddressSpace.PreFork.
func (as *addressSpace) PreFork() {}

// PostFork implements platform.AddressSpace.PostFork.
func (as *addressSpace) PostFork() {}
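
// touchAndUnmapSketch is a hypothetical illustration and is not part of the
// original file. It shows the lifecycle the methods above implement: a vCPU
// touches the address space before entering the guest (a true return means
// the entry path must flush), and a later Unmap interrupts every dirty vCPU
// so that stale translations are not reused on the next entry.
func touchAndUnmapSketch(as *addressSpace, c *vCPU, addr hostarch.Addr, length uint64) {
	if as.Touch(c) {
		// First touch since the last drain: a flush is needed on entry
		// (see dirtySet.mark above); the real flush happens in the vCPU
		// entry path, not here.
	}

	// Unmap invalidates all dirty vCPUs if any mappings were removed and
	// recycles freed intermediate page-table pages.
	as.Unmap(addr, length)
}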