github.com/google/syzkaller@v0.0.0-20251211124644-a066d2bc4b02/executor/snapshot.h (about) 1 // Copyright 2024 syzkaller project authors. All rights reserved. 2 // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. 3 4 #include <dirent.h> 5 #include <stdlib.h> 6 #include <sys/stat.h> 7 #include <sys/types.h> 8 #include <unistd.h> 9 10 #include <atomic> 11 #include <string> 12 #include <utility> 13 14 #ifndef MADV_POPULATE_WRITE 15 #define MADV_POPULATE_WRITE 23 16 #endif 17 18 // Size of qemu snapshots and time required to restore a snapshot depend on the amount of memory 19 // the VM touches after boot. For example, a 132 MB snapshot takes around 150ms to restore, 20 // while a 260 MB snapshot takes around 275 ms to restore. 21 // 22 // To reduce size of the snapshot it's recommended to use smaller kernel and setup fewer devices. 23 // For example the following cmdline arguments: 24 // "loop.max_loop=1 dummy_hcd.num=1 vivid.n_devs=2 vivid.multiplanar=1,2 netrom.nr_ndevs=1 rose.rose_ndevs=1" 25 // and CONFIG_USBIP_VHCI_NR_HCS=1 help to reduce snapshot by about 20 MB. Note: we have only 1 proc 26 // in snapshot mode, so we don't need lots of devices. However, our descriptions rely on vivid.n_devs=16 27 // since they hardcode names like /dev/video36 which follow after these 16 pre-created devices. 28 // 29 // Additionally we could try to use executor as init process, this should remove dhcpd/sshd/udevd/klogd/etc. 30 // We don't need even networking in snapshot mode since we communicate via shared memory. 31 32 static struct { 33 // Ivshmem interrupt doorbell register. 34 volatile uint32* doorbell; 35 volatile rpc::SnapshotHeaderT* hdr; 36 void* input; 37 } ivs; 38 39 // Finds qemu ivshmem device, see: 40 // https://www.qemu.org/docs/master/specs/ivshmem-spec.html 41 static void FindIvshmemDevices() 42 { 43 std::string result; 44 DIR* devices = opendir("/sys/bus/pci/devices"); 45 if (!devices) 46 fail("opendir(/sys/bus/pci/devices) failed"); 47 void* regs = nullptr; 48 void* input = nullptr; 49 void* output = nullptr; 50 while (auto* dev = readdir(devices)) { 51 if (dev->d_name[0] == '.') 52 continue; 53 const std::string& vendor = ReadTextFile("/sys/bus/pci/devices/%s/vendor", dev->d_name); 54 const std::string& device = ReadTextFile("/sys/bus/pci/devices/%s/device", dev->d_name); 55 debug("PCI device %s: vendor=%s device=%s\n", dev->d_name, vendor.c_str(), device.c_str()); 56 if (vendor != "0x1af4" || device != "0x1110") 57 continue; 58 char filename[1024]; 59 snprintf(filename, sizeof(filename), "/sys/bus/pci/devices/%s/resource2", dev->d_name); 60 int res2 = open(filename, O_RDWR); 61 if (res2 == -1) 62 fail("failed to open ivshmem resource2"); 63 struct stat statbuf; 64 if (fstat(res2, &statbuf)) 65 fail("failed to fstat ivshmem resource2"); 66 debug("ivshmem resource2 size %zu\n", static_cast<size_t>(statbuf.st_size)); 67 // The only way to distinguish the 2 ivshmem regions is by size. 68 if (statbuf.st_size == static_cast<uint64>(rpc::Const::SnapshotDoorbellSize)) { 69 snprintf(filename, sizeof(filename), "/sys/bus/pci/devices/%s/resource0", dev->d_name); 70 int res0 = open(filename, O_RDWR); 71 if (res0 == -1) 72 fail("failed to open ivshmem resource0"); 73 regs = mmap(nullptr, getpagesize(), PROT_READ | PROT_WRITE, MAP_SHARED, res0, 0); 74 close(res0); 75 if (regs == MAP_FAILED) 76 fail("failed to mmap ivshmem resource0"); 77 debug("mapped doorbell registers at %p\n", regs); 78 } else if (statbuf.st_size == static_cast<uint64>(rpc::Const::SnapshotShmemSize)) { 79 input = mmap(nullptr, static_cast<uint64>(rpc::Const::MaxInputSize), 80 PROT_READ, MAP_SHARED, res2, 0); 81 output = mmap(nullptr, static_cast<uint64>(rpc::Const::MaxOutputSize), 82 PROT_READ | PROT_WRITE, MAP_SHARED, res2, 83 static_cast<uint64>(rpc::Const::MaxInputSize)); 84 if (input == MAP_FAILED || output == MAP_FAILED) 85 fail("failed to mmap ivshmem resource2"); 86 debug("mapped shmem input at at %p/%llu\n", 87 input, static_cast<uint64>(rpc::Const::MaxInputSize)); 88 debug("mapped shmem output at at %p/%llu\n", 89 output, static_cast<uint64>(rpc::Const::MaxOutputSize)); 90 #if GOOS_linux 91 if (pkeys_enabled && pkey_mprotect(output, static_cast<uint64>(rpc::Const::MaxOutputSize), 92 PROT_READ | PROT_WRITE, RESERVED_PKEY)) 93 exitf("failed to pkey_mprotect output buffer"); 94 #endif 95 } 96 close(res2); 97 } 98 closedir(devices); 99 if (regs == nullptr || input == nullptr) 100 fail("cannot find ivshmem PCI devices"); 101 ivs.doorbell = static_cast<uint32*>(regs) + 3; 102 ivs.hdr = static_cast<rpc::SnapshotHeaderT*>(output); 103 ivs.input = input; 104 output_data = reinterpret_cast<OutputData*>(static_cast<char*>(output) + sizeof(rpc::SnapshotHeaderT)); 105 output_size = static_cast<uint64>(rpc::Const::MaxOutputSize) - sizeof(rpc::SnapshotHeaderT); 106 } 107 108 static void SnapshotSetup(char** argv, int argc) 109 { 110 flag_snapshot = true; 111 // This allows to see debug output during early setup. 112 // If debug is not actually enabled, it will be turned off in parse_handshake. 113 flag_debug = true; 114 #if GOOS_linux 115 // In snapshot mode executor output is redirected to /dev/kmsg. 116 // This is required to turn off rate limiting of writes. 117 write_file("/proc/sys/kernel/printk_devkmsg", "on\n"); 118 #endif 119 FindIvshmemDevices(); 120 // Wait for the host to write handshake_req into input memory. 121 while (ivs.hdr->state != rpc::SnapshotState::Handshake) 122 sleep_ms(10); 123 auto msg = flatbuffers::GetRoot<rpc::SnapshotHandshake>(ivs.input); 124 handshake_req req = { 125 .magic = kInMagic, 126 .use_cover_edges = msg->cover_edges(), 127 .is_kernel_64_bit = msg->kernel_64_bit(), 128 .flags = msg->env_flags(), 129 .pid = 0, 130 .sandbox_arg = static_cast<uint64>(msg->sandbox_arg()), 131 .syscall_timeout_ms = static_cast<uint64>(msg->syscall_timeout_ms()), 132 .program_timeout_ms = static_cast<uint64>(msg->program_timeout_ms()), 133 .slowdown_scale = static_cast<uint64>(msg->slowdown()), 134 }; 135 parse_handshake(req); 136 #if SYZ_HAVE_FEATURES 137 setup_sysctl(); 138 setup_cgroups(); 139 #endif 140 #if SYZ_HAVE_SETUP_EXT 141 // This can be defined in common_ext.h. 142 setup_ext(); 143 #endif 144 for (const auto& feat : features) { 145 if (!(msg->features() & feat.id)) 146 continue; 147 debug("setting up feature %s\n", rpc::EnumNameFeature(feat.id)); 148 const char* reason = feat.setup(); 149 if (reason) 150 failmsg("feature setup failed", "reason: %s", reason); 151 } 152 } 153 154 constexpr size_t kOutputPopulate = 256 << 10; 155 constexpr size_t kInputPopulate = 64 << 10; 156 constexpr size_t kGlobalsPopulate = 4 << 10; 157 constexpr size_t kDataPopulate = 8 << 10; 158 constexpr size_t kCoveragePopulate = 64 << 10; 159 constexpr size_t kThreadsPopulate = 2; 160 161 static void SnapshotSetState(rpc::SnapshotState state) 162 { 163 debug("changing stapshot state %s -> %s\n", 164 rpc::EnumNameSnapshotState(ivs.hdr->state), rpc::EnumNameSnapshotState(state)); 165 std::atomic_signal_fence(std::memory_order_seq_cst); 166 ivs.hdr->state = state; 167 // The register contains VM index shifted by 16 (the host part is VM index 1) 168 // + interrup vector index (0 in our case). 169 *ivs.doorbell = 1 << 16; 170 } 171 172 // PopulateMemory prefaults anon memory (we want to avoid minor page faults as well). 173 static void PopulateMemory(void* ptr, size_t size) 174 { 175 ptr = (void*)(uintptr_t(ptr) & ~(getpagesize() - 1)); 176 if (madvise(ptr, size, MADV_POPULATE_WRITE)) 177 failmsg("populate madvise failed", "ptr=%p size=%zu", ptr, size); 178 } 179 180 // TouchMemory prefaults non-anon shared memory. 181 static void TouchMemory(void* ptr, size_t size) 182 { 183 size_t const kPageSize = getpagesize(); 184 for (size_t i = 0; i < size; i += kPageSize) 185 (void)((volatile char*)ptr)[i]; 186 } 187 188 #if SYZ_EXECUTOR_USES_FORK_SERVER 189 static void SnapshotPrepareParent() 190 { 191 // This allows access to the output region. 192 CoverAccessScope scope(nullptr); 193 TouchMemory((char*)output_data + output_size - kOutputPopulate, kOutputPopulate); 194 // Notify SnapshotStart that we finished prefaulting memory in the parent. 195 output_data->completed = 1; 196 // Wait for the request to come, so that we give it full time slice to execute. 197 // This process will start waiting for the child as soon as we return. 198 while (ivs.hdr->state != rpc::SnapshotState::Execute) 199 ; 200 } 201 #endif 202 203 static void SnapshotStart() 204 { 205 debug("SnapshotStart\n"); 206 CoverAccessScope scope(nullptr); 207 // Prefault as much memory as we can before the snapshot is taken. 208 // Also pre-create some threads and let them block. 209 // This is intended to make execution after each snapshot restore faster, 210 // as we won't need to do that duplicate work again and again. 211 flag_threaded = true; 212 for (size_t i = 0; i < kThreadsPopulate; i++) { 213 thread_t* th = &threads[i]; 214 thread_create(th, i, flag_coverage); 215 if (flag_coverage) 216 PopulateMemory(th->cov.data, kCoveragePopulate); 217 } 218 TouchMemory((char*)output_data + output_size - kOutputPopulate, kOutputPopulate); 219 TouchMemory(ivs.input, kInputPopulate); 220 PopulateMemory(&flag_coverage, kGlobalsPopulate); 221 PopulateMemory((void*)SYZ_DATA_OFFSET, kDataPopulate); 222 sleep_ms(100); // let threads start and block 223 // Wait for the parent process to prefault as well. 224 while (!output_data->completed) 225 sleep_ms(1); 226 // Notify host that we are ready to be snapshotted. 227 SnapshotSetState(rpc::SnapshotState::Ready); 228 // Snapshot is restored here. 229 // First time we may loop here while the snapshot is taken, 230 // but afterwards we should be restored when the state is already Execute. 231 // Note: we don't use sleep in the loop because we may be snapshotted while in the sleep syscall. 232 // As the result each execution after snapshot restore will be slower as it will need to finish 233 // the sleep and return from the syscall. 234 while (ivs.hdr->state == rpc::SnapshotState::Ready) 235 ; 236 if (ivs.hdr->state == rpc::SnapshotState::Snapshotted) { 237 // First time around, just acknowledge and wait for snapshot restart. 238 SnapshotSetState(rpc::SnapshotState::Executed); 239 for (;;) 240 sleep(1000); 241 } 242 // Resumed for program execution. 243 output_data->Reset(); 244 auto msg = flatbuffers::GetRoot<rpc::SnapshotRequest>(ivs.input); 245 execute_req req = { 246 .magic = kInMagic, 247 .id = 0, 248 .type = rpc::RequestType::Program, 249 .exec_flags = static_cast<uint64>(msg->exec_flags()), 250 .all_call_signal = msg->all_call_signal(), 251 .all_extra_signal = msg->all_extra_signal(), 252 }; 253 parse_execute(req); 254 output_data->num_calls.store(msg->num_calls(), std::memory_order_relaxed); 255 input_data = const_cast<uint8*>(msg->prog_data()->Data()); 256 } 257 258 NORETURN static void SnapshotDone(bool failed) 259 { 260 debug("SnapshotDone\n"); 261 CoverAccessScope scope(nullptr); 262 uint32 num_calls = output_data->num_calls.load(std::memory_order_relaxed); 263 auto data = finish_output(output_data, 0, 0, num_calls, 0, 0, failed ? kFailStatus : 0, false, nullptr); 264 ivs.hdr->output_offset = data.data() - reinterpret_cast<volatile uint8_t*>(ivs.hdr); 265 ivs.hdr->output_size = data.size(); 266 SnapshotSetState(failed ? rpc::SnapshotState::Failed : rpc::SnapshotState::Executed); 267 // Wait to be restarted from the snapshot. 268 for (;;) 269 sleep(1000); 270 }