github.com/google/syzkaller@v0.0.0-20251211124644-a066d2bc4b02/executor/snapshot.h (about)

     1  // Copyright 2024 syzkaller project authors. All rights reserved.
     2  // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
     3  
     4  #include <dirent.h>
     5  #include <stdlib.h>
     6  #include <sys/stat.h>
     7  #include <sys/types.h>
     8  #include <unistd.h>
     9  
    10  #include <atomic>
    11  #include <string>
    12  #include <utility>
    13  
    14  #ifndef MADV_POPULATE_WRITE
    15  #define MADV_POPULATE_WRITE 23
    16  #endif
    17  
    18  // Size of qemu snapshots and time required to restore a snapshot depend on the amount of memory
    19  // the VM touches after boot. For example, a 132 MB snapshot takes around 150ms to restore,
    20  // while a 260 MB snapshot takes around 275 ms to restore.
    21  //
    22  // To reduce size of the snapshot it's recommended to use smaller kernel and setup fewer devices.
    23  // For example the following cmdline arguments:
    24  // "loop.max_loop=1 dummy_hcd.num=1 vivid.n_devs=2 vivid.multiplanar=1,2 netrom.nr_ndevs=1 rose.rose_ndevs=1"
    25  // and CONFIG_USBIP_VHCI_NR_HCS=1 help to reduce snapshot by about 20 MB. Note: we have only 1 proc
    26  // in snapshot mode, so we don't need lots of devices. However, our descriptions rely on vivid.n_devs=16
    27  // since they hardcode names like /dev/video36 which follow after these 16 pre-created devices.
    28  //
    29  // Additionally we could try to use executor as init process, this should remove dhcpd/sshd/udevd/klogd/etc.
    30  // We don't need even networking in snapshot mode since we communicate via shared memory.
    31  
// State describing the ivshmem mappings used in snapshot mode.
// All fields are filled in by FindIvshmemDevices.
static struct {
	// Ivshmem interrupt doorbell register.
	// Points 3 uint32 words past the start of the mapped resource0 register page.
	volatile uint32* doorbell;
	// Snapshot header located at the very start of the read-write output mapping.
	// The snapshot state and the location of the produced output are stored here.
	volatile rpc::SnapshotHeaderT* hdr;
	// Start of the input region, which is mapped read-only (PROT_READ).
	void* input;
} ivs;
    38  
    39  // Finds qemu ivshmem device, see:
    40  // https://www.qemu.org/docs/master/specs/ivshmem-spec.html
    41  static void FindIvshmemDevices()
    42  {
    43  	std::string result;
    44  	DIR* devices = opendir("/sys/bus/pci/devices");
    45  	if (!devices)
    46  		fail("opendir(/sys/bus/pci/devices) failed");
    47  	void* regs = nullptr;
    48  	void* input = nullptr;
    49  	void* output = nullptr;
    50  	while (auto* dev = readdir(devices)) {
    51  		if (dev->d_name[0] == '.')
    52  			continue;
    53  		const std::string& vendor = ReadTextFile("/sys/bus/pci/devices/%s/vendor", dev->d_name);
    54  		const std::string& device = ReadTextFile("/sys/bus/pci/devices/%s/device", dev->d_name);
    55  		debug("PCI device %s: vendor=%s device=%s\n", dev->d_name, vendor.c_str(), device.c_str());
    56  		if (vendor != "0x1af4" || device != "0x1110")
    57  			continue;
    58  		char filename[1024];
    59  		snprintf(filename, sizeof(filename), "/sys/bus/pci/devices/%s/resource2", dev->d_name);
    60  		int res2 = open(filename, O_RDWR);
    61  		if (res2 == -1)
    62  			fail("failed to open ivshmem resource2");
    63  		struct stat statbuf;
    64  		if (fstat(res2, &statbuf))
    65  			fail("failed to fstat ivshmem resource2");
    66  		debug("ivshmem resource2 size %zu\n", static_cast<size_t>(statbuf.st_size));
    67  		// The only way to distinguish the 2 ivshmem regions is by size.
    68  		if (statbuf.st_size == static_cast<uint64>(rpc::Const::SnapshotDoorbellSize)) {
    69  			snprintf(filename, sizeof(filename), "/sys/bus/pci/devices/%s/resource0", dev->d_name);
    70  			int res0 = open(filename, O_RDWR);
    71  			if (res0 == -1)
    72  				fail("failed to open ivshmem resource0");
    73  			regs = mmap(nullptr, getpagesize(), PROT_READ | PROT_WRITE, MAP_SHARED, res0, 0);
    74  			close(res0);
    75  			if (regs == MAP_FAILED)
    76  				fail("failed to mmap ivshmem resource0");
    77  			debug("mapped doorbell registers at %p\n", regs);
    78  		} else if (statbuf.st_size == static_cast<uint64>(rpc::Const::SnapshotShmemSize)) {
    79  			input = mmap(nullptr, static_cast<uint64>(rpc::Const::MaxInputSize),
    80  				     PROT_READ, MAP_SHARED, res2, 0);
    81  			output = mmap(nullptr, static_cast<uint64>(rpc::Const::MaxOutputSize),
    82  				      PROT_READ | PROT_WRITE, MAP_SHARED, res2,
    83  				      static_cast<uint64>(rpc::Const::MaxInputSize));
    84  			if (input == MAP_FAILED || output == MAP_FAILED)
    85  				fail("failed to mmap ivshmem resource2");
    86  			debug("mapped shmem input at at %p/%llu\n",
    87  			      input, static_cast<uint64>(rpc::Const::MaxInputSize));
    88  			debug("mapped shmem output at at %p/%llu\n",
    89  			      output, static_cast<uint64>(rpc::Const::MaxOutputSize));
    90  #if GOOS_linux
    91  			if (pkeys_enabled && pkey_mprotect(output, static_cast<uint64>(rpc::Const::MaxOutputSize),
    92  							   PROT_READ | PROT_WRITE, RESERVED_PKEY))
    93  				exitf("failed to pkey_mprotect output buffer");
    94  #endif
    95  		}
    96  		close(res2);
    97  	}
    98  	closedir(devices);
    99  	if (regs == nullptr || input == nullptr)
   100  		fail("cannot find ivshmem PCI devices");
   101  	ivs.doorbell = static_cast<uint32*>(regs) + 3;
   102  	ivs.hdr = static_cast<rpc::SnapshotHeaderT*>(output);
   103  	ivs.input = input;
   104  	output_data = reinterpret_cast<OutputData*>(static_cast<char*>(output) + sizeof(rpc::SnapshotHeaderT));
   105  	output_size = static_cast<uint64>(rpc::Const::MaxOutputSize) - sizeof(rpc::SnapshotHeaderT);
   106  }
   107  
   108  static void SnapshotSetup(char** argv, int argc)
   109  {
   110  	flag_snapshot = true;
   111  	// This allows to see debug output during early setup.
   112  	// If debug is not actually enabled, it will be turned off in parse_handshake.
   113  	flag_debug = true;
   114  #if GOOS_linux
   115  	// In snapshot mode executor output is redirected to /dev/kmsg.
   116  	// This is required to turn off rate limiting of writes.
   117  	write_file("/proc/sys/kernel/printk_devkmsg", "on\n");
   118  #endif
   119  	FindIvshmemDevices();
   120  	// Wait for the host to write handshake_req into input memory.
   121  	while (ivs.hdr->state != rpc::SnapshotState::Handshake)
   122  		sleep_ms(10);
   123  	auto msg = flatbuffers::GetRoot<rpc::SnapshotHandshake>(ivs.input);
   124  	handshake_req req = {
   125  	    .magic = kInMagic,
   126  	    .use_cover_edges = msg->cover_edges(),
   127  	    .is_kernel_64_bit = msg->kernel_64_bit(),
   128  	    .flags = msg->env_flags(),
   129  	    .pid = 0,
   130  	    .sandbox_arg = static_cast<uint64>(msg->sandbox_arg()),
   131  	    .syscall_timeout_ms = static_cast<uint64>(msg->syscall_timeout_ms()),
   132  	    .program_timeout_ms = static_cast<uint64>(msg->program_timeout_ms()),
   133  	    .slowdown_scale = static_cast<uint64>(msg->slowdown()),
   134  	};
   135  	parse_handshake(req);
   136  #if SYZ_HAVE_FEATURES
   137  	setup_sysctl();
   138  	setup_cgroups();
   139  #endif
   140  #if SYZ_HAVE_SETUP_EXT
   141  	// This can be defined in common_ext.h.
   142  	setup_ext();
   143  #endif
   144  	for (const auto& feat : features) {
   145  		if (!(msg->features() & feat.id))
   146  			continue;
   147  		debug("setting up feature %s\n", rpc::EnumNameFeature(feat.id));
   148  		const char* reason = feat.setup();
   149  		if (reason)
   150  			failmsg("feature setup failed", "reason: %s", reason);
   151  	}
   152  }
   153  
// Amounts of memory prefaulted, and number of threads pre-created, before the snapshot is taken.

// Bytes touched at the end of the output region.
constexpr size_t kOutputPopulate = 256 << 10;
// Bytes touched at the start of the input region.
constexpr size_t kInputPopulate = 64 << 10;
// Bytes populated starting from the page that holds flag_coverage.
constexpr size_t kGlobalsPopulate = 4 << 10;
// Bytes populated starting at SYZ_DATA_OFFSET.
constexpr size_t kDataPopulate = 8 << 10;
// Bytes of each pre-created thread's coverage buffer populated when coverage is enabled.
constexpr size_t kCoveragePopulate = 64 << 10;
// Number of threads created ahead of the snapshot.
constexpr size_t kThreadsPopulate = 2;
   160  
   161  static void SnapshotSetState(rpc::SnapshotState state)
   162  {
   163  	debug("changing stapshot state %s -> %s\n",
   164  	      rpc::EnumNameSnapshotState(ivs.hdr->state), rpc::EnumNameSnapshotState(state));
   165  	std::atomic_signal_fence(std::memory_order_seq_cst);
   166  	ivs.hdr->state = state;
   167  	// The register contains VM index shifted by 16 (the host part is VM index 1)
   168  	// + interrup vector index (0 in our case).
   169  	*ivs.doorbell = 1 << 16;
   170  }
   171  
   172  // PopulateMemory prefaults anon memory (we want to avoid minor page faults as well).
   173  static void PopulateMemory(void* ptr, size_t size)
   174  {
   175  	ptr = (void*)(uintptr_t(ptr) & ~(getpagesize() - 1));
   176  	if (madvise(ptr, size, MADV_POPULATE_WRITE))
   177  		failmsg("populate madvise failed", "ptr=%p size=%zu", ptr, size);
   178  }
   179  
   180  // TouchMemory prefaults non-anon shared memory.
   181  static void TouchMemory(void* ptr, size_t size)
   182  {
   183  	size_t const kPageSize = getpagesize();
   184  	for (size_t i = 0; i < size; i += kPageSize)
   185  		(void)((volatile char*)ptr)[i];
   186  }
   187  
   188  #if SYZ_EXECUTOR_USES_FORK_SERVER
   189  static void SnapshotPrepareParent()
   190  {
   191  	// This allows access to the output region.
   192  	CoverAccessScope scope(nullptr);
   193  	TouchMemory((char*)output_data + output_size - kOutputPopulate, kOutputPopulate);
   194  	// Notify SnapshotStart that we finished prefaulting memory in the parent.
   195  	output_data->completed = 1;
   196  	// Wait for the request to come, so that we give it full time slice to execute.
   197  	// This process will start waiting for the child as soon as we return.
   198  	while (ivs.hdr->state != rpc::SnapshotState::Execute)
   199  		;
   200  }
   201  #endif
   202  
   203  static void SnapshotStart()
   204  {
   205  	debug("SnapshotStart\n");
   206  	CoverAccessScope scope(nullptr);
   207  	// Prefault as much memory as we can before the snapshot is taken.
   208  	// Also pre-create some threads and let them block.
   209  	// This is intended to make execution after each snapshot restore faster,
   210  	// as we won't need to do that duplicate work again and again.
   211  	flag_threaded = true;
   212  	for (size_t i = 0; i < kThreadsPopulate; i++) {
   213  		thread_t* th = &threads[i];
   214  		thread_create(th, i, flag_coverage);
   215  		if (flag_coverage)
   216  			PopulateMemory(th->cov.data, kCoveragePopulate);
   217  	}
   218  	TouchMemory((char*)output_data + output_size - kOutputPopulate, kOutputPopulate);
   219  	TouchMemory(ivs.input, kInputPopulate);
   220  	PopulateMemory(&flag_coverage, kGlobalsPopulate);
   221  	PopulateMemory((void*)SYZ_DATA_OFFSET, kDataPopulate);
   222  	sleep_ms(100); // let threads start and block
   223  	// Wait for the parent process to prefault as well.
   224  	while (!output_data->completed)
   225  		sleep_ms(1);
   226  	// Notify host that we are ready to be snapshotted.
   227  	SnapshotSetState(rpc::SnapshotState::Ready);
   228  	// Snapshot is restored here.
   229  	// First time we may loop here while the snapshot is taken,
   230  	// but afterwards we should be restored when the state is already Execute.
   231  	// Note: we don't use sleep in the loop because we may be snapshotted while in the sleep syscall.
   232  	// As the result each execution after snapshot restore will be slower as it will need to finish
   233  	// the sleep and return from the syscall.
   234  	while (ivs.hdr->state == rpc::SnapshotState::Ready)
   235  		;
   236  	if (ivs.hdr->state == rpc::SnapshotState::Snapshotted) {
   237  		// First time around, just acknowledge and wait for snapshot restart.
   238  		SnapshotSetState(rpc::SnapshotState::Executed);
   239  		for (;;)
   240  			sleep(1000);
   241  	}
   242  	// Resumed for program execution.
   243  	output_data->Reset();
   244  	auto msg = flatbuffers::GetRoot<rpc::SnapshotRequest>(ivs.input);
   245  	execute_req req = {
   246  	    .magic = kInMagic,
   247  	    .id = 0,
   248  	    .type = rpc::RequestType::Program,
   249  	    .exec_flags = static_cast<uint64>(msg->exec_flags()),
   250  	    .all_call_signal = msg->all_call_signal(),
   251  	    .all_extra_signal = msg->all_extra_signal(),
   252  	};
   253  	parse_execute(req);
   254  	output_data->num_calls.store(msg->num_calls(), std::memory_order_relaxed);
   255  	input_data = const_cast<uint8*>(msg->prog_data()->Data());
   256  }
   257  
   258  NORETURN static void SnapshotDone(bool failed)
   259  {
   260  	debug("SnapshotDone\n");
   261  	CoverAccessScope scope(nullptr);
   262  	uint32 num_calls = output_data->num_calls.load(std::memory_order_relaxed);
   263  	auto data = finish_output(output_data, 0, 0, num_calls, 0, 0, failed ? kFailStatus : 0, false, nullptr);
   264  	ivs.hdr->output_offset = data.data() - reinterpret_cast<volatile uint8_t*>(ivs.hdr);
   265  	ivs.hdr->output_size = data.size();
   266  	SnapshotSetState(failed ? rpc::SnapshotState::Failed : rpc::SnapshotState::Executed);
   267  	// Wait to be restarted from the snapshot.
   268  	for (;;)
   269  		sleep(1000);
   270  }