Measuring userfaultfd page-fault latency
The userfaultfd feature in the Linux kernel allows userspace to handle page faults and some other memory management tasks. For example, a missing page can be handled by paging it in from a remote source, or pages can be write-protected so that write events are handled in userspace. The initial user of this feature is QEMU post-copy live migration, where a live VM running on a destination node demand-pages guest memory in while QEMU handles the network transfer.
The feature hasn’t been around for long, and there isn’t a whole lot of information out there. I predict that some of the most interesting system software yet to be written will utilize userfaultfd in novel ways.
I was curious what sort of page fault latency to expect. The following program is effectively a hello world userfaultfd program. It allocates a bunch of pages and then reads from each page. A separate thread handles the page faults, and we record the latency of each memory access. I ran this on an idle x86 desktop, generating 100K page faults; the average latency was 7.8 microseconds.
Here are a few resources that were very helpful.
- https://www.cons.org/cracauer/cracauer-userfaultfd.html
- https://github.com/torvalds/linux/blob/master/Documentation/vm/userfaultfd.txt
- https://github.com/torvalds/linux/blob/master/include/uapi/linux/userfaultfd.h
- https://www.youtube.com/watch?v=xhOBc5L_Hkk
#include <linux/userfaultfd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <stdatomic.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>
#include <unistd.h>
#include <pthread.h>
#include <poll.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <time.h>
// Shutdown flag shared between threads: main() sets it to 1 once every page
// has been touched, and the handler thread returns when it observes a
// non-zero value. Atomic (rather than volatile) so the cross-thread
// read/write is not a data race.
static atomic_int stop;
// Arguments passed to the fault-handling thread.
struct params {
    int  uffd;       // userfaultfd descriptor the thread polls and reads
    long page_size;  // page size in bytes; used as the UFFDIO_COPY length
};
// Returns the current CLOCK_MONOTONIC reading expressed in nanoseconds.
// A clock_gettime() failure trips an assertion.
static inline uint64_t getns(void)
{
    struct timespec now;
    const int rc = clock_gettime(CLOCK_MONOTONIC, &now);
    assert(rc == 0);

    // seconds scaled to nanoseconds, plus the sub-second remainder
    uint64_t ns = (uint64_t)now.tv_sec;
    ns *= 1000000000ULL;
    ns += now.tv_nsec;
    return ns;
}
// Returns the system page size in bytes as reported by sysconf().
// If sysconf() returns -1 a diagnostic is printed and the process exits
// with status 1.
static long get_page_size(void)
{
    const long size = sysconf(_SC_PAGESIZE);

    if (size != -1) {
        assert(size > 0);
        return size;
    }

    perror("sysconf/pagesize");
    exit(1);
}
// Fault-handling thread body.
//
// arg points to a struct params holding the userfaultfd descriptor and the
// page size. The thread polls the descriptor with a 2 second timeout so that
// the global `stop` flag is re-checked regularly, and resolves every
// page-fault event by copying one zero-filled page to the faulting page with
// UFFDIO_COPY.
//
// Returns NULL once `stop` is observed set. Any unrecoverable error prints a
// diagnostic and exits the process with status 1.
static void *handler(void *arg)
{
    struct params *p = arg;
    long page_size = p->page_size;

    // Source page for UFFDIO_COPY. Zeroed so the pages handed to the faulting
    // thread have defined contents instead of whatever was on this stack.
    char buf[page_size];
    memset(buf, 0, (size_t)page_size);

    for (;;) {
        struct uffd_msg msg;
        struct pollfd pollfd[1];
        pollfd[0].fd = p->uffd;
        pollfd[0].events = POLLIN;
        pollfd[0].revents = 0;

        // wait for a userfaultfd event to occur
        int pollres = poll(pollfd, 1, 2000);
        if (stop)
            return NULL;

        switch (pollres) {
        case -1:
            perror("poll/userfaultfd");
            continue;
        case 0:
            // timeout: loop around to re-check the stop flag
            continue;
        case 1:
            break;
        default:
            fprintf(stderr, "unexpected poll result\n");
            exit(1);
        }

        if (pollfd[0].revents & POLLERR) {
            fprintf(stderr, "pollerr\n");
            exit(1);
        }
        // parenthesized: `!revents & POLLIN` would negate revents first
        if (!(pollfd[0].revents & POLLIN)) {
            continue;
        }

        ssize_t readres = read(p->uffd, &msg, sizeof(msg));
        if (readres == -1) {
            // the descriptor is non-blocking; nothing to read right now
            if (errno == EAGAIN)
                continue;
            perror("read/userfaultfd");
            exit(1);
        }
        if ((size_t)readres != sizeof(msg)) {
            fprintf(stderr, "invalid msg size\n");
            exit(1);
        }

        // Event codes are enumerated values, not bit flags, so compare for
        // equality; anything other than a page fault is ignored.
        if (msg.event != UFFD_EVENT_PAGEFAULT) {
            continue;
        }

        // handle the page fault by copying a page worth of bytes; the
        // destination is rounded down to the start of the faulting page
        // because UFFDIO_COPY requires a page-aligned dst
        struct uffdio_copy copy = {
            .src = (uintptr_t)buf,
            .dst = msg.arg.pagefault.address & ~((uint64_t)page_size - 1),
            .len = (uint64_t)page_size,
            .mode = 0,
        };
        if (ioctl(p->uffd, UFFDIO_COPY, &copy) == -1) {
            perror("ioctl/copy");
            exit(1);
        }
    }
    return NULL;
}
int main(int argc, char **argv)
{
int uffd;
long page_size;
long num_pages;
void *region;
pthread_t uffd_thread;
page_size = get_page_size();
num_pages = 100000;
// open the userfault fd
uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
if (uffd == -1) {
perror("syscall/userfaultfd");
exit(1);
}
// enable for api version and check features
struct uffdio_api uffdio_api;
uffdio_api.api = UFFD_API;
uffdio_api.features = 0;
if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) {
perror("ioctl/uffdio_api");
exit(1);
}
if (uffdio_api.api != UFFD_API) {
fprintf(stderr, "unsupported userfaultfd api\n");
exit(1);
}
// allocate a memory region to be managed by userfaultfd
region = mmap(NULL, page_size * num_pages, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
if (region == MAP_FAILED) {
perror("mmap");
exit(1);
}
// register the pages in the region for missing callbacks
struct uffdio_register uffdio_register;
uffdio_register.range.start = (unsigned long)region;
uffdio_register.range.len = page_size * num_pages;
uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) {
perror("ioctl/uffdio_register");
exit(1);
}
if ((uffdio_register.ioctls & UFFD_API_RANGE_IOCTLS) !=
UFFD_API_RANGE_IOCTLS) {
fprintf(stderr, "unexpected userfaultfd ioctl set\n");
exit(1);
}
// start the thread that will handle userfaultfd events
stop = 0;
struct params p;
p.uffd = uffd;
p.page_size = page_size;
pthread_create(&uffd_thread, NULL, handler, &p);
sleep(1);
// track the latencies for each page
uint64_t *latencies = malloc(sizeof(uint64_t) * num_pages);
assert(latencies);
memset(latencies, 0, sizeof(uint64_t) * num_pages);
// touch each page in the region
int value;
char *cur = region;
for (long i = 0; i < num_pages; i++) {
uint64_t start = getns();
int v = *((int*)cur);
uint64_t dur = getns() - start;
latencies[i] = dur;
value += v;
cur += page_size;
}
stop = 1;
pthread_join(uffd_thread, NULL);
if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
fprintf(stderr, "ioctl unregister failure\n");
return 1;
}
for (long i = 0; i < num_pages; i++) {
fprintf(stdout, "%llu\n", (unsigned long long)latencies[i]);
}
free(latencies);
munmap(region, page_size * num_pages);
return 0;
}