qemu/kvm架构
cpu虚拟化的示例
Reference: kvmtest.c [LWN.net]
主要步骤:
- QEMU通过/dev/kvm设备文件发起KVM_CREATE_VM ioctl,请求KVM创建一个虚拟机。KVM创建虚拟机相应的结构体,并为QEMU返回一个虚拟机文件描述符。
- QEMU通过虚拟机文件描述符发起KVM_CREATE_VCPU ioctl,请求KVM创建一个vCPU。KVM创建vCPU相应的结构体并初始化,返回一个vCPU文件描述符。
- QEMU通过vCPU文件描述符发起KVM_RUN ioctl,vCPU线程执行VMLAUNCH指令进入非根模式,执行虚拟机代码直至发生VM-Exit。
- KVM根据VM-Exit的原因进行相应处理,如果与IO有关,则需要进一步返回到QEMU中进行处理。
运行结果(虚拟机代码先计算 2+2 并通过 0x3f8 端口输出字符 `4`,再输出 `Hello`,最后执行 hlt 时宿主程序打印 `KVM_EXIT_HLT` 并退出):
代码实现:
/* Sample code for /dev/kvm API */#include <err.h>
#include <fcntl.h>
#include <linux/kvm.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>int main(void)
{int kvm, vmfd, vcpufd, ret;const uint8_t code[] = {/* 写入指定端口 0x3f8,输出 Hello */0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */0x00, 0xd8, /* add %bl, %al */0x04, '0', /* add $'0', %al */0xee, /* out %al, (%dx) */0xb0, '\n', /* mov $'\n', %al */0xee, /* out %al, (%dx) */0xb0, 'H', /* mov $'H', %al */0xee, /* out %al, (%dx) */0xb0, 'e', /* mov $'e', %al */0xee, /* out %al, (%dx) */0xb0, 'l', /* mov $'l', %al */0xee, /* out %al, (%dx) */0xb0, 'l', /* mov $'l', %al */0xee, /* out %al, (%dx) */0xb0, 'o', /* mov $'o', %al */0xee, /* out %al, (%dx) */0xb0, '\n', /* mov $'\n', %al */0xee, /* out %al, (%dx) */0xf4, /* hlt */};uint8_t *mem;struct kvm_sregs sregs;size_t mmap_size;struct kvm_run *run;// ** step 1. 打开 KVM 模块设备文件kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);if (kvm == -1)err(1, "/dev/kvm");// 获取 KVM API 版本/* Make sure we have the stable version of the API */ret = ioctl(kvm, KVM_GET_API_VERSION, NULL);if (ret == -1)err(1, "KVM_GET_API_VERSION");if (ret != 12)errx(1, "KVM_GET_API_VERSION %d, expected 12", ret);// ** step 2. KVM_CREATE_VM 创建虚拟机获得虚拟机文件描述符vmfd = ioctl(kvm, KVM_CREATE_VM, (unsigned long)0);if (vmfd == -1)err(1, "KVM_CREATE_VM");// 分配 4KB 内存空间存放二进制代码// 这里的 0x1000(HVA)/* Allocate one aligned page of guest memory to hold the code. */mem = mmap(NULL, 0x1000, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);if (!mem)err(1, "allocating guest memory");// 将二进制代码复制至分配的内存页中memcpy(mem, code, sizeof(code));// KVM_SET_USER_MEMORY_REGION 将该内存页映射至虚拟机物理地址 0x1000(GPA) 处/* Map it to the second page frame (to avoid the real-mode IDT at 0). */struct kvm_userspace_memory_region region = {.slot = 0,.guest_phys_addr = 0x1000,.memory_size = 0x1000,.userspace_addr = (uint64_t)mem,};ret = ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, ®ion);if (ret == -1)err(1, "KVM_SET_USER_MEMORY_REGION");// ** step 3. KVM_CREATE_VCPU 创建 vCPU 获得 vCPU 文件描述符vcpufd = ioctl(vmfd, KVM_CREATE_VCPU, (unsigned long)0);if (vcpufd == -1)err(1, "KVM_CREATE_VCPU");// ** step 4. 
获取 QEMU/KVM 共享内存空间大小,并映射 kvm_run 结构体/* Map the shared kvm_run structure and following data. */ret = ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, NULL);if (ret == -1)err(1, "KVM_GET_VCPU_MMAP_SIZE");mmap_size = ret;if (mmap_size < sizeof(*run))errx(1, "KVM_GET_VCPU_MMAP_SIZE unexpectedly small");// 使用 vCPU 文件描述符// 映射 kvm_run 结构体run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpufd, 0);if (!run)err(1, "mmap vcpu");// ** step 5. 设置 CS 寄存器和 RIP 寄存器,使得 vCPU 从 0x1000 处开始执行/* Initialize CS to point at 0, via a read-modify-write of sregs. */ret = ioctl(vcpufd, KVM_GET_SREGS, &sregs);if (ret == -1)err(1, "KVM_GET_SREGS");sregs.cs.base = 0;sregs.cs.selector = 0;ret = ioctl(vcpufd, KVM_SET_SREGS, &sregs);if (ret == -1)err(1, "KVM_SET_SREGS");/* Initialize registers: instruction pointer for our code, addends, and* initial flags required by x86 architecture. */struct kvm_regs regs = {.rip = 0x1000,.rax = 2,.rbx = 2,.rflags = 0x2,};ret = ioctl(vcpufd, KVM_SET_REGS, ®s);if (ret == -1)err(1, "KVM_SET_REGS");/* Repeatedly run code and handle VM exits. */while (1) {// ** step 6. KVM_RUN 运行 vCPUret = ioctl(vcpufd, KVM_RUN, NULL);if (ret == -1)err(1, "KVM_RUN");// ** step 7. 处理 VM-Exitswitch (run->exit_reason) {case KVM_EXIT_HLT: // hlt 指令触发 VM-Exitputs("KVM_EXIT_HLT");return 0; // 退出程序case KVM_EXIT_IO: // 依次调用 out 指令向 0x3f8 端口写入字符时,会触发 VM-Exit,使得程序返回到用户态处理// 输出写入 0x3f8 端口的字符if (run->io.direction == KVM_EXIT_IO_OUT && run->io.size == 1 && run->io.port == 0x3f8 && run->io.count == 1)// 调用 putchar 函数输出字符putchar(*(((char *)run) + run->io.data_offset));elseerrx(1, "unhandled KVM_EXIT_IO");break;case KVM_EXIT_FAIL_ENTRY:errx(1, "KVM_EXIT_FAIL_ENTRY: hardware_entry_failure_reason = 0x%llx",(unsigned long long)run->fail_entry.hardware_entry_failure_reason);case KVM_EXIT_INTERNAL_ERROR:errx(1, "KVM_EXIT_INTERNAL_ERROR: suberror = 0x%x", run->internal.suberror);default:errx(1, "exit_reason = 0x%x", run->exit_reason);}}
}
KVM API
/usr/include/linux/kvm.h
| ioctl | KVM API | Description | Example |
|---|---|---|---|
| ioctls for /dev/kvm fds | KVM_GET_API_VERSION | 获取 KVM API 版本 | `kvm = open("/dev/kvm", O_RDWR \| O_CLOEXEC)`<br>`ret = ioctl(kvm, KVM_GET_API_VERSION, NULL)` |
| ioctls for /dev/kvm fds | KVM_CREATE_VM | 创建虚拟机获得虚拟机文件描述符 | `vmfd = ioctl(kvm, KVM_CREATE_VM, 0)` |
| ioctls for VM fds | KVM_SET_USER_MEMORY_REGION | 将内存页映射至虚拟机物理地址处 | `ret = ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &region)` |
| ioctls for VM fds | KVM_CREATE_VCPU | 创建 vCPU 获得 vCPU 文件描述符 | `vcpufd = ioctl(vmfd, KVM_CREATE_VCPU, 0)` |
| ioctls for /dev/kvm fds | KVM_GET_VCPU_MMAP_SIZE | 获取 QEMU/KVM 共享内存空间大小(注意该 ioctl 作用于 /dev/kvm 文件描述符) | `mmap_size = ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, NULL)`<br>使用 vCPU 文件描述符,映射 kvm_run 结构体:<br>`run = mmap(NULL, mmap_size, PROT_READ \| PROT_WRITE, MAP_SHARED, vcpufd, 0)` |
| ioctls for vcpu fds | KVM_GET_SREGS | 获取 CS 等特殊寄存器 | `ret = ioctl(vcpufd, KVM_GET_SREGS, &sregs)` |
| ioctls for vcpu fds | KVM_SET_SREGS | 设置 CS 等特殊寄存器(RIP 等通用寄存器通过 KVM_SET_REGS 设置) | `sregs.cs.base = 0`<br>`sregs.cs.selector = 0`<br>`ret = ioctl(vcpufd, KVM_SET_SREGS, &sregs)`<br>`struct kvm_regs regs = { .rip = 0x1000, .rax = 2, .rbx = 2, .rflags = 0x2, }`<br>`ret = ioctl(vcpufd, KVM_SET_REGS, &regs)` |
| ioctls for vcpu fds | KVM_RUN | 运行 vCPU | `ret = ioctl(vcpufd, KVM_RUN, NULL)` |