Series Table of Contents
Chapter 16: Preconfiguration Analysis of QEMU System Emulation
Article Contents
- Series Table of Contents
- Chapter 16: Preconfiguration Analysis of QEMU System Emulation
- Preface
- I. What is QEMU?
- II. Startup Analysis of QEMU System Emulation
- 1. System emulation initialization code
- 2. Main loop data initialization
- 3. qmp_x_exit_preconfig()
- qemu_init_board()
- qemu_create_cli_devices()
- qemu_machine_creation_done()
- load_snapshot()
- replay_vmstate_init()
- qmp_migrate_incoming()
- qmp_cont()
- Summary
Preface
Taking QEMU 8.2.2 as an example, this article analyzes its startup process as a system emulation tool and presents various startup configuration examples for QEMU system emulation.
Readers are expected to have some hands-on experience with QEMU system emulation and a working knowledge of C.
I. What is QEMU?
QEMU is a generic and open source machine emulator and virtualizer.
Its official homepage is: https://www.qemu.org/
II. Startup Analysis of QEMU System Emulation
1. System emulation initialization code
As a system emulation tool, QEMU's entry code lives in system/main.c, and the initialization function qemu_init() is implemented in system/vl.c. After setting up the virtual machine's exported information, qemu_init() proceeds to the preconfiguration work; this article analyzes that part of the code.
2. Main loop data initialization
This code is in system/vl.c and is implemented as follows:
void qemu_init(int argc, char **argv)
{
    ...
    if (!preconfig_requested) {
        qmp_x_exit_preconfig(&error_fatal);
    }
    ...
}
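For context: preconfig_requested is set while parsing the command line when the user passes --preconfig, in which case QEMU pauses before machine creation until the QMP monitor issues x-exit-preconfig. The following is a minimal standalone sketch of that decision; everything in it is local to the sketch, not a QEMU API:

#include <stdbool.h>
#include <stdio.h>

static bool preconfig_requested;

static void exit_preconfig(void)
{
    printf("board init, CLI devices, machine-creation-done...\n");
}

int main(int argc, char **argv)
{
    preconfig_requested = (argc > 1);   /* stands in for parsing --preconfig */
    if (!preconfig_requested) {
        exit_preconfig();               /* the qmp_x_exit_preconfig() call */
    } else {
        printf("waiting for x-exit-preconfig from the QMP monitor\n");
    }
    return 0;
}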
3. qmp_x_exit_preconfig()
This function lives in system/vl.c and is defined as follows:
void qmp_x_exit_preconfig(Error **errp)
{
    if (phase_check(PHASE_MACHINE_INITIALIZED)) {
        error_setg(errp, "The command is permitted only before machine initialization");
        return;
    }

    qemu_init_board();
    qemu_create_cli_devices();
    qemu_machine_creation_done();

    if (loadvm) {
        load_snapshot(loadvm, NULL, false, NULL, &error_fatal);
    }
    if (replay_mode != REPLAY_MODE_NONE) {
        replay_vmstate_init();
    }

    if (incoming) {
        Error *local_err = NULL;
        if (strcmp(incoming, "defer") != 0) {
            qmp_migrate_incoming(incoming, false, NULL, &local_err);
            if (local_err) {
                error_reportf_err(local_err, "-incoming %s: ", incoming);
                exit(1);
            }
        }
    } else if (autostart) {
        qmp_cont(NULL);
    }
}
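The phase_check()/phase_advance() pair used above tracks a single, monotonically advancing machine-initialization phase. A minimal sketch of that mechanism, with the enum values assumed from the QEMU 8.2 sources (hw/core/qdev.c) and simplified here:

#include <assert.h>
#include <stdbool.h>

typedef enum MachineInitPhase {
    PHASE_NO_MACHINE,
    PHASE_MACHINE_CREATED,
    PHASE_ACCEL_CREATED,
    PHASE_MACHINE_INITIALIZED,
    PHASE_MACHINE_READY,
} MachineInitPhase;

static MachineInitPhase machine_phase;

bool phase_check(MachineInitPhase phase)
{
    return machine_phase >= phase;
}

void phase_advance(MachineInitPhase phase)
{
    /* phases may only advance, one step at a time */
    assert(machine_phase == phase - 1);
    machine_phase = phase;
}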
qemu_init_board()
The function qemu_init_board() is defined as follows:
static void qemu_init_board(void)
{
    /* process plugin before CPUs are created, but once -smp has been parsed */
    qemu_plugin_load_list(&plugin_list, &error_fatal);

    /* From here on we enter MACHINE_PHASE_INITIALIZED. */
    machine_run_board_init(current_machine, mem_path, &error_fatal);

    drive_check_orphaned();

    realtime_init();
}
The function machine_run_board_init() is defined as follows:
void machine_run_board_init(MachineState *machine, const char *mem_path, Error **errp)
{
    ERRP_GUARD();
    MachineClass *machine_class = MACHINE_GET_CLASS(machine);
    ObjectClass *oc = object_class_by_name(machine->cpu_type);
    CPUClass *cc;

    /* This checkpoint is required by replay to separate prior clock
       reading from the other reads, because timer polling functions query
       clock values from the log. */
    replay_checkpoint(CHECKPOINT_INIT);

    if (!xen_enabled()) {
        /* On 32-bit hosts, QEMU is limited by virtual address space */
        if (machine->ram_size > (2047 << 20) && HOST_LONG_BITS == 32) {
            error_setg(errp, "at most 2047 MB RAM can be simulated");
            return;
        }
    }

    if (machine->memdev) {
        ram_addr_t backend_size = object_property_get_uint(OBJECT(machine->memdev),
                                                           "size", &error_abort);
        if (backend_size != machine->ram_size) {
            error_setg(errp, "Machine memory size does not match the size of the memory backend");
            return;
        }
    } else if (machine_class->default_ram_id && machine->ram_size &&
               numa_uses_legacy_mem()) {
        if (object_property_find(object_get_objects_root(),
                                 machine_class->default_ram_id)) {
            error_setg(errp, "object's id '%s' is reserved for the default"
                " RAM backend, it can't be used for any other purposes",
                machine_class->default_ram_id);
            error_append_hint(errp,
                "Change the object's 'id' to something else or disable"
                " automatic creation of the default RAM backend by setting"
                " 'memory-backend=%s' with '-machine'.\n",
                machine_class->default_ram_id);
            return;
        }
        if (!create_default_memdev(current_machine, mem_path, errp)) {
            return;
        }
    }

    if (machine->numa_state) {
        numa_complete_configuration(machine);
        if (machine->numa_state->num_nodes) {
            machine_numa_finish_cpu_init(machine);
            if (machine_class->cpu_cluster_has_numa_boundary) {
                validate_cpu_cluster_to_numa_boundary(machine);
            }
        }
    }

    if (!machine->ram && machine->memdev) {
        machine->ram = machine_consume_memdev(machine, machine->memdev);
    }

    /* If the machine supports the valid_cpu_types check and the user
     * specified a CPU with -cpu check here that the user CPU is supported.
     */
    if (machine_class->valid_cpu_types && machine->cpu_type) {
        int i;

        for (i = 0; machine_class->valid_cpu_types[i]; i++) {
            if (object_class_dynamic_cast(oc,
                                          machine_class->valid_cpu_types[i])) {
                /* The user specified CPU is in the valid field, we are
                 * good to go.
                 */
                break;
            }
        }

        if (!machine_class->valid_cpu_types[i]) {
            /* The user specified CPU is not valid */
            error_report("Invalid CPU type: %s", machine->cpu_type);
            error_printf("The valid types are: %s",
                         machine_class->valid_cpu_types[0]);
            for (i = 1; machine_class->valid_cpu_types[i]; i++) {
                error_printf(", %s", machine_class->valid_cpu_types[i]);
            }
            error_printf("\n");

            exit(1);
        }
    }

    /* Check if CPU type is deprecated and warn if so */
    cc = CPU_CLASS(oc);
    if (cc && cc->deprecation_note) {
        warn_report("CPU model %s is deprecated -- %s", machine->cpu_type,
                    cc->deprecation_note);
    }

    if (machine->cgs) {
        /*
         * With confidential guests, the host can't see the real
         * contents of RAM, so there's no point in it trying to merge
         * areas.
         */
        machine_set_mem_merge(OBJECT(machine), false, &error_abort);

        /*
         * Virtio devices can't count on directly accessing guest
         * memory, so they need iommu_platform=on to use normal DMA
         * mechanisms. That requires also disabling legacy virtio
         * support for those virtio pci devices which allow it.
         */
        object_register_sugar_prop(TYPE_VIRTIO_PCI, "disable-legacy",
                                   "on", true);
        object_register_sugar_prop(TYPE_VIRTIO_DEVICE, "iommu_platform",
                                   "on", false);
    }

    accel_init_interfaces(ACCEL_GET_CLASS(machine->accelerator));
    machine_class->init(machine);
    phase_advance(PHASE_MACHINE_INITIALIZED);
}
The function replay_checkpoint() is defined as follows:
bool replay_checkpoint(ReplayCheckpoint checkpoint)
{
    assert(EVENT_CHECKPOINT + checkpoint <= EVENT_CHECKPOINT_LAST);

    replay_save_instructions();

    if (replay_mode == REPLAY_MODE_PLAY) {
        g_assert(replay_mutex_locked());
        if (replay_next_event_is(EVENT_CHECKPOINT + checkpoint)) {
            replay_finish_event();
        } else {
            return false;
        }
    } else if (replay_mode == REPLAY_MODE_RECORD) {
        g_assert(replay_mutex_locked());
        replay_put_event(EVENT_CHECKPOINT + checkpoint);
    }
    return true;
}
The function replay_save_instructions() is defined as follows:
/*! Saves cached instructions. */
void replay_save_instructions(void)
{
    if (replay_file && replay_mode == REPLAY_MODE_RECORD) {
        g_assert(replay_mutex_locked());
        replay_advance_current_icount(replay_get_current_icount());
    }
}
The function replay_advance_current_icount() is defined as follows:
void replay_advance_current_icount(uint64_t current_icount)
{
    int diff = (int)(current_icount - replay_state.current_icount);

    /* Time can only go forward */
    assert(diff >= 0);

    if (replay_mode == REPLAY_MODE_RECORD) {
        if (diff > 0) {
            replay_put_event(EVENT_INSTRUCTION);
            replay_put_dword(diff);
            replay_state.current_icount += diff;
        }
    } else if (replay_mode == REPLAY_MODE_PLAY) {
        if (diff > 0) {
            replay_state.instruction_count -= diff;
            replay_state.current_icount += diff;
            if (replay_state.instruction_count == 0) {
                assert(replay_state.data_kind == EVENT_INSTRUCTION);
                replay_finish_event();
                /* Wake up iothread. This is required because
                   timers will not expire until clock counters
                   will be read from the log. */
                qemu_notify_event();
            }
        }

        /* Execution reached the break step */
        if (replay_break_icount == replay_state.current_icount) {
            /* Cannot make callback directly from the vCPU thread */
            timer_mod_ns(replay_break_timer,
                         qemu_clock_get_ns(QEMU_CLOCK_REALTIME));
        }
    }
}
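To make the record-side bookkeeping concrete, here is a small standalone demo of the same delta logic. All names below are local to the sketch, not QEMU APIs:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t recorded_icount;

static void sketch_advance(uint64_t current_icount)
{
    int diff = (int)(current_icount - recorded_icount);
    assert(diff >= 0);              /* time only moves forward */
    if (diff > 0) {
        /* QEMU would emit EVENT_INSTRUCTION followed by the delta here */
        printf("log: EVENT_INSTRUCTION +%d\n", diff);
        recorded_icount += diff;
    }
}

int main(void)
{
    sketch_advance(100);            /* log: EVENT_INSTRUCTION +100 */
    sketch_advance(100);            /* no-op: the clock did not advance */
    sketch_advance(250);            /* log: EVENT_INSTRUCTION +150 */
    return 0;
}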
The function create_default_memdev() is defined as follows:
static bool create_default_memdev(MachineState *ms, const char *path, Error **errp)
{
    Object *obj;
    MachineClass *mc = MACHINE_GET_CLASS(ms);
    bool r = false;

    obj = object_new(path ? TYPE_MEMORY_BACKEND_FILE : TYPE_MEMORY_BACKEND_RAM);
    if (path) {
        if (!object_property_set_str(obj, "mem-path", path, errp)) {
            goto out;
        }
    }
    if (!object_property_set_int(obj, "size", ms->ram_size, errp)) {
        goto out;
    }
    object_property_add_child(object_get_objects_root(), mc->default_ram_id,
                              obj);
    /* Ensure backend's memory region name is equal to mc->default_ram_id */
    if (!object_property_set_bool(obj, "x-use-canonical-path-for-ramblock-id",
                                  false, errp)) {
        goto out;
    }
    if (!user_creatable_complete(USER_CREATABLE(obj), errp)) {
        goto out;
    }
    r = object_property_set_link(OBJECT(ms), "memory-backend", obj, errp);

out:
    object_unref(obj);
    return r;
}
The function numa_complete_configuration() is defined as follows:
void numa_complete_configuration(MachineState *ms)
{
    int i;
    MachineClass *mc = MACHINE_GET_CLASS(ms);
    NodeInfo *numa_info = ms->numa_state->nodes;

    /*
     * If memory hotplug is enabled (slot > 0) or memory devices are enabled
     * (ms->maxram_size > ms->ram_size) but without '-numa' options explicitly on
     * CLI, guests will break.
     *
     *   Windows: won't enable memory hotplug without SRAT table at all
     *
     *   Linux: if QEMU is started with initial memory all below 4Gb
     *   and no SRAT table present, guest kernel will use nommu DMA ops,
     *   which breaks 32bit hw drivers when memory is hotplugged and
     *   guest tries to use it with that drivers.
     *
     * Enable NUMA implicitly by adding a new NUMA node automatically.
     *
     * Or if MachineClass::auto_enable_numa is true and no NUMA nodes,
     * assume there is just one node with whole RAM.
     */
    if (ms->numa_state->num_nodes == 0 &&
        ((ms->ram_slots && mc->auto_enable_numa_with_memhp) ||
         (ms->maxram_size > ms->ram_size && mc->auto_enable_numa_with_memdev) ||
          mc->auto_enable_numa)) {
        NumaNodeOptions node = { };
        parse_numa_node(ms, &node, &error_abort);
        numa_info[0].node_mem = ms->ram_size;
    }

    assert(max_numa_nodeid <= MAX_NODES);

    /* No support for sparse NUMA node IDs yet: */
    for (i = max_numa_nodeid - 1; i >= 0; i--) {
        /* Report large node IDs first, to make mistakes easier to spot */
        if (!numa_info[i].present) {
            error_report("numa: Node ID missing: %d", i);
            exit(1);
        }
    }

    /* This must be always true if all nodes are present: */
    assert(ms->numa_state->num_nodes == max_numa_nodeid);

    if (ms->numa_state->num_nodes > 0) {
        uint64_t numa_total;

        numa_total = 0;
        for (i = 0; i < ms->numa_state->num_nodes; i++) {
            numa_total += numa_info[i].node_mem;
        }
        if (numa_total != ms->ram_size) {
            error_report("total memory for NUMA nodes (0x%" PRIx64 ")"
                         " should equal RAM size (0x" RAM_ADDR_FMT ")",
                         numa_total, ms->ram_size);
            exit(1);
        }

        if (!numa_uses_legacy_mem() && mc->default_ram_id) {
            if (ms->memdev) {
                error_report("'-machine memory-backend' and '-numa memdev'"
                             " properties are mutually exclusive");
                exit(1);
            }
            ms->ram = g_new(MemoryRegion, 1);
            memory_region_init(ms->ram, OBJECT(ms), mc->default_ram_id,
                               ms->ram_size);
            numa_init_memdev_container(ms, ms->ram);
        }
        /* QEMU needs at least all unique node pair distances to build
         * the whole NUMA distance table. QEMU treats the distance table
         * as symmetric by default, i.e. distance A->B == distance B->A.
         * Thus, QEMU is able to complete the distance table
         * initialization even though only distance A->B is provided and
         * distance B->A is not. QEMU knows the distance of a node to
         * itself is always 10, so A->A distances may be omitted. When
         * the distances of two nodes of a pair differ, i.e. distance
         * A->B != distance B->A, then that means the distance table is
         * asymmetric. In this case, the distances for both directions
         * of all node pairs are required.
         */
        if (ms->numa_state->have_numa_distance) {
            /* Validate enough NUMA distance information was provided. */
            validate_numa_distance(ms);

            /* Validation succeeded, now fill in any missing distances. */
            complete_init_numa_distance(ms);
        }
    }
}
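The "symmetric by default" distance rule described in the comment above can be illustrated with a toy table: a node's distance to itself is always 10, and if only the A->B direction was supplied, B->A is filled with the same value. Everything below is local to the sketch:

#include <stdio.h>

#define NODES 3

int main(void)
{
    /* 0 marks "not provided"; only one direction of each pair was given */
    int d[NODES][NODES] = { { 0, 21, 31 }, { 0, 0, 41 }, { 0, 0, 0 } };

    for (int i = 0; i < NODES; i++) {
        d[i][i] = 10;                      /* local distance is always 10 */
        for (int j = 0; j < NODES; j++) {
            if (d[i][j] == 0 && d[j][i] != 0) {
                d[i][j] = d[j][i];         /* mirror the provided direction */
            }
        }
    }

    for (int i = 0; i < NODES; i++) {
        for (int j = 0; j < NODES; j++) {
            printf("%3d%c", d[i][j], j == NODES - 1 ? '\n' : ' ');
        }
    }
    return 0;
}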
The function machine_numa_finish_cpu_init() is defined as follows:
static void machine_numa_finish_cpu_init(MachineState *machine)
{
    int i;
    bool default_mapping;
    GString *s = g_string_new(NULL);
    MachineClass *mc = MACHINE_GET_CLASS(machine);
    const CPUArchIdList *possible_cpus = mc->possible_cpu_arch_ids(machine);

    assert(machine->numa_state->num_nodes);
    for (i = 0; i < possible_cpus->len; i++) {
        if (possible_cpus->cpus[i].props.has_node_id) {
            break;
        }
    }
    default_mapping = (i == possible_cpus->len);

    for (i = 0; i < possible_cpus->len; i++) {
        const CPUArchId *cpu_slot = &possible_cpus->cpus[i];

        if (!cpu_slot->props.has_node_id) {
            /* fetch default mapping from board and enable it */
            CpuInstanceProperties props = cpu_slot->props;

            props.node_id = mc->get_default_cpu_node_id(machine, i);
            if (!default_mapping) {
                /* record slots with not set mapping,
                 * TODO: make it hard error in future */
                char *cpu_str = cpu_slot_to_string(cpu_slot);
                g_string_append_printf(s, "%sCPU %d [%s]",
                                       s->len ? ", " : "", i, cpu_str);
                g_free(cpu_str);

                /* non mapped cpus used to fallback to node 0 */
                props.node_id = 0;
            }

            props.has_node_id = true;
            machine_set_cpu_numa_node(machine, &props, &error_fatal);
        }
    }

    if (machine->numa_state->hmat_enabled) {
        numa_validate_initiator(machine->numa_state);
    }

    if (s->len && !qtest_enabled()) {
        warn_report("CPU(s) not present in any NUMA nodes: %s",
                    s->str);
        warn_report("All CPU(s) up to maxcpus should be described "
                    "in NUMA config, ability to start up with partial NUMA "
                    "mappings is obsoleted and will be removed in future");
    }
    g_string_free(s, true);
}
The function machine_consume_memdev() is defined as follows:
MemoryRegion *machine_consume_memdev(MachineState *machine,
                                     HostMemoryBackend *backend)
{
    MemoryRegion *ret = host_memory_backend_get_memory(backend);

    if (host_memory_backend_is_mapped(backend)) {
        error_report("memory backend %s can't be used multiple times.",
                     object_get_canonical_path_component(OBJECT(backend)));
        exit(EXIT_FAILURE);
    }
    host_memory_backend_set_mapped(backend, true);
    vmstate_register_ram_global(ret);
    return ret;
}
The function host_memory_backend_get_memory() is defined as follows:
bool host_memory_backend_mr_inited(HostMemoryBackend *backend)
{
    /*
     * NOTE: We forbid zero-length memory backend, so here zero means
     * "we haven't inited the backend memory region yet".
     */
    return memory_region_size(&backend->mr) != 0;
}

MemoryRegion *host_memory_backend_get_memory(HostMemoryBackend *backend)
{
    return host_memory_backend_mr_inited(backend) ? &backend->mr : NULL;
}
The function vmstate_register_ram_global() is defined as follows:
void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev)
{
    qemu_ram_set_idstr(mr->ram_block,
                       memory_region_name(mr), dev);
    qemu_ram_set_migratable(mr->ram_block);
}

void vmstate_unregister_ram(MemoryRegion *mr, DeviceState *dev)
{
    qemu_ram_unset_idstr(mr->ram_block);
    qemu_ram_unset_migratable(mr->ram_block);
}

void vmstate_register_ram_global(MemoryRegion *mr)
{
    vmstate_register_ram(mr, NULL);
}
The function accel_init_interfaces() is defined as follows:
static void accel_init_cpu_int_aux(ObjectClass *klass, void *opaque)
{
    CPUClass *cc = CPU_CLASS(klass);
    AccelCPUClass *accel_cpu = opaque;

    /*
     * The first callback allows accel-cpu to run initializations
     * for the CPU, customizing CPU behavior according to the accelerator.
     *
     * The second one allows the CPU to customize the accel-cpu
     * behavior according to the CPU.
     *
     * The second is currently only used by TCG, to specialize the
     * TCGCPUOps depending on the CPU type.
     */
    cc->accel_cpu = accel_cpu;
    if (accel_cpu->cpu_class_init) {
        accel_cpu->cpu_class_init(cc);
    }
    if (cc->init_accel_cpu) {
        cc->init_accel_cpu(accel_cpu, cc);
    }
}

/* initialize the arch-specific accel CpuClass interfaces */
static void accel_init_cpu_interfaces(AccelClass *ac)
{
    const char *ac_name; /* AccelClass name */
    char *acc_name;      /* AccelCPUClass name */
    ObjectClass *acc;    /* AccelCPUClass */

    ac_name = object_class_get_name(OBJECT_CLASS(ac));
    g_assert(ac_name != NULL);

    acc_name = g_strdup_printf("%s-%s", ac_name, CPU_RESOLVING_TYPE);
    acc = object_class_by_name(acc_name);
    g_free(acc_name);

    if (acc) {
        object_class_foreach(accel_init_cpu_int_aux,
                             CPU_RESOLVING_TYPE, false, acc);
    }
}

void accel_init_interfaces(AccelClass *ac)
{
#ifndef CONFIG_USER_ONLY
    accel_init_ops_interfaces(ac);
#endif /* !CONFIG_USER_ONLY */

    accel_init_cpu_interfaces(ac);
}
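The class lookup above relies on a naming convention: the AccelCPUClass type name is the AccelClass name joined to CPU_RESOLVING_TYPE with a dash. A standalone illustration of that string composition; the concrete names below are assumptions for an x86_64 TCG build:

#include <glib.h>
#include <stdio.h>

int main(void)
{
    /* assumed values: AccelClass name and CPU_RESOLVING_TYPE for x86_64 TCG */
    const char *ac_name = "tcg-accel";
    const char *cpu_resolving_type = "x86_64-cpu";

    /* same composition as accel_init_cpu_interfaces() */
    char *acc_name = g_strdup_printf("%s-%s", ac_name, cpu_resolving_type);
    printf("AccelCPUClass looked up: %s\n", acc_name);  /* tcg-accel-x86_64-cpu */
    g_free(acc_name);
    return 0;
}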
CPU class initialization differs with the CPU type. For x86's "max" CPU model, the implementation is max_x86_cpu_class_init(), defined (together with its realize hook) as follows:
static void max_x86_cpu_realize(DeviceState *dev, Error **errp)
{
    Object *obj = OBJECT(dev);

    if (!object_property_get_int(obj, "family", &error_abort)) {
        if (X86_CPU(obj)->env.features[FEAT_8000_0001_EDX] & CPUID_EXT2_LM) {
            object_property_set_int(obj, "family", 15, &error_abort);
            object_property_set_int(obj, "model", 107, &error_abort);
            object_property_set_int(obj, "stepping", 1, &error_abort);
        } else {
            object_property_set_int(obj, "family", 6, &error_abort);
            object_property_set_int(obj, "model", 6, &error_abort);
            object_property_set_int(obj, "stepping", 3, &error_abort);
        }
    }

    x86_cpu_realizefn(dev, errp);
}

static void max_x86_cpu_class_init(ObjectClass *oc, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(oc);
    X86CPUClass *xcc = X86_CPU_CLASS(oc);

    xcc->ordering = 9;

    xcc->model_description =
        "Enables all features supported by the accelerator in the current host";

    device_class_set_props(dc, max_x86_cpu_properties);
    dc->realize = max_x86_cpu_realize;
}
The function x86_cpu_realizefn() is defined as follows:
static void x86_cpu_realizefn(DeviceState *dev, Error **errp)
{
    CPUState *cs = CPU(dev);
    X86CPU *cpu = X86_CPU(dev);
    X86CPUClass *xcc = X86_CPU_GET_CLASS(dev);
    CPUX86State *env = &cpu->env;
    Error *local_err = NULL;
    static bool ht_warned;
    unsigned requested_lbr_fmt;

    /* Use pc-relative instructions in system-mode */
#ifndef CONFIG_USER_ONLY
    cs->tcg_cflags |= CF_PCREL;
#endif

    if (cpu->apic_id == UNASSIGNED_APIC_ID) {
        error_setg(errp, "apic-id property was not initialized properly");
        return;
    }

    /*
     * Process Hyper-V enlightenments.
     * Note: this currently has to happen before the expansion of CPU features.
     */
    x86_cpu_hyperv_realize(cpu);

    x86_cpu_expand_features(cpu, &local_err);
    if (local_err) {
        goto out;
    }

    /*
     * Override env->features[FEAT_PERF_CAPABILITIES].LBR_FMT
     * with user-provided setting.
     */
    if (cpu->lbr_fmt != ~PERF_CAP_LBR_FMT) {
        if ((cpu->lbr_fmt & PERF_CAP_LBR_FMT) != cpu->lbr_fmt) {
            error_setg(errp, "invalid lbr-fmt");
            return;
        }
        env->features[FEAT_PERF_CAPABILITIES] &= ~PERF_CAP_LBR_FMT;
        env->features[FEAT_PERF_CAPABILITIES] |= cpu->lbr_fmt;
    }

    /*
     * vPMU LBR is supported when 1) KVM is enabled 2) Option pmu=on and
     * 3) vPMU LBR format matches that of host setting.
     */
    requested_lbr_fmt =
        env->features[FEAT_PERF_CAPABILITIES] & PERF_CAP_LBR_FMT;
    if (requested_lbr_fmt && kvm_enabled()) {
        uint64_t host_perf_cap =
            x86_cpu_get_supported_feature_word(FEAT_PERF_CAPABILITIES, false);
        unsigned host_lbr_fmt = host_perf_cap & PERF_CAP_LBR_FMT;

        if (!cpu->enable_pmu) {
            error_setg(errp, "vPMU: LBR is unsupported without pmu=on");
            return;
        }
        if (requested_lbr_fmt != host_lbr_fmt) {
            error_setg(errp, "vPMU: the lbr-fmt value (0x%x) does not match "
                        "the host value (0x%x).",
                        requested_lbr_fmt, host_lbr_fmt);
            return;
        }
    }

    x86_cpu_filter_features(cpu, cpu->check_cpuid || cpu->enforce_cpuid);

    if (cpu->enforce_cpuid && x86_cpu_have_filtered_features(cpu)) {
        error_setg(&local_err,
                   accel_uses_host_cpuid() ?
                       "Host doesn't support requested features" :
                       "TCG doesn't support requested features");
        goto out;
    }

    /* On AMD CPUs, some CPUID[8000_0001].EDX bits must match the bits on
     * CPUID[1].EDX.
     */
    if (IS_AMD_CPU(env)) {
        env->features[FEAT_8000_0001_EDX] &= ~CPUID_EXT2_AMD_ALIASES;
        env->features[FEAT_8000_0001_EDX] |= (env->features[FEAT_1_EDX]
           & CPUID_EXT2_AMD_ALIASES);
    }

    x86_cpu_set_sgxlepubkeyhash(env);

    /*
     * note: the call to the framework needs to happen after feature expansion,
     * but before the checks/modifications to ucode_rev, mwait, phys_bits.
     * These may be set by the accel-specific code,
     * and the results are subsequently checked / assumed in this function.
     */
    cpu_exec_realizefn(cs, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
        return;
    }

    if (xcc->host_cpuid_required && !accel_uses_host_cpuid()) {
        g_autofree char *name = x86_cpu_class_get_model_name(xcc);
        error_setg(&local_err, "CPU model '%s' requires KVM or HVF", name);
        goto out;
    }

    if (cpu->ucode_rev == 0) {
        /*
         * The default is the same as KVM's. Note that this check
         * needs to happen after the eventual setting of ucode_rev in
         * accel-specific code in cpu_exec_realizefn.
         */
        if (IS_AMD_CPU(env)) {
            cpu->ucode_rev = 0x01000065;
        } else {
            cpu->ucode_rev = 0x100000000ULL;
        }
    }

    /*
     * mwait extended info: needed for Core compatibility
     * We always wake on interrupt even if host does not have the capability.
     *
     * requires the accel-specific code in cpu_exec_realizefn to
     * have already acquired the CPUID data into cpu->mwait.
     */
    cpu->mwait.ecx |= CPUID_MWAIT_EMX | CPUID_MWAIT_IBE;

    /* For 64bit systems think about the number of physical bits to present.
     * ideally this should be the same as the host; anything other than matching
     * the host can cause incorrect guest behaviour.
     * QEMU used to pick the magic value of 40 bits that corresponds to
     * consumer AMD devices but nothing else.
     *
     * Note that this code assumes features expansion has already been done
     * (as it checks for CPUID_EXT2_LM), and also assumes that potential
     * phys_bits adjustments to match the host have been already done in
     * accel-specific code in cpu_exec_realizefn.
     */
    if (env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_LM) {
        if (cpu->phys_bits &&
            (cpu->phys_bits > TARGET_PHYS_ADDR_SPACE_BITS ||
            cpu->phys_bits < 32)) {
            error_setg(errp, "phys-bits should be between 32 and %u "
                             " (but is %u)",
                             TARGET_PHYS_ADDR_SPACE_BITS, cpu->phys_bits);
            return;
        }
        /*
         * 0 means it was not explicitly set by the user (or by machine
         * compat_props or by the host code in host-cpu.c).
         * In this case, the default is the value used by TCG (40).
         */
        if (cpu->phys_bits == 0) {
            cpu->phys_bits = TCG_PHYS_ADDR_BITS;
        }
    } else {
        /* For 32 bit systems don't use the user set value, but keep
         * phys_bits consistent with what we tell the guest.
         */
        if (cpu->phys_bits != 0) {
            error_setg(errp, "phys-bits is not user-configurable in 32 bit");
            return;
        }

        if (env->features[FEAT_1_EDX] & (CPUID_PSE36 | CPUID_PAE)) {
            cpu->phys_bits = 36;
        } else {
            cpu->phys_bits = 32;
        }
    }

    /* Cache information initialization */
    if (!cpu->legacy_cache) {
        const CPUCaches *cache_info =
            x86_cpu_get_versioned_cache_info(cpu, xcc->model);

        if (!xcc->model || !cache_info) {
            g_autofree char *name = x86_cpu_class_get_model_name(xcc);
            error_setg(errp,
                       "CPU model '%s' doesn't support legacy-cache=off", name);
            return;
        }
        env->cache_info_cpuid2 = env->cache_info_cpuid4 = env->cache_info_amd =
            *cache_info;
    } else {
        /* Build legacy cache information */
        env->cache_info_cpuid2.l1d_cache = &legacy_l1d_cache;
        env->cache_info_cpuid2.l1i_cache = &legacy_l1i_cache;
        env->cache_info_cpuid2.l2_cache = &legacy_l2_cache_cpuid2;
        env->cache_info_cpuid2.l3_cache = &legacy_l3_cache;

        env->cache_info_cpuid4.l1d_cache = &legacy_l1d_cache;
        env->cache_info_cpuid4.l1i_cache = &legacy_l1i_cache;
        env->cache_info_cpuid4.l2_cache = &legacy_l2_cache;
        env->cache_info_cpuid4.l3_cache = &legacy_l3_cache;

        env->cache_info_amd.l1d_cache = &legacy_l1d_cache_amd;
        env->cache_info_amd.l1i_cache = &legacy_l1i_cache_amd;
        env->cache_info_amd.l2_cache = &legacy_l2_cache_amd;
        env->cache_info_amd.l3_cache = &legacy_l3_cache;
    }

#ifndef CONFIG_USER_ONLY
    MachineState *ms = MACHINE(qdev_get_machine());
    qemu_register_reset(x86_cpu_machine_reset_cb, cpu);

    if (cpu->env.features[FEAT_1_EDX] & CPUID_APIC || ms->smp.cpus > 1) {
        x86_cpu_apic_create(cpu, &local_err);
        if (local_err != NULL) {
            goto out;
        }
    }
#endif

    mce_init(cpu);

    qemu_init_vcpu(cs);

    /*
     * Most Intel and certain AMD CPUs support hyperthreading. Even though QEMU
     * fixes this issue by adjusting CPUID_0000_0001_EBX and CPUID_8000_0008_ECX
     * based on inputs (sockets,cores,threads), it is still better to give
     * users a warning.
     *
     * NOTE: the following code has to follow qemu_init_vcpu(). Otherwise
     * cs->nr_threads hasn't be populated yet and the checking is incorrect.
     */
    if (IS_AMD_CPU(env) &&
        !(env->features[FEAT_8000_0001_ECX] & CPUID_EXT3_TOPOEXT) &&
        cs->nr_threads > 1 && !ht_warned) {
        warn_report("This family of AMD CPU doesn't support "
                    "hyperthreading(%d)",
                    cs->nr_threads);
        error_printf("Please configure -smp options properly"
                     " or try enabling topoext feature.\n");
        ht_warned = true;
    }

#ifndef CONFIG_USER_ONLY
    x86_cpu_apic_realize(cpu, &local_err);
    if (local_err != NULL) {
        goto out;
    }
#endif /* !CONFIG_USER_ONLY */

    cpu_reset(cs);

    xcc->parent_realize(dev, &local_err);

out:
    if (local_err != NULL) {
        error_propagate(errp, local_err);
        return;
    }
}
The function x86_cpu_hyperv_realize() is defined as follows:
static void x86_cpu_hyperv_realize(X86CPU *cpu)
{
    size_t len;

    /* Hyper-V vendor id */
    if (!cpu->hyperv_vendor) {
        object_property_set_str(OBJECT(cpu), "hv-vendor-id", "Microsoft Hv",
                                &error_abort);
    }
    len = strlen(cpu->hyperv_vendor);
    if (len > 12) {
        warn_report("hv-vendor-id truncated to 12 characters");
        len = 12;
    }
    memset(cpu->hyperv_vendor_id, 0, 12);
    memcpy(cpu->hyperv_vendor_id, cpu->hyperv_vendor, len);

    /* 'Hv#1' interface identification */
    cpu->hyperv_interface_id[0] = 0x31237648;
    cpu->hyperv_interface_id[1] = 0;
    cpu->hyperv_interface_id[2] = 0;
    cpu->hyperv_interface_id[3] = 0;

    /* Hypervisor implementation limits */
    cpu->hyperv_limits[0] = 64;
    cpu->hyperv_limits[1] = 0;
    cpu->hyperv_limits[2] = 0;
}
The function x86_cpu_expand_features() is defined as follows:
/* Expand CPU configuration data, based on configured features
 * and host/accelerator capabilities when appropriate.
 */
void x86_cpu_expand_features(X86CPU *cpu, Error **errp)
{
    CPUX86State *env = &cpu->env;
    FeatureWord w;
    int i;
    GList *l;

    for (l = plus_features; l; l = l->next) {
        const char *prop = l->data;
        if (!object_property_set_bool(OBJECT(cpu), prop, true, errp)) {
            return;
        }
    }

    for (l = minus_features; l; l = l->next) {
        const char *prop = l->data;
        if (!object_property_set_bool(OBJECT(cpu), prop, false, errp)) {
            return;
        }
    }

    /* TODO: Now cpu->max_features doesn't overwrite features
     * set using QOM properties, and we can convert
     * plus_features & minus_features to global properties
     * inside x86_cpu_parse_featurestr() too.
     */
    if (cpu->max_features) {
        for (w = 0; w < FEATURE_WORDS; w++) {
            /* Override only features that weren't set explicitly
             * by the user.
             */
            env->features[w] |=
                x86_cpu_get_supported_feature_word(w, cpu->migratable) &
                ~env->user_features[w] &
                ~feature_word_info[w].no_autoenable_flags;
        }
    }

    for (i = 0; i < ARRAY_SIZE(feature_dependencies); i++) {
        FeatureDep *d = &feature_dependencies[i];
        if (!(env->features[d->from.index] & d->from.mask)) {
            uint64_t unavailable_features = env->features[d->to.index] & d->to.mask;

            /* Not an error unless the dependent feature was added explicitly. */
            mark_unavailable_features(cpu, d->to.index,
                                      unavailable_features & env->user_features[d->to.index],
                                      "This feature depends on other features that were not requested");

            env->features[d->to.index] &= ~unavailable_features;
        }
    }

    if (!kvm_enabled() || !cpu->expose_kvm) {
        env->features[FEAT_KVM] = 0;
    }

    x86_cpu_enable_xsave_components(cpu);

    /* CPUID[EAX=7,ECX=0].EBX always increased level automatically: */
    x86_cpu_adjust_feat_level(cpu, FEAT_7_0_EBX);
    if (cpu->full_cpuid_auto_level) {
        x86_cpu_adjust_feat_level(cpu, FEAT_1_EDX);
        x86_cpu_adjust_feat_level(cpu, FEAT_1_ECX);
        x86_cpu_adjust_feat_level(cpu, FEAT_6_EAX);
        x86_cpu_adjust_feat_level(cpu, FEAT_7_0_ECX);
        x86_cpu_adjust_feat_level(cpu, FEAT_7_1_EAX);
        x86_cpu_adjust_feat_level(cpu, FEAT_7_1_EDX);
        x86_cpu_adjust_feat_level(cpu, FEAT_7_2_EDX);
        x86_cpu_adjust_feat_level(cpu, FEAT_8000_0001_EDX);
        x86_cpu_adjust_feat_level(cpu, FEAT_8000_0001_ECX);
        x86_cpu_adjust_feat_level(cpu, FEAT_8000_0007_EDX);
        x86_cpu_adjust_feat_level(cpu, FEAT_8000_0008_EBX);
        x86_cpu_adjust_feat_level(cpu, FEAT_C000_0001_EDX);
        x86_cpu_adjust_feat_level(cpu, FEAT_SVM);
        x86_cpu_adjust_feat_level(cpu, FEAT_XSAVE);

        /* Intel Processor Trace requires CPUID[0x14] */
        if ((env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT)) {
            if (cpu->intel_pt_auto_level) {
                x86_cpu_adjust_level(cpu, &cpu->env.cpuid_min_level, 0x14);
            } else if (cpu->env.cpuid_min_level < 0x14) {
                mark_unavailable_features(cpu, FEAT_7_0_EBX,
                    CPUID_7_0_EBX_INTEL_PT,
                    "Intel PT need CPUID leaf 0x14, please set by \"-cpu ...,intel-pt=on,min-level=0x14\"");
            }
        }

        /*
         * Intel CPU topology with multi-dies support requires CPUID[0x1F].
         * For AMD Rome/Milan, cpuid level is 0x10, and guest OS should detect
         * extended toplogy by leaf 0xB. Only adjust it for Intel CPU, unless
         * cpu->vendor_cpuid_only has been unset for compatibility with older
         * machine types.
         */
        if ((env->nr_dies > 1) &&
            (IS_INTEL_CPU(env) || !cpu->vendor_cpuid_only)) {
            x86_cpu_adjust_level(cpu, &env->cpuid_min_level, 0x1F);
        }

        /* SVM requires CPUID[0x8000000A] */
        if (env->features[FEAT_8000_0001_ECX] & CPUID_EXT3_SVM) {
            x86_cpu_adjust_level(cpu, &env->cpuid_min_xlevel, 0x8000000A);
        }

        /* SEV requires CPUID[0x8000001F] */
        if (sev_enabled()) {
            x86_cpu_adjust_level(cpu, &env->cpuid_min_xlevel, 0x8000001F);
        }

        if (env->features[FEAT_8000_0021_EAX]) {
            x86_cpu_adjust_level(cpu, &env->cpuid_min_xlevel, 0x80000021);
        }

        /* SGX requires CPUID[0x12] for EPC enumeration */
        if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_SGX) {
            x86_cpu_adjust_level(cpu, &env->cpuid_min_level, 0x12);
        }
    }

    /* Set cpuid_*level* based on cpuid_min_*level, if not explicitly set */
    if (env->cpuid_level_func7 == UINT32_MAX) {
        env->cpuid_level_func7 = env->cpuid_min_level_func7;
    }
    if (env->cpuid_level == UINT32_MAX) {
        env->cpuid_level = env->cpuid_min_level;
    }
    if (env->cpuid_xlevel == UINT32_MAX) {
        env->cpuid_xlevel = env->cpuid_min_xlevel;
    }
    if (env->cpuid_xlevel2 == UINT32_MAX) {
        env->cpuid_xlevel2 = env->cpuid_min_xlevel2;
    }

    if (kvm_enabled() && !kvm_hyperv_expand_features(cpu, errp)) {
        return;
    }
}
The function x86_cpu_get_supported_feature_word() is defined as follows:
uint64_t x86_cpu_get_supported_feature_word(FeatureWord w,
                                            bool migratable_only)
{
    FeatureWordInfo *wi = &feature_word_info[w];
    uint64_t r = 0;

    if (kvm_enabled()) {
        switch (wi->type) {
        case CPUID_FEATURE_WORD:
            r = kvm_arch_get_supported_cpuid(kvm_state, wi->cpuid.eax,
                                                        wi->cpuid.ecx,
                                                        wi->cpuid.reg);
            break;
        case MSR_FEATURE_WORD:
            r = kvm_arch_get_supported_msr_feature(kvm_state,
                        wi->msr.index);
            break;
        }
    } else if (hvf_enabled()) {
        if (wi->type != CPUID_FEATURE_WORD) {
            return 0;
        }
        r = hvf_get_supported_cpuid(wi->cpuid.eax,
                                    wi->cpuid.ecx,
                                    wi->cpuid.reg);
    } else if (tcg_enabled()) {
        r = wi->tcg_features;
    } else {
        return ~0;
    }
#ifndef TARGET_X86_64
    if (w == FEAT_8000_0001_EDX) {
        /*
         * 32-bit TCG can emulate 64-bit compatibility mode. If there is no
         * way for userspace to get out of its 32-bit jail, we can leave
         * the LM bit set.
         */
        uint32_t unavail = tcg_enabled()
            ? CPUID_EXT2_LM & ~CPUID_EXT2_KERNEL_FEATURES
            : CPUID_EXT2_LM;
        r &= ~unavail;
    }
#endif
    if (migratable_only) {
        r &= x86_cpu_get_migratable_flags(w);
    }
    return r;
}
The function x86_cpu_filter_features() is defined as follows:
/*
 * Finishes initialization of CPUID data, filters CPU feature
 * words based on host availability of each feature.
 *
 * Returns: 0 if all flags are supported by the host, non-zero otherwise.
 */
static void x86_cpu_filter_features(X86CPU *cpu, bool verbose)
{
    CPUX86State *env = &cpu->env;
    FeatureWord w;
    const char *prefix = NULL;

    if (verbose) {
        prefix = accel_uses_host_cpuid()
                 ? "host doesn't support requested feature"
                 : "TCG doesn't support requested feature";
    }

    for (w = 0; w < FEATURE_WORDS; w++) {
        uint64_t host_feat =
            x86_cpu_get_supported_feature_word(w, false);
        uint64_t requested_features = env->features[w];
        uint64_t unavailable_features = requested_features & ~host_feat;
        mark_unavailable_features(cpu, w, unavailable_features, prefix);
    }

    if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) {
        uint32_t eax_0, ebx_0, ecx_0, edx_0_unused;
        uint32_t eax_1, ebx_1, ecx_1_unused, edx_1_unused;

        x86_cpu_get_supported_cpuid(0x14, 0,
                                    &eax_0, &ebx_0, &ecx_0, &edx_0_unused);
        x86_cpu_get_supported_cpuid(0x14, 1,
                                    &eax_1, &ebx_1, &ecx_1_unused, &edx_1_unused);

        if (!eax_0 ||
           ((ebx_0 & INTEL_PT_MINIMAL_EBX) != INTEL_PT_MINIMAL_EBX) ||
           ((ecx_0 & INTEL_PT_MINIMAL_ECX) != INTEL_PT_MINIMAL_ECX) ||
           ((eax_1 & INTEL_PT_MTC_BITMAP) != INTEL_PT_MTC_BITMAP) ||
           ((eax_1 & INTEL_PT_ADDR_RANGES_NUM_MASK) <
                                           INTEL_PT_ADDR_RANGES_NUM) ||
           ((ebx_1 & (INTEL_PT_PSB_BITMAP | INTEL_PT_CYCLE_BITMAP)) !=
                (INTEL_PT_PSB_BITMAP | INTEL_PT_CYCLE_BITMAP)) ||
           ((ecx_0 & CPUID_14_0_ECX_LIP) !=
                (env->features[FEAT_14_0_ECX] & CPUID_14_0_ECX_LIP))) {
            /*
             * Processor Trace capabilities aren't configurable, so if the
             * host can't emulate the capabilities we report on
             * cpu_x86_cpuid(), intel-pt can't be enabled on the current host.
             */
            mark_unavailable_features(cpu, FEAT_7_0_EBX, CPUID_7_0_EBX_INTEL_PT, prefix);
        }
    }
}
The function mark_unavailable_features() is defined as follows:
static void mark_unavailable_features(X86CPU *cpu, FeatureWord w, uint64_t mask,
                                      const char *verbose_prefix)
{
    CPUX86State *env = &cpu->env;
    FeatureWordInfo *f = &feature_word_info[w];
    int i;

    if (!cpu->force_features) {
        env->features[w] &= ~mask;
    }
    cpu->filtered_features[w] |= mask;

    if (!verbose_prefix) {
        return;
    }

    for (i = 0; i < 64; ++i) {
        if ((1ULL << i) & mask) {
            g_autofree char *feat_word_str = feature_word_description(f, i);
            warn_report("%s: %s%s%s [bit %d]",
                        verbose_prefix,
                        feat_word_str,
                        f->feat_names[i] ? "." : "",
                        f->feat_names[i] ? f->feat_names[i] : "", i);
        }
    }
}
The function x86_cpu_get_supported_cpuid() is defined as follows:
static void x86_cpu_get_supported_cpuid(uint32_t func, uint32_t index,
                                        uint32_t *eax, uint32_t *ebx,
                                        uint32_t *ecx, uint32_t *edx)
{
    if (kvm_enabled()) {
        *eax = kvm_arch_get_supported_cpuid(kvm_state, func, index, R_EAX);
        *ebx = kvm_arch_get_supported_cpuid(kvm_state, func, index, R_EBX);
        *ecx = kvm_arch_get_supported_cpuid(kvm_state, func, index, R_ECX);
        *edx = kvm_arch_get_supported_cpuid(kvm_state, func, index, R_EDX);
    } else if (hvf_enabled()) {
        *eax = hvf_get_supported_cpuid(func, index, R_EAX);
        *ebx = hvf_get_supported_cpuid(func, index, R_EBX);
        *ecx = hvf_get_supported_cpuid(func, index, R_ECX);
        *edx = hvf_get_supported_cpuid(func, index, R_EDX);
    } else {
        *eax = 0;
        *ebx = 0;
        *ecx = 0;
        *edx = 0;
    }
}
The function cpu_exec_realizefn() is defined as follows:
bool cpu_exec_realizefn(CPUState *cpu, Error **errp)
{
    /* cache the cpu class for the hotpath */
    cpu->cc = CPU_GET_CLASS(cpu);

    if (!accel_cpu_common_realize(cpu, errp)) {
        return false;
    }

    /* Wait until cpu initialization complete before exposing cpu. */
    cpu_list_add(cpu);

#ifdef CONFIG_USER_ONLY
    assert(qdev_get_vmsd(DEVICE(cpu)) == NULL ||
           qdev_get_vmsd(DEVICE(cpu))->unmigratable);
#else
    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
        vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
    }
    if (cpu->cc->sysemu_ops->legacy_vmsd != NULL) {
        vmstate_register(NULL, cpu->cpu_index, cpu->cc->sysemu_ops->legacy_vmsd, cpu);
    }
#endif /* CONFIG_USER_ONLY */

    return true;
}
The function accel_cpu_common_realize() is defined as follows:
bool accel_cpu_common_realize(CPUState *cpu, Error **errp)
{
    CPUClass *cc = CPU_GET_CLASS(cpu);
    AccelState *accel = current_accel();
    AccelClass *acc = ACCEL_GET_CLASS(accel);

    /* target specific realization */
    if (cc->accel_cpu && cc->accel_cpu->cpu_target_realize
        && !cc->accel_cpu->cpu_target_realize(cpu, errp)) {
        return false;
    }

    /* generic realization */
    if (acc->cpu_common_realize && !acc->cpu_common_realize(cpu, errp)) {
        return false;
    }

    return true;
}
The accelerator-specific realize step differs with the CPU accelerator in use. For the i386 CPU under the TCG accelerator it is tcg_cpu_realizefn(), defined as follows:
bool tcg_cpu_realizefn(CPUState *cs, Error **errp)
{
    X86CPU *cpu = X86_CPU(cs);

    /*
     * The realize order is important, since x86_cpu_realize() checks if
     * nothing else has been set by the user (or by accelerators) in
     * cpu->ucode_rev and cpu->phys_bits, and the memory regions
     * initialized here are needed for the vcpu initialization.
     *
     * realize order:
     * tcg_cpu -> host_cpu -> x86_cpu
     */
    cpu->cpu_as_mem = g_new(MemoryRegion, 1);
    cpu->cpu_as_root = g_new(MemoryRegion, 1);

    /* Outer container... */
    memory_region_init(cpu->cpu_as_root, OBJECT(cpu), "memory", ~0ull);
    memory_region_set_enabled(cpu->cpu_as_root, true);

    /*
     * ... with two regions inside: normal system memory with low
     * priority, and...
     */
    memory_region_init_alias(cpu->cpu_as_mem, OBJECT(cpu), "memory",
                             get_system_memory(), 0, ~0ull);
    memory_region_add_subregion_overlap(cpu->cpu_as_root, 0, cpu->cpu_as_mem, 0);
    memory_region_set_enabled(cpu->cpu_as_mem, true);

    cs->num_ases = 2;
    cpu_address_space_init(cs, 0, "cpu-memory", cs->memory);
    cpu_address_space_init(cs, 1, "cpu-smm", cpu->cpu_as_root);

    /* ... SMRAM with higher priority, linked from /machine/smram. */
    cpu->machine_done.notify = tcg_cpu_machine_done;
    qemu_add_machine_init_done_notifier(&cpu->machine_done);

    return true;
}
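The point of this layout is that in the "cpu-smm" address space SMRAM (added later by tcg_cpu_machine_done() at priority 1) shadows normal system memory (priority 0). The following is a miniature model of how memory_region_add_subregion_overlap() priorities resolve; the names and priorities mirror the calls above, but the resolver itself is a toy, not QEMU code:

#include <stdio.h>

typedef struct { const char *name; int priority; } Subregion;

/* the highest-priority overlapping subregion wins */
static const char *resolve(const Subregion *subs, int n)
{
    const Subregion *best = &subs[0];
    for (int i = 1; i < n; i++) {
        if (subs[i].priority > best->priority) {
            best = &subs[i];
        }
    }
    return best->name;
}

int main(void)
{
    Subregion cpu_as_root[] = {
        { "cpu_as_mem (alias of system memory)", 0 },
        { "smram (alias of /machine/smram)", 1 },
    };
    printf("cpu-smm space resolves to: %s\n", resolve(cpu_as_root, 2));
    return 0;
}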
The function memory_region_init() is defined as follows:
static void memory_region_do_init(MemoryRegion *mr,
                                  Object *owner,
                                  const char *name,
                                  uint64_t size)
{
    mr->size = int128_make64(size);
    if (size == UINT64_MAX) {
        mr->size = int128_2_64();
    }
    mr->name = g_strdup(name);
    mr->owner = owner;
    mr->dev = (DeviceState *) object_dynamic_cast(mr->owner, TYPE_DEVICE);
    mr->ram_block = NULL;

    if (name) {
        char *escaped_name = memory_region_escape_name(name);
        char *name_array = g_strdup_printf("%s[*]", escaped_name);

        if (!owner) {
            owner = container_get(qdev_get_machine(), "/unattached");
        }

        object_property_add_child(owner, name_array, OBJECT(mr));
        object_unref(OBJECT(mr));
        g_free(name_array);
        g_free(escaped_name);
    }
}

void memory_region_init(MemoryRegion *mr,
                        Object *owner,
                        const char *name,
                        uint64_t size)
{
    object_initialize(mr, sizeof(*mr), TYPE_MEMORY_REGION);
    memory_region_do_init(mr, owner, name, size);
}
The function memory_region_init_alias() is defined as follows:
void memory_region_init_alias(MemoryRegion *mr,
                              Object *owner,
                              const char *name,
                              MemoryRegion *orig,
                              hwaddr offset,
                              uint64_t size)
{
    memory_region_init(mr, owner, name, size);
    mr->alias = orig;
    mr->alias_offset = offset;
}
The function cpu_address_space_init() is defined as follows:
void cpu_address_space_init(CPUState *cpu, int asidx,
                            const char *prefix, MemoryRegion *mr)
{
    CPUAddressSpace *newas;
    AddressSpace *as = g_new0(AddressSpace, 1);
    char *as_name;

    assert(mr);
    as_name = g_strdup_printf("%s-%d", prefix, cpu->cpu_index);
    address_space_init(as, mr, as_name);
    g_free(as_name);

    /* Target code should have set num_ases before calling us */
    assert(asidx < cpu->num_ases);

    if (asidx == 0) {
        /* address space 0 gets the convenience alias */
        cpu->as = as;
    }

    /* KVM cannot currently support multiple address spaces. */
    assert(asidx == 0 || !kvm_enabled());

    if (!cpu->cpu_ases) {
        cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases);
    }

    newas = &cpu->cpu_ases[asidx];
    newas->cpu = cpu;
    newas->as = as;
    if (tcg_enabled()) {
        newas->tcg_as_listener.log_global_after_sync = tcg_log_global_after_sync;
        newas->tcg_as_listener.commit = tcg_commit;
        newas->tcg_as_listener.name = "tcg";
        memory_listener_register(&newas->tcg_as_listener, as);
    }
}
The function address_space_init() is defined as follows:
void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name)
{
    memory_region_ref(root);
    as->root = root;
    as->current_map = NULL;
    as->ioeventfd_nb = 0;
    as->ioeventfds = NULL;
    QTAILQ_INIT(&as->listeners);
    QTAILQ_INSERT_TAIL(&address_spaces, as, address_spaces_link);
    as->name = g_strdup(name ? name : "anonymous");
    address_space_update_topology(as);
    address_space_update_ioeventfds(as);
}
The function address_space_update_topology() is defined as follows:
static void address_space_update_topology(AddressSpace *as)
{
    MemoryRegion *physmr = memory_region_get_flatview_root(as->root);

    flatviews_init();
    if (!g_hash_table_lookup(flat_views, physmr)) {
        generate_memory_topology(physmr);
    }
    address_space_set_flatview(as);
}
The function memory_region_get_flatview_root() is defined as follows:
static MemoryRegion *memory_region_get_flatview_root(MemoryRegion *mr)
{
    while (mr->enabled) {
        if (mr->alias) {
            if (!mr->alias_offset && int128_ge(mr->size, mr->alias->size)) {
                /* The alias is included in its entirety.  Use it as
                 * the "real" root, so that we can share more FlatViews.
                 */
                mr = mr->alias;
                continue;
            }
        } else if (!mr->terminates) {
            unsigned int found = 0;
            MemoryRegion *child, *next = NULL;
            QTAILQ_FOREACH(child, &mr->subregions, subregions_link) {
                if (child->enabled) {
                    if (++found > 1) {
                        next = NULL;
                        break;
                    }
                    if (!child->addr && int128_ge(mr->size, child->size)) {
                        /* A child is included in its entirety.  If it's the only
                         * enabled one, use it in the hope of finding an alias down the
                         * way. This will also let us share FlatViews.
                         */
                        next = child;
                    }
                }
            }
            if (found == 0) {
                return NULL;
            }
            if (next) {
                mr = next;
                continue;
            }
        }

        return mr;
    }

    return NULL;
}
The function flatviews_init() is defined as follows:
static void flatviews_init(void)
{
    static FlatView *empty_view;

    if (flat_views) {
        return;
    }

    flat_views = g_hash_table_new_full(g_direct_hash, g_direct_equal, NULL,
                                       (GDestroyNotify) flatview_unref);
    if (!empty_view) {
        empty_view = generate_memory_topology(NULL);
        /* We keep it alive forever in the global variable. */
        flatview_ref(empty_view);
    } else {
        g_hash_table_replace(flat_views, NULL, empty_view);
        flatview_ref(empty_view);
    }
}
The function generate_memory_topology() is defined as follows:
/* Render a memory topology into a list of disjoint absolute ranges. */
static FlatView *generate_memory_topology(MemoryRegion *mr)
{
    int i;
    FlatView *view;

    view = flatview_new(mr);

    if (mr) {
        render_memory_region(view, mr, int128_zero(),
                             addrrange_make(int128_zero(), int128_2_64()),
                             false, false, false);
    }
    flatview_simplify(view);

    view->dispatch = address_space_dispatch_new(view);
    for (i = 0; i < view->nr; i++) {
        MemoryRegionSection mrs =
            section_from_flat_range(&view->ranges[i], view);
        flatview_add_to_dispatch(view, &mrs);
    }
    address_space_dispatch_compact(view->dispatch);
    g_hash_table_replace(flat_views, mr, view);

    return view;
}
The function flatview_new() is defined as follows:
static FlatView *flatview_new(MemoryRegion *mr_root)
{
    FlatView *view;

    view = g_new0(FlatView, 1);
    view->ref = 1;
    view->root = mr_root;
    memory_region_ref(mr_root);
    trace_flatview_new(view, mr_root);

    return view;
}
The function render_memory_region() is defined as follows:
/* Render a memory region into the global view.  Ranges in @view obscure
 * ranges in @mr.
 */
static void render_memory_region(FlatView *view,
                                 MemoryRegion *mr,
                                 Int128 base,
                                 AddrRange clip,
                                 bool readonly,
                                 bool nonvolatile,
                                 bool unmergeable)
{
    MemoryRegion *subregion;
    unsigned i;
    hwaddr offset_in_region;
    Int128 remain;
    Int128 now;
    FlatRange fr;
    AddrRange tmp;

    if (!mr->enabled) {
        return;
    }

    int128_addto(&base, int128_make64(mr->addr));
    readonly |= mr->readonly;
    nonvolatile |= mr->nonvolatile;
    unmergeable |= mr->unmergeable;

    tmp = addrrange_make(base, mr->size);

    if (!addrrange_intersects(tmp, clip)) {
        return;
    }

    clip = addrrange_intersection(tmp, clip);

    if (mr->alias) {
        int128_subfrom(&base, int128_make64(mr->alias->addr));
        int128_subfrom(&base, int128_make64(mr->alias_offset));
        render_memory_region(view, mr->alias, base, clip,
                             readonly, nonvolatile, unmergeable);
        return;
    }

    /* Render subregions in priority order. */
    QTAILQ_FOREACH(subregion, &mr->subregions, subregions_link) {
        render_memory_region(view, subregion, base, clip,
                             readonly, nonvolatile, unmergeable);
    }

    if (!mr->terminates) {
        return;
    }

    offset_in_region = int128_get64(int128_sub(clip.start, base));
    base = clip.start;
    remain = clip.size;

    fr.mr = mr;
    fr.dirty_log_mask = memory_region_get_dirty_log_mask(mr);
    fr.romd_mode = mr->romd_mode;
    fr.readonly = readonly;
    fr.nonvolatile = nonvolatile;
    fr.unmergeable = unmergeable;

    /* Render the region itself into any gaps left by the current view. */
    for (i = 0; i < view->nr && int128_nz(remain); ++i) {
        if (int128_ge(base, addrrange_end(view->ranges[i].addr))) {
            continue;
        }
        if (int128_lt(base, view->ranges[i].addr.start)) {
            now = int128_min(remain,
                             int128_sub(view->ranges[i].addr.start, base));
            fr.offset_in_region = offset_in_region;
            fr.addr = addrrange_make(base, now);
            flatview_insert(view, i, &fr);
            ++i;
            int128_addto(&base, now);
            offset_in_region += int128_get64(now);
            int128_subfrom(&remain, now);
        }
        now = int128_sub(int128_min(int128_add(base, remain),
                                    addrrange_end(view->ranges[i].addr)),
                         base);
        int128_addto(&base, now);
        offset_in_region += int128_get64(now);
        int128_subfrom(&remain, now);
    }

    if (int128_nz(remain)) {
        fr.offset_in_region = offset_in_region;
        fr.addr = addrrange_make(base, remain);
        flatview_insert(view, i, &fr);
    }
}
The function addrrange_intersects() is defined as follows:
static bool addrrange_contains(AddrRange range, Int128 addr)
{
    return int128_ge(addr, range.start)
        && int128_lt(addr, addrrange_end(range));
}

static bool addrrange_intersects(AddrRange r1, AddrRange r2)
{
    return addrrange_contains(r1, r2.start)
        || addrrange_contains(r2, r1.start);
}
The function addrrange_intersection() is defined as follows:
static AddrRange addrrange_intersection(AddrRange r1, AddrRange r2)
{
    Int128 start = int128_max(r1.start, r2.start);
    Int128 end = int128_min(addrrange_end(r1), addrrange_end(r2));
    return addrrange_make(start, int128_sub(end, start));
}
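A quick standalone check of these two range helpers; plain uint64_t stands in for QEMU's Int128, but the arithmetic is the same:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

typedef struct { uint64_t start, size; } Range;   /* Int128 stand-in */

static uint64_t range_end(Range r) { return r.start + r.size; }

static int intersects(Range a, Range b)
{
    return (b.start >= a.start && b.start < range_end(a)) ||
           (a.start >= b.start && a.start < range_end(b));
}

static Range intersection(Range a, Range b)
{
    uint64_t start = a.start > b.start ? a.start : b.start;
    uint64_t end   = range_end(a) < range_end(b) ? range_end(a) : range_end(b);
    return (Range){ start, end - start };
}

int main(void)
{
    Range a = { 0x1000, 0x2000 };            /* [0x1000, 0x3000) */
    Range b = { 0x2000, 0x2000 };            /* [0x2000, 0x4000) */

    if (intersects(a, b)) {
        Range c = intersection(a, b);
        printf("[0x%" PRIx64 ", 0x%" PRIx64 ")\n",
               c.start, range_end(c));       /* [0x2000, 0x3000) */
    }
    return 0;
}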
The function flatview_insert() is defined as follows:
/* Insert a range into a given position.  Caller is responsible for maintaining
 * sorting order.
 */
static void flatview_insert(FlatView *view, unsigned pos, FlatRange *range)
{
    if (view->nr == view->nr_allocated) {
        view->nr_allocated = MAX(2 * view->nr, 10);
        view->ranges = g_realloc(view->ranges,
                                 view->nr_allocated * sizeof(*view->ranges));
    }
    memmove(view->ranges + pos + 1, view->ranges + pos,
            (view->nr - pos) * sizeof(FlatRange));
    view->ranges[pos] = *range;
    memory_region_ref(range->mr);
    ++view->nr;
}
The function flatview_simplify() is defined as follows:
/* Attempt to simplify a view by merging adjacent ranges */
static void flatview_simplify(FlatView *view)
{
    unsigned i, j, k;

    i = 0;
    while (i < view->nr) {
        j = i + 1;
        while (j < view->nr
               && can_merge(&view->ranges[j-1], &view->ranges[j])) {
            int128_addto(&view->ranges[i].addr.size, view->ranges[j].addr.size);
            ++j;
        }
        ++i;
        for (k = i; k < j; k++) {
            memory_region_unref(view->ranges[k].mr);
        }
        memmove(&view->ranges[i], &view->ranges[j],
                (view->nr - j) * sizeof(view->ranges[j]));
        view->nr -= j - i;
    }
}
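The merge loop is easier to see on a toy input: two contiguous ranges collapse into one, while a range after a gap survives. Everything below is local to the sketch (the real can_merge() also compares range attributes such as the MemoryRegion and dirty-log mask):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

typedef struct { uint64_t start, size; } Range;

static int can_merge(const Range *a, const Range *b)
{
    return a->start + a->size == b->start;   /* contiguity only */
}

int main(void)
{
    Range r[] = { { 0x0000, 0x1000 }, { 0x1000, 0x2000 }, { 0x4000, 0x1000 } };
    unsigned nr = 3, i = 0, j, k;

    while (i < nr) {
        j = i + 1;
        while (j < nr && can_merge(&r[j - 1], &r[j])) {
            r[i].size += r[j].size;          /* grow the surviving range */
            ++j;
        }
        ++i;
        for (k = j; k < nr; k++) {           /* close the gap (the memmove) */
            r[i + k - j] = r[k];
        }
        nr -= j - i;
    }

    for (i = 0; i < nr; i++) {               /* [0x0, 0x3000), [0x4000, 0x5000) */
        printf("[0x%" PRIx64 ", 0x%" PRIx64 ")\n",
               r[i].start, r[i].start + r[i].size);
    }
    return 0;
}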
The function address_space_dispatch_new() is defined as follows:
AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv)
{
    AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
    uint16_t n;

    n = dummy_section(&d->map, fv, &io_mem_unassigned);
    assert(n == PHYS_SECTION_UNASSIGNED);

    d->phys_map = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };

    return d;
}
The function section_from_flat_range() is defined as follows:
static inline MemoryRegionSection
section_from_flat_range(FlatRange *fr, FlatView *fv)
{
    return (MemoryRegionSection) {
        .mr = fr->mr,
        .fv = fv,
        .offset_within_region = fr->offset_in_region,
        .size = fr->addr.size,
        .offset_within_address_space = int128_get64(fr->addr.start),
        .readonly = fr->readonly,
        .nonvolatile = fr->nonvolatile,
        .unmergeable = fr->unmergeable,
    };
}
The function flatview_add_to_dispatch() is defined as follows:
static void register_subpage(FlatView *fv, MemoryRegionSection *section)
{
    AddressSpaceDispatch *d = flatview_to_dispatch(fv);
    subpage_t *subpage;
    hwaddr base = section->offset_within_address_space
        & TARGET_PAGE_MASK;
    MemoryRegionSection *existing = phys_page_find(d, base);
    MemoryRegionSection subsection = {
        .offset_within_address_space = base,
        .size = int128_make64(TARGET_PAGE_SIZE),
    };
    hwaddr start, end;

    assert(existing->mr->subpage || existing->mr == &io_mem_unassigned);

    if (!(existing->mr->subpage)) {
        subpage = subpage_init(fv, base);
        subsection.fv = fv;
        subsection.mr = &subpage->iomem;
        phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
                      phys_section_add(&d->map, &subsection));
    } else {
        subpage = container_of(existing->mr, subpage_t, iomem);
    }
    start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
    end = start + int128_get64(section->size) - 1;
    subpage_register(subpage, start, end,
                     phys_section_add(&d->map, section));
}

static void register_multipage(FlatView *fv,
                               MemoryRegionSection *section)
{
    AddressSpaceDispatch *d = flatview_to_dispatch(fv);
    hwaddr start_addr = section->offset_within_address_space;
    uint16_t section_index = phys_section_add(&d->map, section);
    uint64_t num_pages = int128_get64(int128_rshift(section->size,
                                                    TARGET_PAGE_BITS));

    assert(num_pages);
    phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
}

/*
 * The range in *section* may look like this:
 *
 *      |s|PPPPPPP|s|
 *
 * where s stands for subpage and P for page.
 */
void flatview_add_to_dispatch(FlatView *fv, MemoryRegionSection *section)
{
    MemoryRegionSection remain = *section;
    Int128 page_size = int128_make64(TARGET_PAGE_SIZE);

    /* register first subpage */
    if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
        uint64_t left = TARGET_PAGE_ALIGN(remain.offset_within_address_space)
                        - remain.offset_within_address_space;
        MemoryRegionSection now = remain;

        now.size = int128_min(int128_make64(left), now.size);
        register_subpage(fv, &now);
        if (int128_eq(remain.size, now.size)) {
            return;
        }
        remain.size = int128_sub(remain.size, now.size);
        remain.offset_within_address_space += int128_get64(now.size);
        remain.offset_within_region += int128_get64(now.size);
    }

    /* register whole pages */
    if (int128_ge(remain.size, page_size)) {
        MemoryRegionSection now = remain;

        now.size = int128_and(now.size, int128_neg(page_size));
        register_multipage(fv, &now);
        if (int128_eq(remain.size, now.size)) {
            return;
        }
        remain.size = int128_sub(remain.size, now.size);
        remain.offset_within_address_space += int128_get64(now.size);
        remain.offset_within_region += int128_get64(now.size);
    }

    /* register last subpage */
    register_subpage(fv, &remain);
}
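A worked example of the |s|PPPPPPP|s| split: a section starting at an unaligned address is cut into a head subpage, a run of whole pages, and a tail subpage. The figures below assume a 4 KiB TARGET_PAGE_SIZE and are local to the sketch:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 0x1000ULL                 /* assumed TARGET_PAGE_SIZE */
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
    uint64_t start = 0x1800, size = 0x3000;  /* section [0x1800, 0x4800) */

    uint64_t head = ((start + PAGE_SIZE - 1) & PAGE_MASK) - start; /* first subpage */
    uint64_t body = (size - head) & PAGE_MASK;                     /* whole pages */
    uint64_t tail = size - head - body;                            /* last subpage */

    printf("head=0x%" PRIx64 " body=0x%" PRIx64 " tail=0x%" PRIx64 "\n",
           head, body, tail);               /* head=0x800 body=0x2000 tail=0x800 */
    return 0;
}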
The function phys_page_compact() is defined as follows:
/* Compact a non leaf page entry. Simply detect that the entry has a single child,
 * and update our entry so we can skip it and go directly to the destination.
 */
static void phys_page_compact(PhysPageEntry *lp, Node *nodes)
{
    unsigned valid_ptr = P_L2_SIZE;
    int valid = 0;
    PhysPageEntry *p;
    int i;

    if (lp->ptr == PHYS_MAP_NODE_NIL) {
        return;
    }

    p = nodes[lp->ptr];

    for (i = 0; i < P_L2_SIZE; i++) {
        if (p[i].ptr == PHYS_MAP_NODE_NIL) {
            continue;
        }

        valid_ptr = i;
        valid++;
        if (p[i].skip) {
            phys_page_compact(&p[i], nodes);
        }
    }

    /* We can only compress if there's only one child. */
    if (valid != 1) {
        return;
    }

    assert(valid_ptr < P_L2_SIZE);

    /* Don't compress if it won't fit in the # of bits we have. */
    if (P_L2_LEVELS >= (1 << 6) &&
        lp->skip + p[valid_ptr].skip >= (1 << 6)) {
        return;
    }

    lp->ptr = p[valid_ptr].ptr;
    if (!p[valid_ptr].skip) {
        /* If our only child is a leaf, make this a leaf. */
        /* By design, we should have made this node a leaf to begin with so we
         * should never reach here.
         * But since it's so simple to handle this, let's do it just in case we
         * change this rule.
         */
        lp->skip = 0;
    } else {
        lp->skip += p[valid_ptr].skip;
    }
}

void address_space_dispatch_compact(AddressSpaceDispatch *d)
{
    if (d->phys_map.skip) {
        phys_page_compact(&d->phys_map, d->map.nodes);
    }
}
The function address_space_set_flatview() is defined as follows:
static void address_space_set_flatview(AddressSpace *as)
{
    FlatView *old_view = address_space_to_flatview(as);
    MemoryRegion *physmr = memory_region_get_flatview_root(as->root);
    FlatView *new_view = g_hash_table_lookup(flat_views, physmr);

    assert(new_view);

    if (old_view == new_view) {
        return;
    }

    if (old_view) {
        flatview_ref(old_view);
    }

    flatview_ref(new_view);

    if (!QTAILQ_EMPTY(&as->listeners)) {
        FlatView tmpview = { .nr = 0 }, *old_view2 = old_view;

        if (!old_view2) {
            old_view2 = &tmpview;
        }
        address_space_update_topology_pass(as, old_view2, new_view, false);
        address_space_update_topology_pass(as, old_view2, new_view, true);
    }

    /* Writes are protected by the BQL. */
    qatomic_rcu_set(&as->current_map, new_view);
    if (old_view) {
        flatview_unref(old_view);
    }

    /* Note that all the old MemoryRegions are still alive up to this
     * point.  This relieves most MemoryListeners from the need to
     * ref/unref the MemoryRegions they get---unless they use them
     * outside the iothread mutex, in which case precise reference
     * counting is necessary.
     */
    if (old_view) {
        flatview_unref(old_view);
    }
}
The function address_space_update_topology_pass() is defined as follows:
static void address_space_update_topology_pass(AddressSpace *as,
                                               const FlatView *old_view,
                                               const FlatView *new_view,
                                               bool adding)
{
    unsigned iold, inew;
    FlatRange *frold, *frnew;

    /* Generate a symmetric difference of the old and new memory maps.
     * Kill ranges in the old map, and instantiate ranges in the new map.
     */
    iold = inew = 0;
    while (iold < old_view->nr || inew < new_view->nr) {
        if (iold < old_view->nr) {
            frold = &old_view->ranges[iold];
        } else {
            frold = NULL;
        }
        if (inew < new_view->nr) {
            frnew = &new_view->ranges[inew];
        } else {
            frnew = NULL;
        }

        if (frold
            && (!frnew
                || int128_lt(frold->addr.start, frnew->addr.start)
                || (int128_eq(frold->addr.start, frnew->addr.start)
                    && !flatrange_equal(frold, frnew)))) {
            /* In old but not in new, or in both but attributes changed. */

            if (!adding) {
                flat_range_coalesced_io_del(frold, as);
                MEMORY_LISTENER_UPDATE_REGION(frold, as, Reverse, region_del);
            }

            ++iold;
        } else if (frold && frnew && flatrange_equal(frold, frnew)) {
            /* In both and unchanged (except logging may have changed) */

            if (adding) {
                MEMORY_LISTENER_UPDATE_REGION(frnew, as, Forward, region_nop);
                if (frnew->dirty_log_mask & ~frold->dirty_log_mask) {
                    MEMORY_LISTENER_UPDATE_REGION(frnew, as, Forward, log_start,
                                                  frold->dirty_log_mask,
                                                  frnew->dirty_log_mask);
                }
                if (frold->dirty_log_mask & ~frnew->dirty_log_mask) {
                    MEMORY_LISTENER_UPDATE_REGION(frnew, as, Reverse, log_stop,
                                                  frold->dirty_log_mask,
                                                  frnew->dirty_log_mask);
                }
            }

            ++iold;
            ++inew;
        } else {
            /* In new */

            if (adding) {
                MEMORY_LISTENER_UPDATE_REGION(frnew, as, Forward, region_add);
                flat_range_coalesced_io_add(frnew, as);
            }

            ++inew;
        }
    }
}
The function memory_listener_register() is defined as follows:
void memory_listener_register(MemoryListener *listener, AddressSpace *as)
{
    MemoryListener *other = NULL;

    /* Only one of them can be defined for a listener */
    assert(!(listener->log_sync && listener->log_sync_global));

    listener->address_space = as;
    if (QTAILQ_EMPTY(&memory_listeners)
        || listener->priority >= QTAILQ_LAST(&memory_listeners)->priority) {
        QTAILQ_INSERT_TAIL(&memory_listeners, listener, link);
    } else {
        QTAILQ_FOREACH(other, &memory_listeners, link) {
            if (listener->priority < other->priority) {
                break;
            }
        }
        QTAILQ_INSERT_BEFORE(other, listener, link);
    }

    if (QTAILQ_EMPTY(&as->listeners)
        || listener->priority >= QTAILQ_LAST(&as->listeners)->priority) {
        QTAILQ_INSERT_TAIL(&as->listeners, listener, link_as);
    } else {
        QTAILQ_FOREACH(other, &as->listeners, link_as) {
            if (listener->priority < other->priority) {
                break;
            }
        }
        QTAILQ_INSERT_BEFORE(other, listener, link_as);
    }

    listener_add_address_space(listener, as);

    if (listener->eventfd_add || listener->eventfd_del) {
        as->ioeventfd_notifiers++;
    }
}
The function tcg_cpu_machine_done() is defined as follows:
static void tcg_cpu_machine_done(Notifier *n, void *unused)
{
    X86CPU *cpu = container_of(n, X86CPU, machine_done);
    MemoryRegion *smram =
        (MemoryRegion *) object_resolve_path("/machine/smram", NULL);

    if (smram) {
        cpu->smram = g_new(MemoryRegion, 1);
        memory_region_init_alias(cpu->smram, OBJECT(cpu), "smram",
                                 smram, 0, 4 * GiB);
        memory_region_set_enabled(cpu->smram, true);
        memory_region_add_subregion_overlap(cpu->cpu_as_root, 0,
                                            cpu->smram, 1);
    }
}
The function qemu_add_machine_init_done_notifier() is defined as follows:
void qemu_add_machine_init_done_notifier(Notifier *notify)
{
    notifier_list_add(&machine_init_done_notifiers, notify);
    if (phase_check(PHASE_MACHINE_READY)) {
        notify->notify(notify, NULL);
    }
}
The function tcg_cpu_class_init() is defined as follows:
static const struct TCGCPUOps x86_tcg_ops = {
    .initialize = tcg_x86_init,
    .synchronize_from_tb = x86_cpu_synchronize_from_tb,
    .restore_state_to_opc = x86_restore_state_to_opc,
    .cpu_exec_enter = x86_cpu_exec_enter,
    .cpu_exec_exit = x86_cpu_exec_exit,
#ifdef CONFIG_USER_ONLY
    .fake_user_interrupt = x86_cpu_do_interrupt,
    .record_sigsegv = x86_cpu_record_sigsegv,
    .record_sigbus = x86_cpu_record_sigbus,
#else
    .tlb_fill = x86_cpu_tlb_fill,
    .do_interrupt = x86_cpu_do_interrupt,
    .cpu_exec_interrupt = x86_cpu_exec_interrupt,
    .do_unaligned_access = x86_cpu_do_unaligned_access,
    .debug_excp_handler = breakpoint_handler,
    .debug_check_breakpoint = x86_debug_check_breakpoint,
#endif /* !CONFIG_USER_ONLY */
};

static void tcg_cpu_init_ops(AccelCPUClass *accel_cpu, CPUClass *cc)
{
    /* for x86, all cpus use the same set of operations */
    cc->tcg_ops = &x86_tcg_ops;
}

static void tcg_cpu_class_init(CPUClass *cc)
{
    cc->init_accel_cpu = tcg_cpu_init_ops;
}
The function tcg_cpu_instance_init() and its helper tcg_cpu_xsave_init() are defined as follows:
static void tcg_cpu_xsave_init(void)
{
#define XO(bit, field) \
    x86_ext_save_areas[bit].offset = offsetof(X86XSaveArea, field);

    XO(XSTATE_FP_BIT, legacy);
    XO(XSTATE_SSE_BIT, legacy);
    XO(XSTATE_YMM_BIT, avx_state);
    XO(XSTATE_BNDREGS_BIT, bndreg_state);
    XO(XSTATE_BNDCSR_BIT, bndcsr_state);
    XO(XSTATE_OPMASK_BIT, opmask_state);
    XO(XSTATE_ZMM_Hi256_BIT, zmm_hi256_state);
    XO(XSTATE_Hi16_ZMM_BIT, hi16_zmm_state);
    XO(XSTATE_PKRU_BIT, pkru_state);

#undef XO
}

/*
 * TCG-specific defaults that override cpudef models when using TCG.
 * Only for builtin_x86_defs models initialized with x86_register_cpudef_types.
 */
static PropValue tcg_default_props[] = {
    { "vme", "off" },
    { NULL, NULL },
};

static void tcg_cpu_instance_init(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    X86CPUClass *xcc = X86_CPU_GET_CLASS(cpu);

    if (xcc->model) {
        /* Special cases not set in the X86CPUDefinition structs: */
        x86_cpu_apply_props(cpu, tcg_default_props);
    }

    tcg_cpu_xsave_init();
}
Back in cpu_exec_realizefn(), the initialized CPU object is then added to the global CPU list. The function cpu_list_add() is defined as follows:
void cpu_list_add(CPUState *cpu)
{
    QEMU_LOCK_GUARD(&qemu_cpu_list_lock);
    if (cpu->cpu_index == UNASSIGNED_CPU_INDEX) {
        cpu->cpu_index = cpu_get_free_index();
        assert(cpu->cpu_index != UNASSIGNED_CPU_INDEX);
    } else {
        assert(!cpu_index_auto_assigned);
    }
    QTAILQ_INSERT_TAIL_RCU(&cpus_queue, cpu, node);
    cpu_list_generation_id++;
}
The function qdev_get_vmsd() is defined as follows:
const VMStateDescription *qdev_get_vmsd(DeviceState *dev)
{
    DeviceClass *dc = DEVICE_GET_CLASS(dev);
    return dc->vmsd;
}
Returning to x86_cpu_realizefn(), execution continues. The function x86_cpu_get_versioned_cache_info() is defined as follows:
static const CPUCaches *x86_cpu_get_versioned_cache_info(X86CPU *cpu,
                                                         X86CPUModel *model)
{
    const X86CPUVersionDefinition *vdef;
    X86CPUVersion version = x86_cpu_model_resolve_version(model);
    const CPUCaches *cache_info = model->cpudef->cache_info;

    if (version == CPU_VERSION_LEGACY) {
        return cache_info;
    }

    for (vdef = x86_cpu_def_get_versions(model->cpudef); vdef->version; vdef++) {
        if (vdef->cache_info) {
            cache_info = vdef->cache_info;
        }

        if (vdef->version == version) {
            break;
        }
    }

    assert(vdef->version == version);
    return cache_info;
}
The function x86_cpu_model_last_version() is defined as follows:
static X86CPUVersion x86_cpu_model_last_version(const X86CPUModel *model)
{
    int v = 0;
    const X86CPUVersionDefinition *vdef =
        x86_cpu_def_get_versions(model->cpudef);

    while (vdef->version) {
        v = vdef->version;
        vdef++;
    }
    return v;
}

/* Return the actual version being used for a specific CPU model */
static X86CPUVersion x86_cpu_model_resolve_version(const X86CPUModel *model)
{
    X86CPUVersion v = model->version;
    if (v == CPU_VERSION_AUTO) {
        v = default_cpu_version;
    }
    if (v == CPU_VERSION_LATEST) {
        return x86_cpu_model_last_version(model);
    }
    return v;
}
The function x86_cpu_def_get_versions() is defined as follows:
static const X86CPUVersionDefinition *
x86_cpu_def_get_versions(const X86CPUDefinition *def)
{
    /* When X86CPUDefinition::versions is NULL, we register only v1 */
    static const X86CPUVersionDefinition default_version_list[] = {
        { 1 },
        { /* end of list */ }
    };

    return def->versions ?: default_version_list;
}
The function x86_cpu_machine_reset_cb() is defined as follows:
/* TODO: remove me, when reset over QOM tree is implemented */
void x86_cpu_machine_reset_cb(void *opaque)
{
    X86CPU *cpu = opaque;
    cpu_reset(CPU(cpu));
}
The function x86_cpu_apic_create() is defined as follows:

void x86_cpu_apic_create(X86CPU *cpu, Error **errp)
{
    APICCommonState *apic;
    APICCommonClass *apic_class = apic_get_class(errp);

    if (!apic_class) {
        return;
    }

    cpu->apic_state = DEVICE(object_new_with_class(OBJECT_CLASS(apic_class)));
    object_property_add_child(OBJECT(cpu), "lapic",
                              OBJECT(cpu->apic_state));
    object_unref(OBJECT(cpu->apic_state));

    qdev_prop_set_uint32(cpu->apic_state, "id", cpu->apic_id);
    /* TODO: convert to link<> */
    apic = APIC_COMMON(cpu->apic_state);
    apic->cpu = cpu;
    apic->apicbase = APIC_DEFAULT_ADDRESS | MSR_IA32_APICBASE_ENABLE;
}
The function mce_init() is defined as follows:

static void mce_init(X86CPU *cpu)
{
    CPUX86State *cenv = &cpu->env;
    unsigned int bank;

    if (((cenv->cpuid_version >> 8) & 0xf) >= 6
        && (cenv->features[FEAT_1_EDX] & (CPUID_MCE | CPUID_MCA)) ==
           (CPUID_MCE | CPUID_MCA)) {
        cenv->mcg_cap = MCE_CAP_DEF | MCE_BANKS_DEF |
                        (cpu->enable_lmce ? MCG_LMCE_P : 0);
        cenv->mcg_ctl = ~(uint64_t)0;
        for (bank = 0; bank < MCE_BANKS_DEF; bank++) {
            cenv->mce_banks[bank * 4] = ~(uint64_t)0;
        }
    }
}
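The family test operates on the CPUID leaf-1 EAX layout, where bits 11:8 hold the base family, so MCE/MCA state is only set up for family 6 (P6) and later. A small worked sketch with a hypothetical cpuid_version value:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    /* CPUID.1:EAX low bits: stepping[3:0], model[7:4], family[11:8] */
    uint32_t cpuid_version = 0x000606A0;          /* hypothetical value */
    uint32_t family = (cpuid_version >> 8) & 0xf; /* yields 6 */
    printf("family %u: MCE banks %s\n", family,
           family >= 6 ? "initialized" : "skipped");
    return 0;
}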
The function qemu_init_vcpu() is defined as follows:

void qemu_init_vcpu(CPUState *cpu)
{
    MachineState *ms = MACHINE(qdev_get_machine());

    cpu->nr_cores = machine_topo_get_cores_per_socket(ms);
    cpu->nr_threads =  ms->smp.threads;
    cpu->stopped = true;
    cpu->random_seed = qemu_guest_random_seed_thread_part1();

    if (!cpu->as) {
        /* If the target cpu hasn't set up any address spaces itself,
         * give it the default one.
         */
        cpu->num_ases = 1;
        cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
    }

    /* accelerators all implement the AccelOpsClass */
    g_assert(cpus_accel != NULL && cpus_accel->create_vcpu_thread != NULL);
    cpus_accel->create_vcpu_thread(cpu);

    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}
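Note the handshake at the bottom: qemu_init_vcpu() blocks on qemu_cpu_cond until the freshly created vCPU thread marks itself created. A minimal pthread sketch of that pattern (simplified, hypothetical names; in QEMU the signalling happens inside each accelerator's thread function under the global mutex):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t created_cond = PTHREAD_COND_INITIALIZER;
static bool created;

static void *vcpu_thread_fn(void *arg)
{
    pthread_mutex_lock(&lock);
    created = true;                     /* plays the role of cpu->created */
    pthread_cond_signal(&created_cond); /* wake the waiting initializer */
    pthread_mutex_unlock(&lock);
    /* ... enter the vCPU run loop ... */
    return NULL;
}

static void init_vcpu(void)
{
    pthread_t th;

    pthread_mutex_lock(&lock);
    pthread_create(&th, NULL, vcpu_thread_fn, NULL);
    while (!created) {                  /* same wait loop as qemu_init_vcpu() */
        pthread_cond_wait(&created_cond, &lock);
    }
    pthread_mutex_unlock(&lock);
}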
The function machine_topo_get_cores_per_socket() is defined as follows:

unsigned int machine_topo_get_cores_per_socket(const MachineState *ms)
{
    return ms->smp.cores * ms->smp.clusters * ms->smp.dies;
}
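For example, with -smp sockets=2,dies=2,clusters=2,cores=4,threads=2, each socket contains 4 × 2 × 2 = 16 cores (32 hyper-threads); the socket and thread counts deliberately do not enter the product.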
The function qemu_guest_random_seed_thread_part1() is defined as follows:

uint64_t qemu_guest_random_seed_thread_part1(void)
{
    if (deterministic) {
        uint64_t ret;
        glib_random_bytes(&ret, sizeof(ret));
        return ret;
    }
    return 0;
}
In qemu_init_vcpu(), the following call creates the execution thread for each virtual CPU:

cpus_accel->create_vcpu_thread(cpu);

create_vcpu_thread() has a separate implementation for each accelerator; taking the WHPX accelerator as an example:

static void whpx_start_vcpu_thread(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_new0(QemuThread, 1);
    cpu->halt_cond = g_new0(QemuCond, 1);
    qemu_cond_init(cpu->halt_cond);

    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, whpx_cpu_thread_fn,
                       cpu, QEMU_THREAD_JOINABLE);
}
The function qemu_thread_create() is defined as follows (this is the Win32 implementation, matching the WHPX example above):

void qemu_thread_create(QemuThread *thread, const char *name,
                        void *(*start_routine)(void *),
                        void *arg, int mode)
{
    HANDLE hThread;
    struct QemuThreadData *data;

    data = g_malloc(sizeof *data);
    data->start_routine = start_routine;
    data->arg = arg;
    data->mode = mode;
    data->exited = false;
    notifier_list_init(&data->exit);

    if (data->mode != QEMU_THREAD_DETACHED) {
        InitializeCriticalSection(&data->cs);
    }

    hThread = (HANDLE) _beginthreadex(NULL, 0, win32_start_routine,
                                      data, 0, &thread->tid);
    if (!hThread) {
        error_exit(GetLastError(), __func__);
    }
    if (name_threads && name && !set_thread_description(hThread, name)) {
        fprintf(stderr, "qemu: failed to set thread description: %s\n", name);
    }
    CloseHandle(hThread);

    thread->data = data;
}
The function x86_cpu_apic_realize() is defined as follows:

void x86_cpu_apic_realize(X86CPU *cpu, Error **errp)
{
    APICCommonState *apic;
    static bool apic_mmio_map_once;

    if (cpu->apic_state == NULL) {
        return;
    }
    qdev_realize(DEVICE(cpu->apic_state), NULL, errp);

    /* Map APIC MMIO area */
    apic = APIC_COMMON(cpu->apic_state);
    if (!apic_mmio_map_once) {
        memory_region_add_subregion_overlap(get_system_memory(),
                                            apic->apicbase &
                                            MSR_IA32_APICBASE_BASE,
                                            &apic->io_memory,
                                            0x1000);
        apic_mmio_map_once = true;
    }
}
The function cpu_reset() is defined as follows:

void cpu_reset(CPUState *cpu)
{
    device_cold_reset(DEVICE(cpu));

    trace_cpu_reset(cpu->cpu_index);
}
The function device_cold_reset() is defined as follows:

void device_cold_reset(DeviceState *dev)
{
    resettable_reset(OBJECT(dev), RESET_TYPE_COLD);
}
The function resettable_reset() is defined as follows:

/**
 * enter_phase_in_progress:
 * True if we are currently in reset enter phase.
 *
 * exit_phase_in_progress:
 * count the number of exit phase we are in.
 *
 * Note: These flags are only used to guarantee (using asserts) that the reset
 * API is used correctly. We can use global variables because we rely on the
 * iothread mutex to ensure only one reset operation is in a progress at a
 * given time.
 */
static bool enter_phase_in_progress;
static unsigned exit_phase_in_progress;

void resettable_reset(Object *obj, ResetType type)
{
    trace_resettable_reset(obj, type);
    resettable_assert_reset(obj, type);
    resettable_release_reset(obj, type);
}

void resettable_assert_reset(Object *obj, ResetType type)
{
    /* TODO: change this assert when adding support for other reset types */
    assert(type == RESET_TYPE_COLD);
    trace_resettable_reset_assert_begin(obj, type);
    assert(!enter_phase_in_progress);

    enter_phase_in_progress = true;
    resettable_phase_enter(obj, NULL, type);
    enter_phase_in_progress = false;

    resettable_phase_hold(obj, NULL, type);

    trace_resettable_reset_assert_end(obj);
}

void resettable_release_reset(Object *obj, ResetType type)
{
    /* TODO: change this assert when adding support for other reset types */
    assert(type == RESET_TYPE_COLD);
    trace_resettable_reset_release_begin(obj, type);
    assert(!enter_phase_in_progress);

    exit_phase_in_progress += 1;
    resettable_phase_exit(obj, NULL, type);
    exit_phase_in_progress -= 1;

    trace_resettable_reset_release_end(obj);
}
qemu_create_cli_devices()
The function qemu_create_cli_devices() is defined as follows:

static void qemu_create_cli_devices(void)
{
    DeviceOption *opt;

    soundhw_init();

    qemu_opts_foreach(qemu_find_opts("fw_cfg"),
                      parse_fw_cfg, fw_cfg_find(), &error_fatal);

    /* init USB devices */
    if (machine_usb(current_machine)) {
        if (foreach_device_config(DEV_USB, usb_parse) < 0)
            exit(1);
    }

    /* init generic devices */
    rom_set_order_override(FW_CFG_ORDER_OVERRIDE_DEVICE);
    qemu_opts_foreach(qemu_find_opts("device"),
                      device_init_func, NULL, &error_fatal);
    QTAILQ_FOREACH(opt, &device_opts, next) {
        DeviceState *dev;
        loc_push_restore(&opt->loc);
        /*
         * TODO Eventually we should call qmp_device_add() here to make sure it
         * behaves the same, but QMP still has to accept incorrectly typed
         * options until libvirt is fixed and we want to be strict on the CLI
         * from the start, so call qdev_device_add_from_qdict() directly for
         * now.
         */
        dev = qdev_device_add_from_qdict(opt->opts, true, &error_fatal);
        object_unref(OBJECT(dev));
        loc_pop(&opt->loc);
    }
    rom_reset_order_override();
}
The function soundhw_init() is defined as follows:

void soundhw_init(void)
{
    struct soundhw *c = selected;
    ISABus *isa_bus = (ISABus *) object_resolve_path_type("", TYPE_ISA_BUS, NULL);
    PCIBus *pci_bus = (PCIBus *) object_resolve_path_type("", TYPE_PCI_BUS, NULL);
    BusState *bus;

    if (!c) {
        return;
    }
    if (c->isa) {
        if (!isa_bus) {
            error_report("ISA bus not available for %s", c->name);
            exit(1);
        }
        bus = BUS(isa_bus);
    } else {
        if (!pci_bus) {
            error_report("PCI bus not available for %s", c->name);
            exit(1);
        }
        bus = BUS(pci_bus);
    }

    if (c->typename) {
        DeviceState *dev = qdev_new(c->typename);
        qdev_prop_set_string(dev, "audiodev", audiodev_id);
        qdev_realize_and_unref(dev, bus, &error_fatal);
    } else {
        assert(!c->isa);
        c->init_pci(pci_bus, audiodev_id);
    }
}
The function machine_usb() is defined as follows:

bool machine_usb(MachineState *machine)
{
    return machine->usb;
}
The function rom_set_order_override() is defined as follows:

void rom_set_order_override(int order)
{
    if (!fw_cfg)
        return;
    fw_cfg_set_order_override(fw_cfg, order);
}
The function fw_cfg_set_order_override() is defined as follows:

void fw_cfg_set_order_override(FWCfgState *s, int order)
{
    assert(s->fw_cfg_order_override == 0);
    s->fw_cfg_order_override = order;
}
The function rom_reset_order_override() is defined as follows:

void rom_reset_order_override(void)
{
    if (!fw_cfg)
        return;
    fw_cfg_reset_order_override(fw_cfg);
}
The function fw_cfg_reset_order_override() is defined as follows:

void fw_cfg_reset_order_override(FWCfgState *s)
{
    assert(s->fw_cfg_order_override != 0);
    s->fw_cfg_order_override = 0;
}
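As qemu_create_cli_devices() above shows, the two calls are used as a bracket around a batch of device creation; sketched:

rom_set_order_override(FW_CFG_ORDER_OVERRIDE_DEVICE);
/* ... create the -device command-line devices; any ROMs they register
 * are grouped under the "device" fw_cfg boot-order bucket ... */
rom_reset_order_override();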
qemu_machine_creation_done()
The function qemu_machine_creation_done() is defined as follows:

static void qemu_machine_creation_done(void)
{
    MachineState *machine = MACHINE(qdev_get_machine());

    /* Did we create any drives that we failed to create a device for? */
    drive_check_orphaned();

    /* Don't warn about the default network setup that you get if
     * no command line -net or -netdev options are specified. There
     * are two cases that we would otherwise complain about:
     * (1) board doesn't support a NIC but the implicit "-net nic"
     *     requested one
     * (2) CONFIG_SLIRP not set, in which case the implicit "-net nic"
     *     sets up a nic that isn't connected to anything.
     */
    if (!default_net && (!qtest_enabled() || has_defaults)) {
        net_check_clients();
    }

    qdev_prop_check_globals();

    qdev_machine_creation_done();

    if (machine->cgs) {
        /*
         * Verify that Confidential Guest Support has actually been initialized
         */
        assert(machine->cgs->ready);
    }

    if (foreach_device_config(DEV_GDB, gdbserver_start) < 0) {
        exit(1);
    }
    if (!vga_interface_created && !default_vga &&
        vga_interface_type != VGA_NONE) {
        warn_report("A -vga option was passed but this machine "
                    "type does not use that option; "
                    "No VGA device has been created");
    }
}
The function drive_check_orphaned() is defined as follows:

/*
 * Check board claimed all -drive that are meant to be claimed.
 * Fatal error if any remain unclaimed.
 */
void drive_check_orphaned(void)
{
    BlockBackend *blk;
    DriveInfo *dinfo;
    Location loc;
    bool orphans = false;

    GLOBAL_STATE_CODE();

    for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {
        dinfo = blk_legacy_dinfo(blk);
        /*
         * Ignore default drives, because we create certain default
         * drives unconditionally, then leave them unclaimed.  Not the
         * users fault.
         * Ignore IF_VIRTIO or IF_XEN, because it gets desugared into
         * -device, so we can leave failing to -device.
         * Ignore IF_NONE, because leaving unclaimed IF_NONE remains
         * available for device_add is a feature.
         */
        if (dinfo->is_default || dinfo->type == IF_VIRTIO
            || dinfo->type == IF_XEN || dinfo->type == IF_NONE) {
            continue;
        }
        if (!blk_get_attached_dev(blk)) {
            loc_push_none(&loc);
            qemu_opts_loc_restore(dinfo->opts);
            error_report("machine type does not support"
                         " if=%s,bus=%d,unit=%d",
                         if_name[dinfo->type], dinfo->bus, dinfo->unit);
            loc_pop(&loc);
            orphans = true;
        }
    }

    if (orphans) {
        exit(1);
    }
}
The function net_check_clients() is defined as follows:

void net_check_clients(void)
{
    NetClientState *nc;
    int i;

    net_hub_check_clients();

    QTAILQ_FOREACH(nc, &net_clients, next) {
        if (!nc->peer) {
            warn_report("%s %s has no peer",
                        nc->info->type == NET_CLIENT_DRIVER_NIC
                        ? "nic" : "netdev",
                        nc->name);
        }
    }

    /* Check that all NICs requested via -net nic actually got created.
     * NICs created via -device don't need to be checked here because
     * they are always instantiated.
     */
    for (i = 0; i < MAX_NICS; i++) {
        NICInfo *nd = &nd_table[i];
        if (nd->used && !nd->instantiated) {
            warn_report("requested NIC (%s, model %s) "
                        "was not created (not supported by this machine?)",
                        nd->name ? nd->name : "anonymous",
                        nd->model ? nd->model : "unspecified");
        }
    }
}
The function net_hub_check_clients() is defined as follows:

/**
 * Warn if hub configurations are likely wrong
 */
void net_hub_check_clients(void)
{
    NetHub *hub;
    NetHubPort *port;
    NetClientState *peer;

    QLIST_FOREACH(hub, &hubs, next) {
        int has_nic = 0, has_host_dev = 0;

        QLIST_FOREACH(port, &hub->ports, next) {
            peer = port->nc.peer;
            if (!peer) {
                warn_report("hub port %s has no peer", port->nc.name);
                continue;
            }

            switch (peer->info->type) {
            case NET_CLIENT_DRIVER_NIC:
                has_nic = 1;
                break;
            case NET_CLIENT_DRIVER_USER:
            case NET_CLIENT_DRIVER_TAP:
            case NET_CLIENT_DRIVER_SOCKET:
            case NET_CLIENT_DRIVER_STREAM:
            case NET_CLIENT_DRIVER_DGRAM:
            case NET_CLIENT_DRIVER_VDE:
            case NET_CLIENT_DRIVER_VHOST_USER:
                has_host_dev = 1;
                break;
            default:
                break;
            }
        }
        if (has_host_dev && !has_nic) {
            warn_report("hub %d with no nics", hub->id);
        }
        if (has_nic && !has_host_dev && !qtest_enabled()) {
            warn_report("hub %d is not connected to host network", hub->id);
        }
    }
}
The function qdev_prop_check_globals() is defined as follows:

int qdev_prop_check_globals(void)
{
    int i, ret = 0;

    for (i = 0; i < global_props()->len; i++) {
        GlobalProperty *prop;
        ObjectClass *oc;
        DeviceClass *dc;

        prop = g_ptr_array_index(global_props(), i);
        if (prop->used) {
            continue;
        }
        oc = object_class_by_name(prop->driver);
        oc = object_class_dynamic_cast(oc, TYPE_DEVICE);
        if (!oc) {
            warn_report("global %s.%s has invalid class name",
                        prop->driver, prop->property);
            ret = 1;
            continue;
        }
        dc = DEVICE_CLASS(oc);
        if (!dc->hotpluggable && !prop->used) {
            warn_report("global %s.%s=%s not used",
                        prop->driver, prop->property, prop->value);
            ret = 1;
            continue;
        }
    }
    return ret;
}
The function qdev_machine_creation_done() is defined as follows:

void qdev_machine_creation_done(void)
{
    cpu_synchronize_all_post_init();

    if (current_machine->boot_config.once) {
        qemu_boot_set(current_machine->boot_config.once, &error_fatal);
        qemu_register_reset(restore_boot_order,
                            g_strdup(current_machine->boot_config.order));
    }

    /*
     * ok, initial machine setup is done, starting from now we can
     * only create hotpluggable devices
     */
    phase_advance(PHASE_MACHINE_READY);
    qdev_assert_realized_properly();

    /* TODO: once all bus devices are qdevified, this should be done
     * when bus is created by qdev.c */
    /*
     * TODO: If we had a main 'reset container' that the whole system
     * lived in, we could reset that using the multi-phase reset
     * APIs. For the moment, we just reset the sysbus, which will cause
     * all devices hanging off it (and all their child buses, recursively)
     * to be reset. Note that this will *not* reset any Device objects
     * which are not attached to some part of the qbus tree!
     */
    qemu_register_reset(resettable_cold_reset_fn, sysbus_get_default());

    notifier_list_notify(&machine_init_done_notifiers, NULL);

    if (rom_check_and_register_reset() != 0) {
        exit(1);
    }

    replay_start();

    /* This checkpoint is required by replay to separate prior clock
       reading from the other reads, because timer polling functions query
       clock values from the log. */
    replay_checkpoint(CHECKPOINT_RESET);
    qemu_system_reset(SHUTDOWN_CAUSE_NONE);
    register_global_state();
}
The function cpu_synchronize_all_post_init() is defined as follows:

void cpu_synchronize_all_post_init(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_post_init(cpu);
    }
}
The function cpu_synchronize_post_init() is defined as follows:

void cpu_synchronize_post_init(CPUState *cpu)
{
    if (cpus_accel->synchronize_post_init) {
        cpus_accel->synchronize_post_init(cpu);
    }
}
synchronize_post_init() likewise has a per-accelerator implementation; taking WHPX as an example, the function whpx_cpu_synchronize_post_init() is defined as follows:

void whpx_cpu_synchronize_post_init(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
}
The function do_whpx_cpu_synchronize_post_init() is defined as follows:

static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
                                              run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
    cpu->vcpu_dirty = false;
}
The function whpx_set_registers() is defined as follows; its level argument is one of the WHPX_SET_* state subsets defined just above it:

/* state subset only touched by the VCPU itself during runtime */
#define WHPX_SET_RUNTIME_STATE 1
/* state subset modified during VCPU reset */
#define WHPX_SET_RESET_STATE 2
/* full state set, modified during initialization or on vmload */
#define WHPX_SET_FULL_STATE 3

static void whpx_set_registers(CPUState *cpu, int level)
{struct whpx_state *whpx = &whpx_global;AccelCPUState *vcpu = cpu->accel;X86CPU *x86_cpu = X86_CPU(cpu);CPUX86State *env = &x86_cpu->env;struct whpx_register_set vcxt;HRESULT hr;int idx;int idx_next;int i;int v86, r86;assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));/** Following MSRs have side effects on the guest or are too heavy for* runtime. Limit them to full state update.*/if (level >= WHPX_SET_RESET_STATE) {whpx_set_tsc(cpu);}memset(&vcxt, 0, sizeof(struct whpx_register_set));v86 = (env->eflags & VM_MASK);r86 = !(env->cr[0] & CR0_PE_MASK);vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);idx = 0;/* Indexes for first 16 registers match between HV and QEMU definitions */idx_next = 16;for (idx = 0; idx < CPU_NB_REGS; idx += 1) {vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx];}idx = idx_next;/* Same goes for RIP and RFLAGS */assert(whpx_register_names[idx] == WHvX64RegisterRip);vcxt.values[idx++].Reg64 = env->eip;assert(whpx_register_names[idx] == WHvX64RegisterRflags);vcxt.values[idx++].Reg64 = env->eflags;/* Translate 6+4 segment registers. HV and QEMU order matches */assert(idx == WHvX64RegisterEs);for (i = 0; i < 6; i += 1, idx += 1) {vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);}assert(idx == WHvX64RegisterLdtr);vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);assert(idx == WHvX64RegisterTr);vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);assert(idx == WHvX64RegisterIdtr);vcxt.values[idx].Table.Base = env->idt.base;vcxt.values[idx].Table.Limit = env->idt.limit;idx += 1;assert(idx == WHvX64RegisterGdtr);vcxt.values[idx].Table.Base = env->gdt.base;vcxt.values[idx].Table.Limit = env->gdt.limit;idx += 1;/* CR0, 2, 3, 4, 8 */assert(whpx_register_names[idx] == WHvX64RegisterCr0);vcxt.values[idx++].Reg64 = env->cr[0];assert(whpx_register_names[idx] == WHvX64RegisterCr2);vcxt.values[idx++].Reg64 = env->cr[2];assert(whpx_register_names[idx] == WHvX64RegisterCr3);vcxt.values[idx++].Reg64 = env->cr[3];assert(whpx_register_names[idx] == WHvX64RegisterCr4);vcxt.values[idx++].Reg64 = env->cr[4];assert(whpx_register_names[idx] == WHvX64RegisterCr8);vcxt.values[idx++].Reg64 = vcpu->tpr;/* 8 Debug Registers - Skipped *//** Extended control registers needs to be handled separately depending* on whether xsave is supported/enabled or not.*/whpx_set_xcrs(cpu);/* 16 XMM registers */assert(whpx_register_names[idx] == WHvX64RegisterXmm0);idx_next = idx + 16;for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);}idx = idx_next;/* 8 FP registers */assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);for (i = 0; i < 8; i += 1, idx += 1) {vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);/* vcxt.values[idx].Fp.AsUINT128.High64 =env->fpregs[i].mmx.MMX_Q(1);*/}/* FP control status register */assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;vcxt.values[idx].FpControlStatus.FpStatus =(env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;vcxt.values[idx].FpControlStatus.FpTag = 0;for (i = 0; i < 8; ++i) {vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;}vcxt.values[idx].FpControlStatus.Reserved = 0;vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;idx += 1;/* XMM control status register 
*/assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;idx += 1;/* MSRs */assert(whpx_register_names[idx] == WHvX64RegisterEfer);vcxt.values[idx++].Reg64 = env->efer;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    vcxt.values[idx++].Reg64 = env->kernelgsbase;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    vcxt.values[idx++].Reg64 = vcpu->apic_base;

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    vcxt.values[idx++].Reg64 = env->sysenter_cs;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    vcxt.values[idx++].Reg64 = env->sysenter_eip;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    vcxt.values[idx++].Reg64 = env->sysenter_esp;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    vcxt.values[idx++].Reg64 = env->star;

#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    vcxt.values[idx++].Reg64 = env->lstar;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    vcxt.values[idx++].Reg64 = env->cstar;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    vcxt.values[idx++].Reg64 = env->fmask;
#endif

    /* Interrupt / Event Registers - Skipped */

    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
                     hr);
    }

    return;
}
The function whpx_set_tsc() is defined as follows:

static int whpx_set_tsc(CPUState *cpu)
{
    CPUX86State *env = cpu_env(cpu);
    WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE tsc_val;
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;

    /*
     * Suspend the partition prior to setting the TSC to reduce the variance
     * in TSC across vCPUs. When the first vCPU runs post suspend, the
     * partition is automatically resumed.
     */
    if (whp_dispatch.WHvSuspendPartitionTime) {
        /*
         * Unable to suspend partition while setting TSC is not a fatal
         * error. It just increases the likelihood of TSC variance between
         * vCPUs and some guest OS are able to handle that just fine.
         */
        hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition);
        if (FAILED(hr)) {
            warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
        }
    }

    tsc_val.Reg64 = env->tsc;
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
        return -1;
    }

    return 0;
}
The function whpx_apic_tpr_to_cr8() is defined as follows:

/*
 * The CR8 register in the CPU is mapped to the TPR register of the APIC,
 * however, they use a slightly different encoding. Specifically:
 *
 * APIC.TPR[bits 7:4] = CR8[bits 3:0]
 *
 * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
 * and IA-32 Architectures Software Developer's Manual.
 *
 * The functions below translate the value of CR8 to TPR and vice versa.
 */

static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
{
    return tpr >> 4;
}

static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
{
    return cr8 << 4;
}
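A quick worked example of the mapping: an APIC TPR value of 0xB0 becomes CR8 = 0xB0 >> 4 = 0xB, and converting back gives 0xB << 4 = 0xB0; only the priority class in TPR bits 7:4 survives the round trip, the sub-class in bits 3:0 is lost.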
The function whpx_seg_q2h() is defined as follows:

static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
                                             int r86)
{
    WHV_X64_SEGMENT_REGISTER hs;
    unsigned flags = qs->flags;

    hs.Base = qs->base;
    hs.Limit = qs->limit;
    hs.Selector = qs->selector;

    if (v86) {
        hs.Attributes = 0;
        hs.SegmentType = 3;
        hs.Present = 1;
        hs.DescriptorPrivilegeLevel = 3;
        hs.NonSystemSegment = 1;
    } else {
        hs.Attributes = (flags >> DESC_TYPE_SHIFT);

        if (r86) {
            /* hs.Base &= 0xfffff; */
        }
    }

    return hs;
}
The function whpx_set_xcrs() is defined as follows:

/* X64 Extended Control Registers */
static void whpx_set_xcrs(CPUState *cpu)
{
    CPUX86State *env = cpu_env(cpu);
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    WHV_REGISTER_VALUE xcr0;
    WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;

    if (!whpx_has_xsave()) {
        return;
    }

    /* Only xcr0 is supported by the hypervisor currently */
    xcr0.Reg64 = env->xcr0;
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr);
    }
}
The function whpx_has_xsave() is defined as follows:

static bool whpx_has_xsave(void)
{
    return whpx_xsave_cap.XsaveSupport;
}
The function qemu_boot_set() is defined as follows:

void qemu_boot_set(const char *boot_order, Error **errp)
{
    Error *local_err = NULL;

    if (!boot_set_handler) {
        error_setg(errp, "no function defined to set boot device list for"
                         " this architecture");
        return;
    }
    validate_bootdevices(boot_order, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return;
    }
    boot_set_handler(boot_set_opaque, boot_order, errp);
}
The function validate_bootdevices() is defined as follows:

void validate_bootdevices(const char *devices, Error **errp)
{
    /* We just do some generic consistency checks */
    const char *p;
    int bitmap = 0;

    for (p = devices; *p != '\0'; p++) {
        /* Allowed boot devices are:
         * a-b: floppy disk drives
         * c-f: IDE disk drives
         * g-m: machine implementation dependent drives
         * n-p: network devices
         * It's up to each machine implementation to check if the given boot
         * devices match the actual hardware implementation and firmware
         * features.
         */
        if (*p < 'a' || *p > 'p') {
            error_setg(errp, "Invalid boot device '%c'", *p);
            return;
        }
        if (bitmap & (1 << (*p - 'a'))) {
            error_setg(errp, "Boot device '%c' was given twice", *p);
            return;
        }
        bitmap |= 1 << (*p - 'a');
    }
}
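A short usage sketch (on PC machines, 'c' is conventionally the first hard disk and 'd' the CD-ROM, as in -boot order=dc):

Error *err = NULL;

validate_bootdevices("dc", &err);   /* OK: chars within 'a'..'p', none repeated */
assert(err == NULL);

validate_bootdevices("dd", &err);   /* sets err: "Boot device 'd' was given twice" */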
The function restore_boot_order() is defined as follows:

void restore_boot_order(void *opaque)
{
    char *normal_boot_order = opaque;
    static int first = 1;

    /* Restore boot order and remove ourselves after the first boot */
    if (first) {
        first = 0;
        return;
    }

    if (boot_set_handler) {
        qemu_boot_set(normal_boot_order, &error_abort);
    }

    qemu_unregister_reset(restore_boot_order, normal_boot_order);
    g_free(normal_boot_order);
}
load_snapshot()
The function load_snapshot() is defined as follows:

bool load_snapshot(const char *name, const char *vmstate,
                   bool has_devices, strList *devices, Error **errp)
{
    BlockDriverState *bs_vm_state;
    QEMUSnapshotInfo sn;
    QEMUFile *f;
    int ret;
    AioContext *aio_context;
    MigrationIncomingState *mis = migration_incoming_get_current();

    if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
        return false;
    }
    ret = bdrv_all_has_snapshot(name, has_devices, devices, errp);
    if (ret < 0) {
        return false;
    }
    if (ret == 0) {
        error_setg(errp, "Snapshot '%s' does not exist in one or more devices",
                   name);
        return false;
    }

    bs_vm_state = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp);
    if (!bs_vm_state) {
        return false;
    }
    aio_context = bdrv_get_aio_context(bs_vm_state);

    /* Don't even try to load empty VM states */
    aio_context_acquire(aio_context);
    ret = bdrv_snapshot_find(bs_vm_state, &sn, name);
    aio_context_release(aio_context);
    if (ret < 0) {
        return false;
    } else if (sn.vm_state_size == 0) {
        error_setg(errp, "This is a disk-only snapshot. Revert to it "
                   " offline using qemu-img");
        return false;
    }

    /*
     * Flush the record/replay queue. Now the VM state is going
     * to change. Therefore we don't need to preserve its consistency
     */
    replay_flush_events();

    /* Flush all IO requests so they don't interfere with the new state.  */
    bdrv_drain_all_begin();

    ret = bdrv_all_goto_snapshot(name, has_devices, devices, errp);
    if (ret < 0) {
        goto err_drain;
    }

    /* restore the VM state */
    f = qemu_fopen_bdrv(bs_vm_state, 0);
    if (!f) {
        error_setg(errp, "Could not open VM state file");
        goto err_drain;
    }

    qemu_system_reset(SHUTDOWN_CAUSE_SNAPSHOT_LOAD);
    mis->from_src_file = f;

    if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
        ret = -EINVAL;
        goto err_drain;
    }
    aio_context_acquire(aio_context);
    ret = qemu_loadvm_state(f);
    migration_incoming_state_destroy();
    aio_context_release(aio_context);

    bdrv_drain_all_end();

    if (ret < 0) {
        error_setg(errp, "Error %d while loading VM state", ret);
        return false;
    }

    return true;

err_drain:
    bdrv_drain_all_end();
    return false;
}
The function bdrv_all_can_snapshot() is defined as follows:

/* Group operations. All block drivers are involved.
 * These functions will properly handle dataplane (take aio_context_acquire
 * when appropriate for appropriate block drivers) */

bool bdrv_all_can_snapshot(bool has_devices, strList *devices,
                           Error **errp)
{
    g_autoptr(GList) bdrvs = NULL;
    GList *iterbdrvs;

    GLOBAL_STATE_CODE();
    GRAPH_RDLOCK_GUARD_MAINLOOP();

    if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) {
        return false;
    }

    iterbdrvs = bdrvs;
    while (iterbdrvs) {
        BlockDriverState *bs = iterbdrvs->data;
        AioContext *ctx = bdrv_get_aio_context(bs);
        bool ok = true;

        aio_context_acquire(ctx);
        if (devices || bdrv_all_snapshots_includes_bs(bs)) {
            ok = bdrv_can_snapshot(bs);
        }
        aio_context_release(ctx);
        if (!ok) {
            error_setg(errp, "Device '%s' is writable but does not support "
                       "snapshots", bdrv_get_device_or_node_name(bs));
            return false;
        }

        iterbdrvs = iterbdrvs->next;
    }

    return true;
}
The function bdrv_all_has_snapshot() is defined as follows:

int bdrv_all_has_snapshot(const char *name,
                          bool has_devices, strList *devices,
                          Error **errp)
{
    g_autoptr(GList) bdrvs = NULL;
    GList *iterbdrvs;

    GLOBAL_STATE_CODE();
    GRAPH_RDLOCK_GUARD_MAINLOOP();

    if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) {
        return -1;
    }

    iterbdrvs = bdrvs;
    while (iterbdrvs) {
        BlockDriverState *bs = iterbdrvs->data;
        AioContext *ctx = bdrv_get_aio_context(bs);
        QEMUSnapshotInfo sn;
        int ret = 0;

        aio_context_acquire(ctx);
        if (devices || bdrv_all_snapshots_includes_bs(bs)) {
            ret = bdrv_snapshot_find(bs, &sn, name);
        }
        aio_context_release(ctx);
        if (ret < 0) {
            if (ret == -ENOENT) {
                return 0;
            } else {
                error_setg_errno(errp, errno,
                                 "Could not check snapshot '%s' on '%s'",
                                 name, bdrv_get_device_or_node_name(bs));
                return -1;
            }
        }

        iterbdrvs = iterbdrvs->next;
    }

    return 1;
}
The function bdrv_all_find_vmstate_bs() is defined as follows:

BlockDriverState *bdrv_all_find_vmstate_bs(const char *vmstate_bs,
                                           bool has_devices, strList *devices,
                                           Error **errp)
{
    g_autoptr(GList) bdrvs = NULL;
    GList *iterbdrvs;

    GLOBAL_STATE_CODE();
    GRAPH_RDLOCK_GUARD_MAINLOOP();

    if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) {
        return NULL;
    }

    iterbdrvs = bdrvs;
    while (iterbdrvs) {
        BlockDriverState *bs = iterbdrvs->data;
        AioContext *ctx = bdrv_get_aio_context(bs);
        bool found = false;

        aio_context_acquire(ctx);
        found = (devices || bdrv_all_snapshots_includes_bs(bs)) &&
            bdrv_can_snapshot(bs);
        aio_context_release(ctx);

        if (vmstate_bs) {
            if (g_str_equal(vmstate_bs,
                            bdrv_get_node_name(bs))) {
                if (found) {
                    return bs;
                } else {
                    error_setg(errp,
                               "vmstate block device '%s' does not support snapshots",
                               vmstate_bs);
                    return NULL;
                }
            }
        } else if (found) {
            return bs;
        }

        iterbdrvs = iterbdrvs->next;
    }

    if (vmstate_bs) {
        error_setg(errp,
                   "vmstate block device '%s' does not exist", vmstate_bs);
    } else {
        error_setg(errp,
                   "no block device can store vmstate for snapshot");
    }
    return NULL;
}
The function replay_flush_events() is defined as follows:

void replay_flush_events(void)
{
    if (replay_mode == REPLAY_MODE_NONE) {
        return;
    }

    g_assert(replay_mutex_locked());
    while (!QTAILQ_EMPTY(&events_list)) {
        Event *event = QTAILQ_FIRST(&events_list);
        replay_run_event(event);
        QTAILQ_REMOVE(&events_list, event, events);
        g_free(event);
    }
}
The function replay_run_event() is defined as follows:

static void replay_run_event(Event *event)
{
    switch (event->event_kind) {
    case REPLAY_ASYNC_EVENT_BH:
        aio_bh_call(event->opaque);
        break;
    case REPLAY_ASYNC_EVENT_BH_ONESHOT:
        ((QEMUBHFunc *)event->opaque)(event->opaque2);
        break;
    case REPLAY_ASYNC_EVENT_INPUT:
        qemu_input_event_send_impl(NULL, (InputEvent *)event->opaque);
        qapi_free_InputEvent((InputEvent *)event->opaque);
        break;
    case REPLAY_ASYNC_EVENT_INPUT_SYNC:
        qemu_input_event_sync_impl();
        break;
    case REPLAY_ASYNC_EVENT_CHAR_READ:
        replay_event_char_read_run(event->opaque);
        break;
    case REPLAY_ASYNC_EVENT_BLOCK:
        aio_bh_call(event->opaque);
        break;
    case REPLAY_ASYNC_EVENT_NET:
        replay_event_net_run(event->opaque);
        break;
    default:
        error_report("Replay: invalid async event ID (%d) in the queue",
                     event->event_kind);
        exit(1);
        break;
    }
}
The function bdrv_drain_all_begin() is defined as follows:

void coroutine_mixed_fn bdrv_drain_all_begin(void)
{
    BlockDriverState *bs = NULL;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(NULL, true, NULL, true);
        return;
    }

    /*
     * bdrv queue is managed by record/replay,
     * waiting for finishing the I/O requests may
     * be infinite
     */
    if (replay_events_enabled()) {
        return;
    }

    bdrv_drain_all_begin_nopoll();

    /* Now poll the in-flight requests */
    AIO_WAIT_WHILE_UNLOCKED(NULL, bdrv_drain_all_poll());

    while ((bs = bdrv_next_all_states(bs))) {
        bdrv_drain_assert_idle(bs);
    }
}
The function bdrv_all_goto_snapshot() is defined as follows:

int bdrv_all_goto_snapshot(const char *name,
                           bool has_devices, strList *devices,
                           Error **errp)
{
    g_autoptr(GList) bdrvs = NULL;
    GList *iterbdrvs;
    int ret;

    GLOBAL_STATE_CODE();

    bdrv_graph_rdlock_main_loop();
    ret = bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp);
    bdrv_graph_rdunlock_main_loop();

    if (ret < 0) {
        return -1;
    }

    iterbdrvs = bdrvs;
    while (iterbdrvs) {
        BlockDriverState *bs = iterbdrvs->data;
        AioContext *ctx = bdrv_get_aio_context(bs);
        bool all_snapshots_includes_bs;

        aio_context_acquire(ctx);
        bdrv_graph_rdlock_main_loop();
        all_snapshots_includes_bs = bdrv_all_snapshots_includes_bs(bs);
        bdrv_graph_rdunlock_main_loop();

        ret = (devices || all_snapshots_includes_bs) ?
              bdrv_snapshot_goto(bs, name, errp) : 0;
        aio_context_release(ctx);
        if (ret < 0) {
            bdrv_graph_rdlock_main_loop();
            error_prepend(errp, "Could not load snapshot '%s' on '%s': ",
                          name, bdrv_get_device_or_node_name(bs));
            bdrv_graph_rdunlock_main_loop();
            return -1;
        }

        iterbdrvs = iterbdrvs->next;
    }

    return 0;
}
The function qemu_fopen_bdrv() is defined as follows:

static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int is_writable)
{
    if (is_writable) {
        return qemu_file_new_output(QIO_CHANNEL(qio_channel_block_new(bs)));
    } else {
        return qemu_file_new_input(QIO_CHANNEL(qio_channel_block_new(bs)));
    }
}
The function qemu_system_reset() is defined as follows:

/*
 * Reset the VM. Issue an event unless @reason is SHUTDOWN_CAUSE_NONE.
 */
void qemu_system_reset(ShutdownCause reason)
{
    MachineClass *mc;

    mc = current_machine ? MACHINE_GET_CLASS(current_machine) : NULL;

    cpu_synchronize_all_states();

    if (mc && mc->reset) {
        mc->reset(current_machine, reason);
    } else {
        qemu_devices_reset(reason);
    }
    switch (reason) {
    case SHUTDOWN_CAUSE_NONE:
    case SHUTDOWN_CAUSE_SUBSYSTEM_RESET:
    case SHUTDOWN_CAUSE_SNAPSHOT_LOAD:
        break;
    default:
        qapi_event_send_reset(shutdown_caused_by_guest(reason), reason);
    }
    cpu_synchronize_all_post_reset();
}
The function cpu_synchronize_all_states() is defined as follows:

void cpu_synchronize_all_states(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_state(cpu);
    }
}
The function cpu_synchronize_state() is defined as follows:

void cpu_synchronize_state(CPUState *cpu)
{
    if (cpus_accel->synchronize_state) {
        cpus_accel->synchronize_state(cpu);
    }
}
synchronize_state() also differs per accelerator; taking WHPX as an example, the function whpx_cpu_synchronize_state() is defined as follows:

void whpx_cpu_synchronize_state(CPUState *cpu)
{
    if (!cpu->vcpu_dirty) {
        run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL);
    }
}
The function do_whpx_cpu_synchronize_state() is defined as follows:

static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
    if (!cpu->vcpu_dirty) {
        whpx_get_registers(cpu);
        cpu->vcpu_dirty = true;
    }
}
The function whpx_get_registers() is the mirror image of whpx_set_registers() above and is defined as follows:
static void whpx_get_registers(CPUState *cpu)
{struct whpx_state *whpx = &whpx_global;AccelCPUState *vcpu = cpu->accel;X86CPU *x86_cpu = X86_CPU(cpu);CPUX86State *env = &x86_cpu->env;struct whpx_register_set vcxt;uint64_t tpr, apic_base;HRESULT hr;int idx;int idx_next;int i;assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));if (!env->tsc_valid) {whpx_get_tsc(cpu);env->tsc_valid = !runstate_is_running();}hr = whp_dispatch.WHvGetVirtualProcessorRegisters(whpx->partition, cpu->cpu_index,whpx_register_names,RTL_NUMBER_OF(whpx_register_names),&vcxt.values[0]);if (FAILED(hr)) {error_report("WHPX: Failed to get virtual processor context, hr=%08lx",hr);}if (whpx_apic_in_platform()) {/** Fetch the TPR value from the emulated APIC. It may get overwritten* below with the value from CR8 returned by* WHvGetVirtualProcessorRegisters().*/whpx_apic_get(x86_cpu->apic_state);vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));}idx = 0;/* Indexes for first 16 registers match between HV and QEMU definitions */idx_next = 16;for (idx = 0; idx < CPU_NB_REGS; idx += 1) {env->regs[idx] = vcxt.values[idx].Reg64;}idx = idx_next;/* Same goes for RIP and RFLAGS */assert(whpx_register_names[idx] == WHvX64RegisterRip);env->eip = vcxt.values[idx++].Reg64;assert(whpx_register_names[idx] == WHvX64RegisterRflags);env->eflags = vcxt.values[idx++].Reg64;/* Translate 6+4 segment registers. HV and QEMU order matches */assert(idx == WHvX64RegisterEs);for (i = 0; i < 6; i += 1, idx += 1) {env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);}assert(idx == WHvX64RegisterLdtr);env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);assert(idx == WHvX64RegisterTr);env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);assert(idx == WHvX64RegisterIdtr);env->idt.base = vcxt.values[idx].Table.Base;env->idt.limit = vcxt.values[idx].Table.Limit;idx += 1;assert(idx == WHvX64RegisterGdtr);env->gdt.base = vcxt.values[idx].Table.Base;env->gdt.limit = vcxt.values[idx].Table.Limit;idx += 1;/* CR0, 2, 3, 4, 8 */assert(whpx_register_names[idx] == WHvX64RegisterCr0);env->cr[0] = vcxt.values[idx++].Reg64;assert(whpx_register_names[idx] == WHvX64RegisterCr2);env->cr[2] = vcxt.values[idx++].Reg64;assert(whpx_register_names[idx] == WHvX64RegisterCr3);env->cr[3] = vcxt.values[idx++].Reg64;assert(whpx_register_names[idx] == WHvX64RegisterCr4);env->cr[4] = vcxt.values[idx++].Reg64;assert(whpx_register_names[idx] == WHvX64RegisterCr8);tpr = vcxt.values[idx++].Reg64;if (tpr != vcpu->tpr) {vcpu->tpr = tpr;cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr));}/* 8 Debug Registers - Skipped *//** Extended control registers needs to be handled separately depending* on whether xsave is supported/enabled or not.*/whpx_get_xcrs(cpu);/* 16 XMM registers */assert(whpx_register_names[idx] == WHvX64RegisterXmm0);idx_next = idx + 16;for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;}idx = idx_next;/* 8 FP registers */assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);for (i = 0; i < 8; i += 1, idx += 1) {env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;/* env->fpregs[i].mmx.MMX_Q(1) =vcxt.values[idx].Fp.AsUINT128.High64;*/}/* FP control status register */assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;for (i = 0; i < 
8; ++i) {env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);}env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;idx += 1;/* XMM control status register */assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;idx += 1;/* MSRs */assert(whpx_register_names[idx] == WHvX64RegisterEfer);env->efer = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    env->kernelgsbase = vcxt.values[idx++].Reg64;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    apic_base = vcxt.values[idx++].Reg64;
    if (apic_base != vcpu->apic_base) {
        vcpu->apic_base = apic_base;
        cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
    }

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    env->sysenter_cs = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    env->sysenter_eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    env->sysenter_esp = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    env->star = vcxt.values[idx++].Reg64;

#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    env->lstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    env->cstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    env->fmask = vcxt.values[idx++].Reg64;
#endif

    /* Interrupt / Event Registers - Skipped */

    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    if (whpx_apic_in_platform()) {
        whpx_apic_get(x86_cpu->apic_state);
    }

    x86_update_hflags(env);

    return;
}
The function x86_update_hflags() is defined as follows:

void x86_update_hflags(CPUX86State *env)
{
    uint32_t hflags;

#define HFLAG_COPY_MASK \
    ~( HF_CPL_MASK | HF_PE_MASK | HF_MP_MASK | HF_EM_MASK | \
       HF_TS_MASK | HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK | \
       HF_OSFXSR_MASK | HF_LMA_MASK | HF_CS32_MASK | \
       HF_SS32_MASK | HF_CS64_MASK | HF_ADDSEG_MASK)

    hflags = env->hflags & HFLAG_COPY_MASK;
    hflags |= (env->segs[R_SS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK;
    hflags |= (env->cr[0] & CR0_PE_MASK) << (HF_PE_SHIFT - CR0_PE_SHIFT);
    hflags |= (env->cr[0] << (HF_MP_SHIFT - CR0_MP_SHIFT)) &
                (HF_MP_MASK | HF_EM_MASK | HF_TS_MASK);
    hflags |= (env->eflags & (HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK));

    if (env->cr[4] & CR4_OSFXSR_MASK) {
        hflags |= HF_OSFXSR_MASK;
    }

    if (env->efer & MSR_EFER_LMA) {
        hflags |= HF_LMA_MASK;
    }

    if ((hflags & HF_LMA_MASK) && (env->segs[R_CS].flags & DESC_L_MASK)) {
        hflags |= HF_CS32_MASK | HF_SS32_MASK | HF_CS64_MASK;
    } else {
        hflags |= (env->segs[R_CS].flags & DESC_B_MASK) >>
                    (DESC_B_SHIFT - HF_CS32_SHIFT);
        hflags |= (env->segs[R_SS].flags & DESC_B_MASK) >>
                    (DESC_B_SHIFT - HF_SS32_SHIFT);
        if (!(env->cr[0] & CR0_PE_MASK) || (env->eflags & VM_MASK) ||
            !(hflags & HF_CS32_MASK)) {
            hflags |= HF_ADDSEG_MASK;
        } else {
            hflags |= ((env->segs[R_DS].base | env->segs[R_ES].base |
                        env->segs[R_SS].base) != 0) << HF_ADDSEG_SHIFT;
        }
    }
    env->hflags = hflags;
}
The function yank_register_instance() is defined as follows:

bool yank_register_instance(const YankInstance *instance, Error **errp)
{
    YankInstanceEntry *entry;

    QEMU_LOCK_GUARD(&yank_lock);

    if (yank_find_entry(instance)) {
        error_setg(errp, "duplicate yank instance");
        return false;
    }

    entry = g_new0(YankInstanceEntry, 1);
    entry->instance = QAPI_CLONE(YankInstance, instance);
    QLIST_INIT(&entry->yankfns);
    QLIST_INSERT_HEAD(&yank_instance_list, entry, next);

    return true;
}
The function qemu_loadvm_state() is defined as follows:

int qemu_loadvm_state(QEMUFile *f)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    Error *local_err = NULL;
    int ret;

    if (qemu_savevm_state_blocked(&local_err)) {
        error_report_err(local_err);
        return -EINVAL;
    }

    ret = qemu_loadvm_state_header(f);
    if (ret) {
        return ret;
    }

    if (qemu_loadvm_state_setup(f) != 0) {
        return -EINVAL;
    }

    if (migrate_switchover_ack()) {
        qemu_loadvm_state_switchover_ack_needed(mis);
    }

    cpu_synchronize_all_pre_loadvm();

    ret = qemu_loadvm_state_main(f, mis);
    qemu_event_set(&mis->main_thread_load_event);

    trace_qemu_loadvm_state_post_main(ret);

    if (mis->have_listen_thread) {
        /* Listen thread still going, can't clean up yet */
        return ret;
    }

    if (ret == 0) {
        ret = qemu_file_get_error(f);
    }

    /*
     * Try to read in the VMDESC section as well, so that dumping tools that
     * intercept our migration stream have the chance to see it.
     */

    /* We've got to be careful; if we don't read the data and just shut the fd
     * then the sender can error if we close while it's still sending.
     * We also mustn't read data that isn't there; some transports (RDMA)
     * will stall waiting for that data when the source has already closed.
     */
    if (ret == 0 && should_send_vmdesc()) {
        uint8_t *buf;
        uint32_t size;
        uint8_t section_type = qemu_get_byte(f);

        if (section_type != QEMU_VM_VMDESCRIPTION) {
            error_report("Expected vmdescription section, but got %d",
                         section_type);
            /*
             * It doesn't seem worth failing at this point since
             * we apparently have an otherwise valid VM state
             */
        } else {
            buf = g_malloc(0x1000);
            size = qemu_get_be32(f);

            while (size > 0) {
                uint32_t read_chunk = MIN(size, 0x1000);
                qemu_get_buffer(f, buf, read_chunk);
                size -= read_chunk;
            }
            g_free(buf);
        }
    }

    qemu_loadvm_state_cleanup();
    cpu_synchronize_all_post_init();

    return ret;
}
The function qemu_loadvm_state_header() is defined as follows:

static int qemu_loadvm_state_header(QEMUFile *f)
{
    unsigned int v;
    int ret;

    v = qemu_get_be32(f);
    if (v != QEMU_VM_FILE_MAGIC) {
        error_report("Not a migration stream");
        return -EINVAL;
    }

    v = qemu_get_be32(f);
    if (v == QEMU_VM_FILE_VERSION_COMPAT) {
        error_report("SaveVM v2 format is obsolete and don't work anymore");
        return -ENOTSUP;
    }
    if (v != QEMU_VM_FILE_VERSION) {
        error_report("Unsupported migration stream version");
        return -ENOTSUP;
    }

    if (migrate_get_current()->send_configuration) {
        if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) {
            error_report("Configuration section missing");
            qemu_loadvm_state_cleanup();
            return -EINVAL;
        }
        ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0);

        if (ret) {
            qemu_loadvm_state_cleanup();
            return ret;
        }
    }
    return 0;
}
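Putting the checks together, a well-formed migration stream starts with two big-endian 32-bit words and, optionally, a configuration section. The constant values below are quoted from migration/savevm.c from memory, so treat them as indicative and verify against your QEMU version:

/* Start of a migration stream (integers are big-endian):
 *
 *   uint32  QEMU_VM_FILE_MAGIC      0x5145564d, ASCII "QEVM"
 *   uint32  QEMU_VM_FILE_VERSION    0x00000003 (the v2 compat format is rejected)
 *   byte    QEMU_VM_CONFIGURATION   optional; followed by the
 *                                   vmstate_configuration section
 */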
The function qemu_loadvm_state_setup() is defined as follows:

static int qemu_loadvm_state_setup(QEMUFile *f)
{
    SaveStateEntry *se;
    int ret;

    trace_loadvm_state_setup();
    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (!se->ops || !se->ops->load_setup) {
            continue;
        }
        if (se->ops->is_active) {
            if (!se->ops->is_active(se->opaque)) {
                continue;
            }
        }

        ret = se->ops->load_setup(f, se->opaque);
        if (ret < 0) {
            qemu_file_set_error(f, ret);
            error_report("Load state of device %s failed", se->idstr);
            return ret;
        }
    }
    return 0;
}
The function migrate_switchover_ack() is defined as follows:

bool migrate_switchover_ack(void)
{
    MigrationState *s = migrate_get_current();

    return s->capabilities[MIGRATION_CAPABILITY_SWITCHOVER_ACK];
}
The function qemu_loadvm_state_switchover_ack_needed() is defined as follows:

static void qemu_loadvm_state_switchover_ack_needed(MigrationIncomingState *mis)
{
    SaveStateEntry *se;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (!se->ops || !se->ops->switchover_ack_needed) {
            continue;
        }

        if (se->ops->switchover_ack_needed(se->opaque)) {
            mis->switchover_ack_pending_num++;
        }
    }

    trace_loadvm_state_switchover_ack_needed(mis->switchover_ack_pending_num);
}
The function cpu_synchronize_all_pre_loadvm() is defined as follows:

void cpu_synchronize_all_pre_loadvm(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_pre_loadvm(cpu);
    }
}
The function cpu_synchronize_pre_loadvm() is defined as follows:

void cpu_synchronize_pre_loadvm(CPUState *cpu)
{
    if (cpus_accel->synchronize_pre_loadvm) {
        cpus_accel->synchronize_pre_loadvm(cpu);
    }
}
The WHPX implementation, whpx_cpu_synchronize_pre_loadvm(), is defined as follows:

void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
}
The function do_whpx_cpu_synchronize_pre_loadvm() is defined as follows:

static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    cpu->vcpu_dirty = true;
}
The function qemu_loadvm_state_main() is defined as follows:
int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
{uint8_t section_type;int ret = 0;retry:while (true) {section_type = qemu_get_byte(f);ret = qemu_file_get_error_obj_any(f, mis->postcopy_qemufile_dst, NULL);if (ret) {break;}trace_qemu_loadvm_state_section(section_type);switch (section_type) {case QEMU_VM_SECTION_START:case QEMU_VM_SECTION_FULL:ret = qemu_loadvm_section_start_full(f, mis, section_type);if (ret < 0) {goto out;}break;case QEMU_VM_SECTION_PART:case QEMU_VM_SECTION_END:ret = qemu_loadvm_section_part_end(f, mis, section_type);if (ret < 0) {goto out;}break;case QEMU_VM_COMMAND:ret = loadvm_process_command(f);trace_qemu_loadvm_state_section_command(ret);if ((ret < 0) || (ret == LOADVM_QUIT)) {goto out;}break;case QEMU_VM_EOF:/* This is the end of migration */goto out;default:error_report("Unknown savevm section type %d", section_type);ret = -EINVAL;goto out;}}out:if (ret < 0) {qemu_file_set_error(f, ret);/* Cancel bitmaps incoming regardless of recovery */dirty_bitmap_mig_cancel_incoming();/** If we are during an active postcopy, then we pause instead* of bail out to at least keep the VM's dirty data. Note* that POSTCOPY_INCOMING_LISTENING stage is still not enough,* during which we're still receiving device states and we* still haven't yet started the VM on destination.** Only RAM postcopy supports recovery. Still, if RAM postcopy is* enabled, canceled bitmaps postcopy will not affect RAM postcopy* recovering.*/if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&migrate_postcopy_ram() && postcopy_pause_incoming(mis)) {/* Reset f to point to the newly created channel */f = mis->from_src_file;goto retry;}}return ret;
}
The function qemu_loadvm_section_start_full() is defined as follows:
static int
qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis,uint8_t type)
{bool trace_downtime = (type == QEMU_VM_SECTION_FULL);uint32_t instance_id, version_id, section_id;int64_t start_ts, end_ts;SaveStateEntry *se;char idstr[256];int ret;/* Read section start */section_id = qemu_get_be32(f);if (!qemu_get_counted_string(f, idstr)) {error_report("Unable to read ID string for section %u",section_id);return -EINVAL;}instance_id = qemu_get_be32(f);version_id = qemu_get_be32(f);ret = qemu_file_get_error(f);if (ret) {error_report("%s: Failed to read instance/version ID: %d",__func__, ret);return ret;}trace_qemu_loadvm_state_section_startfull(section_id, idstr,instance_id, version_id);/* Find savevm section */se = find_se(idstr, instance_id);if (se == NULL) {error_report("Unknown savevm section or instance '%s' %"PRIu32". ""Make sure that your current VM setup matches your ""saved VM setup, including any hotplugged devices",idstr, instance_id);return -EINVAL;}/* Validate version */if (version_id > se->version_id) {error_report("savevm: unsupported version %d for '%s' v%d",version_id, idstr, se->version_id);return -EINVAL;}se->load_version_id = version_id;se->load_section_id = section_id;/* Validate if it is a device's state */if (xen_enabled() && se->is_ram) {error_report("loadvm: %s RAM loading not allowed on Xen", idstr);return -EINVAL;}if (trace_downtime) {start_ts = qemu_clock_get_us(QEMU_CLOCK_REALTIME);}ret = vmstate_load(f, se);if (ret < 0) {error_report("error while loading state for instance 0x%"PRIx32" of"" device '%s'", instance_id, idstr);return ret;}if (trace_downtime) {end_ts = qemu_clock_get_us(QEMU_CLOCK_REALTIME);trace_vmstate_downtime_load("non-iterable", se->idstr,se->instance_id, end_ts - start_ts);}if (!check_section_footer(f, se)) {return -EINVAL;}return 0;
}
The function vmstate_load() is defined as follows:

static int vmstate_load(QEMUFile *f, SaveStateEntry *se)
{
    trace_vmstate_load(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
    if (!se->vmsd) {         /* Old style */
        return se->ops->load_state(f, se->opaque, se->load_version_id);
    }
    return vmstate_load_state(f, se->vmsd, se->opaque, se->load_version_id);
}
The function vmstate_load_state() is defined as follows:
int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd,void *opaque, int version_id)
{const VMStateField *field = vmsd->fields;int ret = 0;trace_vmstate_load_state(vmsd->name, version_id);if (version_id > vmsd->version_id) {error_report("%s: incoming version_id %d is too new ""for local version_id %d",vmsd->name, version_id, vmsd->version_id);trace_vmstate_load_state_end(vmsd->name, "too new", -EINVAL);return -EINVAL;}if (version_id < vmsd->minimum_version_id) {error_report("%s: incoming version_id %d is too old ""for local minimum version_id %d",vmsd->name, version_id, vmsd->minimum_version_id);trace_vmstate_load_state_end(vmsd->name, "too old", -EINVAL);return -EINVAL;}if (vmsd->pre_load) {ret = vmsd->pre_load(opaque);if (ret) {return ret;}}while (field->name) {trace_vmstate_load_state_field(vmsd->name, field->name);if (vmstate_field_exists(vmsd, field, opaque, version_id)) {void *first_elem = opaque + field->offset;int i, n_elems = vmstate_n_elems(opaque, field);int size = vmstate_size(opaque, field);vmstate_handle_alloc(first_elem, field, opaque);if (field->flags & VMS_POINTER) {first_elem = *(void **)first_elem;assert(first_elem || !n_elems || !size);}for (i = 0; i < n_elems; i++) {void *curr_elem = first_elem + size * i;if (field->flags & VMS_ARRAY_OF_POINTER) {curr_elem = *(void **)curr_elem;}if (!curr_elem && size) {/* if null pointer check placeholder and do not follow */assert(field->flags & VMS_ARRAY_OF_POINTER);ret = vmstate_info_nullptr.get(f, curr_elem, size, NULL);} else if (field->flags & VMS_STRUCT) {ret = vmstate_load_state(f, field->vmsd, curr_elem,field->vmsd->version_id);} else if (field->flags & VMS_VSTRUCT) {ret = vmstate_load_state(f, field->vmsd, curr_elem,field->struct_version_id);} else {ret = field->info->get(f, curr_elem, size, field);}if (ret >= 0) {ret = qemu_file_get_error(f);}if (ret < 0) {qemu_file_set_error(f, ret);error_report("Failed to load %s:%s", vmsd->name,field->name);trace_vmstate_load_field_error(field->name, ret);return ret;}}} else if (field->flags & VMS_MUST_EXIST) {error_report("Input validation failed: %s/%s",vmsd->name, field->name);return -1;}field++;}assert(field->flags == VMS_END);ret = vmstate_subsection_load(f, vmsd, opaque);if (ret != 0) {qemu_file_set_error(f, ret);return ret;}if (vmsd->post_load) {ret = vmsd->post_load(opaque, version_id);}trace_vmstate_load_state_end(vmsd->name, "end", ret);return ret;
}
The function check_section_footer() is defined as follows:

/*
 * Read a footer off the wire and check that it matches the expected section
 *
 * Returns: true if the footer was good
 *          false if there is a problem (and calls error_report to say why)
 */
static bool check_section_footer(QEMUFile *f, SaveStateEntry *se)
{
    int ret;
    uint8_t read_mark;
    uint32_t read_section_id;

    if (!migrate_get_current()->send_section_footer) {
        /* No footer to check */
        return true;
    }

    read_mark = qemu_get_byte(f);

    ret = qemu_file_get_error(f);
    if (ret) {
        error_report("%s: Read section footer failed: %d",
                     __func__, ret);
        return false;
    }

    if (read_mark != QEMU_VM_SECTION_FOOTER) {
        error_report("Missing section footer for %s", se->idstr);
        return false;
    }

    read_section_id = qemu_get_be32(f);
    if (read_section_id != se->load_section_id) {
        error_report("Mismatched section id in footer for %s -"
                     " read 0x%x expected 0x%x",
                     se->idstr, read_section_id, se->load_section_id);
        return false;
    }

    /* All good */
    return true;
}
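So, with the send-section-footer capability enabled, every section is terminated by a small five-byte trailer; its wire layout, as read back by this function:

/* Section footer on the wire:
 *
 *   byte    QEMU_VM_SECTION_FOOTER   footer marker byte
 *   uint32  section id (big-endian)  must equal the id of the section
 *                                    that was just loaded
 */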
The function qemu_loadvm_section_part_end() is defined as follows:
static int
qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis,uint8_t type)
{bool trace_downtime = (type == QEMU_VM_SECTION_END);int64_t start_ts, end_ts;uint32_t section_id;SaveStateEntry *se;int ret;section_id = qemu_get_be32(f);ret = qemu_file_get_error(f);if (ret) {error_report("%s: Failed to read section ID: %d",__func__, ret);return ret;}trace_qemu_loadvm_state_section_partend(section_id);QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {if (se->load_section_id == section_id) {break;}}if (se == NULL) {error_report("Unknown savevm section %d", section_id);return -EINVAL;}if (trace_downtime) {start_ts = qemu_clock_get_us(QEMU_CLOCK_REALTIME);}ret = vmstate_load(f, se);if (ret < 0) {error_report("error while loading state section id %d(%s)",section_id, se->idstr);return ret;}if (trace_downtime) {end_ts = qemu_clock_get_us(QEMU_CLOCK_REALTIME);trace_vmstate_downtime_load("iterable", se->idstr,se->instance_id, end_ts - start_ts);}if (!check_section_footer(f, se)) {return -EINVAL;}return 0;
}
The function loadvm_process_command() is defined as follows:

/*
 * Process an incoming 'QEMU_VM_COMMAND'
 * 0           just a normal return
 * LOADVM_QUIT All good, but exit the loop
 * <0          Error
 */
static int loadvm_process_command(QEMUFile *f)
{MigrationIncomingState *mis = migration_incoming_get_current();uint16_t cmd;uint16_t len;uint32_t tmp32;cmd = qemu_get_be16(f);len = qemu_get_be16(f);/* Check validity before continue processing of cmds */if (qemu_file_get_error(f)) {return qemu_file_get_error(f);}if (cmd >= MIG_CMD_MAX || cmd == MIG_CMD_INVALID) {error_report("MIG_CMD 0x%x unknown (len 0x%x)", cmd, len);return -EINVAL;}trace_loadvm_process_command(mig_cmd_args[cmd].name, len);if (mig_cmd_args[cmd].len != -1 && mig_cmd_args[cmd].len != len) {error_report("%s received with bad length - expecting %zu, got %d",mig_cmd_args[cmd].name,(size_t)mig_cmd_args[cmd].len, len);return -ERANGE;}switch (cmd) {case MIG_CMD_OPEN_RETURN_PATH:if (mis->to_src_file) {error_report("CMD_OPEN_RETURN_PATH called when RP already open");/* Not really a problem, so don't give up */return 0;}mis->to_src_file = qemu_file_get_return_path(f);if (!mis->to_src_file) {error_report("CMD_OPEN_RETURN_PATH failed");return -1;}/** Switchover ack is enabled but no device uses it, so send an ACK to* source that it's OK to switchover. Do it here, after return path has* been created.*/if (migrate_switchover_ack() && !mis->switchover_ack_pending_num) {int ret = migrate_send_rp_switchover_ack(mis);if (ret) {error_report("Could not send switchover ack RP MSG, err %d (%s)", ret,strerror(-ret));return ret;}}break;case MIG_CMD_PING:tmp32 = qemu_get_be32(f);trace_loadvm_process_command_ping(tmp32);if (!mis->to_src_file) {error_report("CMD_PING (0x%x) received with no return path",tmp32);return -1;}migrate_send_rp_pong(mis, tmp32);break;case MIG_CMD_PACKAGED:return loadvm_handle_cmd_packaged(mis);case MIG_CMD_POSTCOPY_ADVISE:return loadvm_postcopy_handle_advise(mis, len);case MIG_CMD_POSTCOPY_LISTEN:return loadvm_postcopy_handle_listen(mis);case MIG_CMD_POSTCOPY_RUN:return loadvm_postcopy_handle_run(mis);case MIG_CMD_POSTCOPY_RAM_DISCARD:return loadvm_postcopy_ram_handle_discard(mis, len);case MIG_CMD_POSTCOPY_RESUME:return loadvm_postcopy_handle_resume(mis);case MIG_CMD_RECV_BITMAP:return loadvm_handle_recv_bitmap(mis, len);case MIG_CMD_ENABLE_COLO:return loadvm_process_enable_colo(mis);}return 0;
}
The function qemu_loadvm_state_cleanup() is defined as follows:

void qemu_loadvm_state_cleanup(void)
{
    SaveStateEntry *se;

    trace_loadvm_state_cleanup();
    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (se->ops && se->ops->load_cleanup) {
            se->ops->load_cleanup(se->opaque);
        }
    }
}
qemu_loadvm_state() finishes by calling cpu_synchronize_all_post_init(). That function, cpu_synchronize_post_init(), and their WHPX implementations whpx_cpu_synchronize_post_init() / do_whpx_cpu_synchronize_post_init() were already listed above in the qemu_machine_creation_done() analysis, so they are not repeated here.
The function bdrv_drain_all_end() is defined as follows:

void bdrv_drain_all_end(void)
{
    BlockDriverState *bs = NULL;

    GLOBAL_STATE_CODE();

    /*
     * bdrv queue is managed by record/replay,
     * waiting for finishing the I/O requests may
     * be endless
     */
    if (replay_events_enabled()) {
        return;
    }

    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_end(bs, NULL);
        aio_context_release(aio_context);
    }

    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    assert(bdrv_drain_all_count > 0);
    bdrv_drain_all_count--;
}
replay_vmstate_init()
The function replay_vmstate_init() is defined as follows:

void replay_vmstate_init(void)
{
    Error *err = NULL;

    if (replay_snapshot) {
        if (replay_mode == REPLAY_MODE_RECORD) {
            if (!save_snapshot(replay_snapshot,
                               true, NULL, false, NULL, &err)) {
                error_report_err(err);
                error_report("Could not create snapshot for icount record");
                exit(1);
            }
        } else if (replay_mode == REPLAY_MODE_PLAY) {
            if (!load_snapshot(replay_snapshot, NULL, false, NULL, &err)) {
                error_report_err(err);
                error_report("Could not load snapshot for icount replay");
                exit(1);
            }
        }
    }
}
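replay_snapshot is set from the rrsnapshot suboption of -icount (see docs/system/replay.rst); for example, recording an execution and later replaying it from an initial snapshot named init (command lines abbreviated, disk options omitted):

qemu-system-x86_64 -icount shift=auto,rr=record,rrfile=record.bin,rrsnapshot=init ...
qemu-system-x86_64 -icount shift=auto,rr=replay,rrfile=record.bin,rrsnapshot=init ...

In record mode the named snapshot is created at startup via save_snapshot(); in replay mode the same snapshot is loaded via load_snapshot().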
qmp_migrate_incoming()
The function qmp_migrate_incoming() in /migration/migration.c is defined as follows:

void qmp_migrate_incoming(const char *uri, bool has_channels,
                          MigrationChannelList *channels, Error **errp)
{
    Error *local_err = NULL;
    static bool once = true;

    if (!once) {
        error_setg(errp, "The incoming migration has already been started");
        return;
    }
    if (!runstate_check(RUN_STATE_INMIGRATE)) {
        error_setg(errp, "'-incoming' was not specified on the command line");
        return;
    }

    if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
        return;
    }

    qemu_start_incoming_migration(uri, has_channels, channels, &local_err);

    if (local_err) {
        yank_unregister_instance(MIGRATION_YANK_INSTANCE);
        error_propagate(errp, local_err);
        return;
    }

    once = false;
}
qmp_cont()
The function qmp_cont() in /monitor/qmp-cmds.c is defined as follows:

void qmp_cont(Error **errp)
{
    BlockBackend *blk;
    BlockJob *job;
    Error *local_err = NULL;

    /* if there is a dump in background, we should wait until the dump
     * finished */
    if (qemu_system_dump_in_progress()) {
        error_setg(errp, "There is a dump in process, please wait.");
        return;
    }

    if (runstate_needs_reset()) {
        error_setg(errp, "Resetting the Virtual Machine is required");
        return;
    } else if (runstate_check(RUN_STATE_SUSPENDED)) {
        return;
    } else if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
        error_setg(errp, "Migration is not finalized yet");
        return;
    }

    for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {
        blk_iostatus_reset(blk);
    }

    WITH_JOB_LOCK_GUARD() {
        for (job = block_job_next_locked(NULL); job;
             job = block_job_next_locked(job)) {
            block_job_iostatus_reset_locked(job);
        }
    }

    /* Continuing after completed migration. Images have been inactivated to
     * allow the destination to take control. Need to get control back now.
     *
     * If there are no inactive block nodes (e.g. because the VM was just
     * paused rather than completing a migration), bdrv_inactivate_all() simply
     * doesn't do anything. */
    bdrv_activate_all(&local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return;
    }

    if (runstate_check(RUN_STATE_INMIGRATE)) {
        autostart = 1;
    } else {
        vm_start();
    }
}
The function qemu_system_dump_in_progress() is defined as follows:

bool qemu_system_dump_in_progress(void)
{
    DumpState *state = &dump_state_global;

    return (qatomic_read(&state->status) == DUMP_STATUS_ACTIVE);
}
The function runstate_needs_reset() in /system/runstate.c is defined as follows:

bool runstate_needs_reset(void)
{
    return runstate_check(RUN_STATE_INTERNAL_ERROR) ||
           runstate_check(RUN_STATE_SHUTDOWN);
}
The function blk_iostatus_reset() in /block/block-backend.c is defined as follows:

void blk_iostatus_reset(BlockBackend *blk)
{
    GLOBAL_STATE_CODE();

    if (blk_iostatus_is_enabled(blk)) {
        blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    }
}
Summary
The above walked through the code that QEMU system emulation uses to parse and apply its pre-configuration during the startup process.