As promised in the previous chapter, let's take a closer look at kvm_callbacks. This is
initialised in
static int __init vmx_init(void) {
    r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
}
We will dig deeper into vmx_x86_ops in the vmx.c chapter. In this chapter we will look at
when, why, and by whom these functions are called.
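For reference, here is a condensed view of struct kvm_callbacks as it appears in libkvm.h. Only the members discussed in this chapter are shown; the real struct carries a few more hooks (halt, shutdown, debug and so on), and the exact layout varied between kvm-userspace releases:

struct kvm_callbacks {
    /* port I/O, split by access size and direction */
    int (*inb)(void *opaque, uint16_t addr, uint8_t *data);
    int (*inw)(void *opaque, uint16_t addr, uint16_t *data);
    int (*inl)(void *opaque, uint16_t addr, uint32_t *data);
    int (*outb)(void *opaque, uint16_t addr, uint8_t data);
    int (*outw)(void *opaque, uint16_t addr, uint16_t data);
    int (*outl)(void *opaque, uint16_t addr, uint32_t data);
    /* memory-mapped I/O to unmapped guest physical addresses */
    int (*mmio_read)(void *opaque, uint64_t addr, uint8_t *data, int len);
    int (*mmio_write)(void *opaque, uint64_t addr, uint8_t *data, int len);
    /* interrupt injection hook, called before re-entering the guest */
    int (*try_push_interrupts)(void *opaque);
};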
kvm_callbacks functions
1. /// For 8bit IO reads from the guest (Usually when executing 'inb')
int (*inb)(void *opaque, uint16_t addr, uint8_t *data);
In the previous chapter you saw the switch-case statement on run->exit_reason. Upon vm_enter, the CPU
starts executing guest code directly. If the guest tries to execute a privileged instruction, or in
other words any instruction that cannot be handled while the CPU is in VM (guest) mode, the CPU exits
and the switch-case statement below takes care of that case.
switch (run->exit_reason) {
A vm_exit happens when the guest tries to do an I/O operation. The I/O could target an emulated
device or a real one; both require the VM to exit guest context.
case KVM_EXIT_IO:
    /*
     * Based on the size of the I/O, this invokes kvm->callbacks->inb(),
     * kvm->callbacks->inw() or kvm->callbacks->inl(); based on the
     * direction it may instead call kvm->callbacks->outb(),
     * kvm->callbacks->outw() or kvm->callbacks->outl().
     */
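To make that dispatch concrete, below is a minimal sketch modeled on libkvm's handle_io(). The struct kvm_run I/O fields (run->io.direction, run->io.size, run->io.port, run->io.data_offset, run->io.count) are part of the KVM ABI; the surrounding code is condensed, so treat it as an illustration rather than the exact libkvm source:

static int handle_io(kvm_context_t kvm, struct kvm_run *run, int vcpu)
{
    uint16_t addr = run->io.port;
    /* the data to read/write lives inside the shared kvm_run area */
    void *p = (char *)run + run->io.data_offset;
    int i, r = 0;

    /* string instructions (ins/outs) carry a repeat count */
    for (i = 0; i < run->io.count; ++i) {
        if (run->io.direction == KVM_EXIT_IO_IN) {
            switch (run->io.size) {
            case 1: r = kvm->callbacks->inb(kvm->opaque, addr, p); break;
            case 2: r = kvm->callbacks->inw(kvm->opaque, addr, p); break;
            case 4: r = kvm->callbacks->inl(kvm->opaque, addr, p); break;
            }
        } else {   /* KVM_EXIT_IO_OUT */
            switch (run->io.size) {
            case 1: r = kvm->callbacks->outb(kvm->opaque, addr, *(uint8_t *)p); break;
            case 2: r = kvm->callbacks->outw(kvm->opaque, addr, *(uint16_t *)p); break;
            case 4: r = kvm->callbacks->outl(kvm->opaque, addr, *(uint32_t *)p); break;
            }
        }
        p = (char *)p + run->io.size;   /* advance through the buffer */
    }
    return r;
}

Note that the data buffer sits inside the mmap'ed kvm_run area itself, which is why no extra copy between kernel and userspace is needed for each port access.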
2. /// generic memory reads to unmapped memory (For MMIO devices)
int (*mmio_read)(void *opaque, uint64_t addr, uint8_t *data,
int len);
Let us explain mmio_read with an example. QEMU can emulate a great many devices; one of
them is the e1000 NIC. You have seen in chapter #1 how pc_init1() is
invoked.
pc_init1() {
    pci_nic_init(pci_bus, nd, -1) {
        if (strcmp(nd->model, "e1000") == 0)
            pci_dev = pci_e1000_init(bus, nd, devfn) {
                pci_register_device(bus, "e1000");
                /* e1000_mmio_read & e1000_mmio_write are arrays of functions,
                 * indexed by access size: offset 0 holds e1000_mmio_readb
                 * (byte), offset 1 e1000_mmio_readw (word) and offset 2
                 * e1000_mmio_readl (long); likewise for the write array.
                 */
                d->mmio_index = cpu_register_io_memory(0, e1000_mmio_read,
                                                       e1000_mmio_write, d);
                pci_register_io_region((PCIDevice *)d, 0, PNPMMIO_SIZE,
                                       PCI_ADDRESS_SPACE_MEM, e1000_mmio_map);
                pci_register_io_region((PCIDevice *)d, 1, IOPORT_SIZE,
                                       PCI_ADDRESS_SPACE_IO, ioport_map);
            }
    }
}
cpu_register_io_memory() registers the read and write handlers for a memory region. For example,
cpu_register_io_memory(0, e1000_mmio_read, e1000_mmio_write, d) registers the e1000 MMIO
read and write handlers as shown above.
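Internally the handlers land in per-slot dispatch tables indexed by access size. A simplified sketch of the exec.c implementation follows (allocation checks and the subpage machinery are elided, and details varied across QEMU versions):

static CPUReadMemoryFunc *io_mem_read[IO_MEM_NB_ENTRIES][4];
static CPUWriteMemoryFunc *io_mem_write[IO_MEM_NB_ENTRIES][4];
static void *io_mem_opaque[IO_MEM_NB_ENTRIES];
static int io_mem_nb = 5;   /* the first slots are reserved (RAM, ROM, ...) */

int cpu_register_io_memory(int io_index,
                           CPUReadMemoryFunc **mem_read,
                           CPUWriteMemoryFunc **mem_write,
                           void *opaque)
{
    int i;

    if (io_index <= 0)
        io_index = io_mem_nb++;          /* 0 means "allocate a slot for me" */

    for (i = 0; i < 3; i++) {            /* 0 = byte, 1 = word, 2 = long */
        io_mem_read[io_index][i] = mem_read[i];
        io_mem_write[io_index][i] = mem_write[i];
    }
    io_mem_opaque[io_index] = opaque;

    return io_index << IO_MEM_SHIFT;
}

The returned value (io_index shifted by IO_MEM_SHIFT) is stored in the physical page descriptor, so a later access to this region can recover the slot number.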
Now suppose a vm exit happens because the guest read from or wrote to such a region.
case KVM_EXIT_MMIO:
    kvm_callbacks->mmio_read() {
        kvm_mmio_read() {
            cpu_physical_memory_rw(addr, data, len, 0) {
                /* a byte read dispatches through offset 0 of the table */
                val = io_mem_read[io_index][0](io_mem_opaque[io_index], addr);
            }
        }
    }
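For completeness, here is a condensed sketch of the read leg inside cpu_physical_memory_rw(), showing how the access is split by size and alignment before indexing the table. The page-descriptor lookup that produces io_index is elided, and stl_p()/stw_p()/stb_p() are QEMU's store-to-host-buffer helpers:

/* inside cpu_physical_memory_rw(), !is_write path, for an MMIO page;
 * l is how many bytes remain to be transferred into buf */
if (l >= 4 && ((addr & 3) == 0)) {
    /* aligned 32-bit read goes through the "long" slot */
    val = io_mem_read[io_index][2](io_mem_opaque[io_index], addr);
    stl_p(buf, val);
    l = 4;
} else if (l >= 2 && ((addr & 1) == 0)) {
    /* aligned 16-bit read goes through the "word" slot */
    val = io_mem_read[io_index][1](io_mem_opaque[io_index], addr);
    stw_p(buf, val);
    l = 2;
} else {
    /* everything else falls back to byte reads */
    val = io_mem_read[io_index][0](io_mem_opaque[io_index], addr);
    stb_p(buf, val);
    l = 1;
}

For the e1000 registered above, the offset-2 slot is exactly e1000_mmio_readl, so a 32-bit register read from the guest driver lands in the device model.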
interrupts
Before vm_enter, we check whether any interrupts are available to process and queue them.
Ultimately this sets the interrupt.pending flag to true.
kvm_run() {
    run->request_interrupt_window = try_push_interrupts(kvm);
}
The try_push_interrupts(kvm) call above triggers the following chain of function calls:
try_push_interrupts(kvm) {
    kvm_callbacks->try_push_interrupts() {
        kvm_arch_try_push_interrupts() {
            kvm_update_interrupt_request() {
                /* Get irq number */
                irq = cpu_get_pic_interrupt(env);
                /* queue an interrupt */
                r = kvm_inject_irq(kvm_context, env->cpu_index, irq) {
                    ioctl(kvm->vcpu_fd[vcpu], KVM_INTERRUPT, irq) {
                        kvm_queue_interrupt() {
                            vcpu->arch.interrupt.pending = true;
                        }
                    }
                }
            }
        }
    }
}
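The interesting part is the guard in front of the injection: userspace only pushes an interrupt when the guest is actually ready to take one. A condensed sketch of kvm_arch_try_push_interrupts() from qemu-kvm follows; the exact predicates differed between releases, so treat the conditions as illustrative:

void kvm_arch_try_push_interrupts(void *opaque)
{
    CPUState *env = cpu_single_env;
    int irq;

    /* inject only if: the vcpu can accept an interrupt right now,
     * the PIC/APIC has one pending, and the guest has IF set */
    if (kvm_is_ready_for_interrupt_injection(kvm_context, env->cpu_index) &&
        (env->interrupt_request & CPU_INTERRUPT_HARD) &&
        (env->eflags & IF_MASK)) {
        env->interrupt_request &= ~CPU_INTERRUPT_HARD;
        irq = cpu_get_pic_interrupt(env);
        if (irq >= 0)
            kvm_inject_irq(kvm_context, env->cpu_index, irq);
    }
}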
processing the interrupt
inject_pending_irq() calls a kvm_x86_ops function and goes deep into vmx.c; I will cover that in the vmx.c chapter.
__vcpu_run() {
    r = vcpu_enter_guest(vcpu, kvm_run) {
        inject_pending_irq(vcpu, kvm_run) {
            kvm_x86_ops->set_irq(vcpu) {
                vmx_inject_irq()
            }
        }
    }
}
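On the kernel side the decision mirrors the userspace one: inject if an interrupt is queued and the guest can take it, otherwise ask the hardware to exit as soon as the interrupt window opens. The sketch below is loosely based on the arch/x86/kvm/x86.c of that era; helper names such as interrupt_allowed() and enable_irq_window() follow that source, but this is not a line-for-line copy:

static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
    /* deliver the interrupt queued earlier via KVM_INTERRUPT */
    if (vcpu->arch.interrupt.pending &&
        kvm_x86_ops->interrupt_allowed(vcpu))
        kvm_x86_ops->set_irq(vcpu);   /* ends up in vmx_inject_irq() */

    /* if we could not inject, have the CPU exit back to us the
     * moment the guest becomes interruptible again */
    if (vcpu->arch.interrupt.pending ||
        kvm_run->request_interrupt_window)
        kvm_x86_ops->enable_irq_window(vcpu);
}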