Tuesday, October 7, 2014

kvm_callbacks

As I promised in the previous chapter, let's have a closer look at kvm_callbacks. It is
initialised in:

static int __init vmx_init(void) {
     r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
}

We will dig deep into vmx_x86_ops in the vmx.c chapter. In this chapter we will look at
when, why, and by whom these functions are called.

kvm_callbacks functions

1.  /// For 8bit IO reads from the guest (Usually when executing 'inb')
    int (*inb)(void *opaque, uint16_t addr, uint8_t *data);
   
    In the previous chapter, you saw the switch-case statement on run->exit_reason. Upon vm_enter, the CPU
    starts executing guest code directly. If the guest tries to execute a privileged instruction, or in
    other words any instruction that cannot be completed in CPU VM mode, the CPU exits and the switch-case
    statement below takes care of that case.

    switch (run->exit_reason) {
          A vm_exit happens when the guest tries to do an I/O operation. The I/O target could be a simulated device or a real one.

          Both cases need the VM to exit out of the VM context.
 

        case KVM_EXIT_IO:
               /*
                * Based on the size of the I/O access, this invokes
                * kvm->callbacks->inb(), kvm->callbacks->inw(), or
                * kvm->callbacks->inl(); and based on the direction it
                * may instead call kvm->callbacks->outb(),
                * kvm->callbacks->outw(), or kvm->callbacks->outl().
                */
           
              
2. /// generic memory reads to unmapped memory (For MMIO devices)
    

    int (*mmio_read)(void *opaque, uint64_t addr, uint8_t *data,
                                            int len);
                                           
    Let us explain mmio_read with an example. We can emulate many devices with
    the QEMU system. One of them is the e1000 NIC. You saw in chapter #1 how pc_init1() is
    invoked.


    pc_init1() {
        pci_nic_init(pci_bus, nd, -1) {
                if (strcmp(nd->model, "e1000") == 0)
                     pci_dev = pci_e1000_init(bus, nd, devfn) {
                
                     pci_register_device(bus, "e1000");
                   
                    /* e1000_mmio_read and e1000_mmio_write are arrays of functions.
                     * Each array starts at offset 0 with the byte-sized handler; for
                     * example, e1000_mmio_read holds e1000_mmio_readb, e1000_mmio_readw
                     * and e1000_mmio_readl.
                     */
                    d->mmio_index = cpu_register_io_memory(0, e1000_mmio_read,
                                                              e1000_mmio_write, d);
  
                     pci_register_io_region((PCIDevice *)d, 0, PNPMMIO_SIZE,
                                   PCI_ADDRESS_SPACE_MEM, e1000_mmio_map);
  
                     pci_register_io_region((PCIDevice *)d, 1, IOPORT_SIZE,
                                          PCI_ADDRESS_SPACE_IO, ioport_map);
               }
        }       
   }
  

   The cpu_register_io_memory() function registers the read and write functions. For example,
   cpu_register_io_memory(0, e1000_mmio_read, e1000_mmio_write, d) registers the e1000 MMIO
   read and write handlers as shown above.

   Suppose a VM exit happened due to a memory read/write.

   case  KVM_EXIT_MMIO:
   
      kvm_callbacks->mmio_read() {
             kvm_mmio_read() {
                   cpu_physical_memory_rw(addr, data, len, 0) {
                 
                    /* if it is a byte read, the handler is the function at offset 0 */
                    io_mem_read[io_index][0](io_mem_opaque[io_index], addr, val);
                 
                     }
            }
   
      }
   

interrupts

Before vm_enter, we check for any interrupts that are available to process and queue them.
Ultimately this sets the interrupt.pending flag to true.


   kvm_run() {
       run->request_interrupt_window = try_push_interrupts(kvm);
   }


The above try_push_interrupts(kvm) function invokes a series of function calls:
try_push_interrupts(kvm) {

     kvm_callbacks->try_push_interrupts() {
            kvm_arch_try_push_interrupts() {
                     kvm_update_interrupt_request() {
                      /* Get irq number */                   
                      irq = cpu_get_pic_interrupt(env);
                       
                      /* queue an  interrupt */
                      r = kvm_inject_irq(kvm_context, env->cpu_index, irq) {
                              ioctl(kvm->vcpu_fd[vcpu], KVM_INTERRUPT, irq) {
                                     kvm_queue_interrupt() {
                                              vcpu->arch.interrupt.pending = true;

                                      }                                    
                         }
                           
                   }
                                                      
         }
           
    }


processing of interrupt

inject_pending_irq() calls a kvm_x86_ops function, which goes deep into vmx.c. I will cover those functions in the vmx.c chapter.

__vcpu_run() {
       r = vcpu_enter_guest(vcpu, kvm_run) {
               inject_pending_irq(vcpu, kvm_run) {
                     kvm_x86_ops->set_irq(vcpu) {
                             vmx_inject_irq() 
                    }
               }
        } 

 }