summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--hw/apic.c3
-rw-r--r--hw/kvm/apic.c34
-rw-r--r--hw/msi.h5
-rw-r--r--hw/msix.c121
-rw-r--r--hw/msix.h6
-rw-r--r--hw/pc.c9
-rw-r--r--hw/pc_piix.c14
-rw-r--r--hw/pci.h8
-rw-r--r--hw/virtio-pci.c126
-rw-r--r--hw/virtio-pci.h6
-rw-r--r--hw/xen.h10
-rw-r--r--hw/xen_apic.c5
-rw-r--r--kvm-all.c236
-rw-r--r--kvm-stub.c23
-rw-r--r--kvm.h18
-rw-r--r--linux-headers/linux/kvm.h38
-rw-r--r--qemu-common.h1
-rwxr-xr-xscripts/kvm/vmxcap13
18 files changed, 624 insertions, 52 deletions
diff --git a/hw/apic.c b/hw/apic.c
index 4eeaf8801c..5fbf01c278 100644
--- a/hw/apic.c
+++ b/hw/apic.c
@@ -19,6 +19,7 @@
 #include "apic_internal.h"
 #include "apic.h"
 #include "ioapic.h"
+#include "msi.h"
 #include "host-utils.h"
 #include "trace.h"
 #include "pc.h"
@@ -862,6 +863,8 @@ static void apic_init(APICCommonState *s)
 
     s->timer = qemu_new_timer_ns(vm_clock, apic_timer, s);
     local_apics[s->idx] = s;
+
+    msi_supported = true;
 }
 
 static void apic_class_init(ObjectClass *klass, void *data)
diff --git a/hw/kvm/apic.c b/hw/kvm/apic.c
index ffe7a521b7..8ba4079025 100644
--- a/hw/kvm/apic.c
+++ b/hw/kvm/apic.c
@@ -10,6 +10,7 @@
  * See the COPYING file in the top-level directory.
  */
 #include "hw/apic_internal.h"
+#include "hw/msi.h"
 #include "kvm.h"
 
 static inline void kvm_apic_set_reg(struct kvm_lapic_state *kapic,
@@ -145,10 +146,39 @@ static void kvm_apic_external_nmi(APICCommonState *s)
     run_on_cpu(s->cpu_env, do_inject_external_nmi, s);
 }
 
+static uint64_t kvm_apic_mem_read(void *opaque, target_phys_addr_t addr,
+                                  unsigned size)
+{
+    return ~(uint64_t)0;
+}
+
+static void kvm_apic_mem_write(void *opaque, target_phys_addr_t addr,
+                               uint64_t data, unsigned size)
+{
+    MSIMessage msg = { .address = addr, .data = data };
+    int ret;
+
+    ret = kvm_irqchip_send_msi(kvm_state, msg);
+    if (ret < 0) {
+        fprintf(stderr, "KVM: injection failed, MSI lost (%s)\n",
+                strerror(-ret));
+    }
+}
+
+static const MemoryRegionOps kvm_apic_io_ops = {
+    .read = kvm_apic_mem_read,
+    .write = kvm_apic_mem_write,
+    .endianness = DEVICE_NATIVE_ENDIAN,
+};
+
 static void kvm_apic_init(APICCommonState *s)
 {
-    memory_region_init_reservation(&s->io_memory, "kvm-apic-msi",
-                                   MSI_SPACE_SIZE);
+    memory_region_init_io(&s->io_memory, &kvm_apic_io_ops, s, "kvm-apic-msi",
+                          MSI_SPACE_SIZE);
+
+    if (kvm_has_gsi_routing()) {
+        msi_supported = true;
+    }
 }
 
 static void kvm_apic_class_init(ObjectClass *klass, void *data)
diff --git a/hw/msi.h b/hw/msi.h
index 3040bb0b43..75747abc25 100644
--- a/hw/msi.h
+++ b/hw/msi.h
@@ -24,6 +24,11 @@
 #include "qemu-common.h"
 #include "pci.h"
 
+struct MSIMessage {
+    uint64_t address;
+    uint32_t data;
+};
+
 extern bool msi_supported;
 
 bool msi_enabled(const PCIDevice *dev);
diff --git a/hw/msix.c b/hw/msix.c
index 3835eaaf28..59c7a8388f 100644
--- a/hw/msix.c
+++ b/hw/msix.c
@@ -35,6 +35,15 @@
 #define MSIX_PAGE_PENDING (MSIX_PAGE_SIZE / 2)
 #define MSIX_MAX_ENTRIES 32
 
+static MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
+{
+    uint8_t *table_entry = dev->msix_table_page + vector * PCI_MSIX_ENTRY_SIZE;
+    MSIMessage msg;
+
+    msg.address = pci_get_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR);
+    msg.data = pci_get_long(table_entry + PCI_MSIX_ENTRY_DATA);
+    return msg;
+}
 
 /* Add MSI-X capability to the config space for the device. */
 /* Given a bar and its size, add MSI-X table on top of it
@@ -130,13 +139,34 @@ static bool msix_is_masked(PCIDevice *dev, int vector)
     return msix_vector_masked(dev, vector, dev->msix_function_masked);
 }
 
+static void msix_fire_vector_notifier(PCIDevice *dev,
+                                      unsigned int vector, bool is_masked)
+{
+    MSIMessage msg;
+    int ret;
+
+    if (!dev->msix_vector_use_notifier) {
+        return;
+    }
+    if (is_masked) {
+        dev->msix_vector_release_notifier(dev, vector);
+    } else {
+        msg = msix_get_message(dev, vector);
+        ret = dev->msix_vector_use_notifier(dev, vector, msg);
+        assert(ret >= 0);
+    }
+}
+
 static void msix_handle_mask_update(PCIDevice *dev, int vector, bool was_masked)
 {
     bool is_masked = msix_is_masked(dev, vector);
+
     if (is_masked == was_masked) {
         return;
     }
 
+    msix_fire_vector_notifier(dev, vector, is_masked);
+
     if (!is_masked && msix_is_pending(dev, vector)) {
         msix_clr_pending(dev, vector);
         msix_notify(dev, vector);
@@ -222,10 +252,14 @@ static void msix_mmio_setup(PCIDevice *d, MemoryRegion *bar)
 static void msix_mask_all(struct PCIDevice *dev, unsigned nentries)
 {
     int vector;
+
     for (vector = 0; vector < nentries; ++vector) {
         unsigned offset =
             vector * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL;
+        bool was_masked = msix_is_masked(dev, vector);
+
         dev->msix_table_page[offset] |= PCI_MSIX_ENTRY_CTRL_MASKBIT;
+        msix_handle_mask_update(dev, vector, was_masked);
     }
 }
 
@@ -317,6 +351,7 @@ void msix_save(PCIDevice *dev, QEMUFile *f)
 void msix_load(PCIDevice *dev, QEMUFile *f)
 {
     unsigned n = dev->msix_entries_nr;
+    unsigned int vector;
 
     if (!(dev->cap_present & QEMU_PCI_CAP_MSIX)) {
         return;
@@ -326,6 +361,10 @@ void msix_load(PCIDevice *dev, QEMUFile *f)
     qemu_get_buffer(f, dev->msix_table_page, n * PCI_MSIX_ENTRY_SIZE);
     qemu_get_buffer(f, dev->msix_table_page + MSIX_PAGE_PENDING, (n + 7) / 8);
     msix_update_function_masked(dev);
+
+    for (vector = 0; vector < n; vector++) {
+        msix_handle_mask_update(dev, vector, true);
+    }
 }
 
 /* Does device support MSI-X? */
@@ -352,9 +391,7 @@ uint32_t msix_bar_size(PCIDevice *dev)
 /* Send an MSI-X message */
 void msix_notify(PCIDevice *dev, unsigned vector)
 {
-    uint8_t *table_entry = dev->msix_table_page + vector * PCI_MSIX_ENTRY_SIZE;
-    uint64_t address;
-    uint32_t data;
+    MSIMessage msg;
 
     if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector])
         return;
@@ -363,9 +400,9 @@ void msix_notify(PCIDevice *dev, unsigned vector)
         return;
     }
 
-    address = pci_get_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR);
-    data = pci_get_long(table_entry + PCI_MSIX_ENTRY_DATA);
-    stl_le_phys(address, data);
+    msg = msix_get_message(dev, vector);
+
+    stl_le_phys(msg.address, msg.data);
 }
 
 void msix_reset(PCIDevice *dev)
@@ -414,3 +451,75 @@ void msix_unuse_all_vectors(PCIDevice *dev)
         return;
     msix_free_irq_entries(dev);
 }
+
+unsigned int msix_nr_vectors_allocated(const PCIDevice *dev)
+{
+    return dev->msix_entries_nr;
+}
+
+static int msix_set_notifier_for_vector(PCIDevice *dev, unsigned int vector)
+{
+    MSIMessage msg;
+
+    if (msix_is_masked(dev, vector)) {
+        return 0;
+    }
+    msg = msix_get_message(dev, vector);
+    return dev->msix_vector_use_notifier(dev, vector, msg);
+}
+
+static void msix_unset_notifier_for_vector(PCIDevice *dev, unsigned int vector)
+{
+    if (msix_is_masked(dev, vector)) {
+        return;
+    }
+    dev->msix_vector_release_notifier(dev, vector);
+}
+
+int msix_set_vector_notifiers(PCIDevice *dev,
+                              MSIVectorUseNotifier use_notifier,
+                              MSIVectorReleaseNotifier release_notifier)
+{
+    int vector, ret;
+
+    assert(use_notifier && release_notifier);
+
+    dev->msix_vector_use_notifier = use_notifier;
+    dev->msix_vector_release_notifier = release_notifier;
+
+    if ((dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &
+        (MSIX_ENABLE_MASK | MSIX_MASKALL_MASK)) == MSIX_ENABLE_MASK) {
+        for (vector = 0; vector < dev->msix_entries_nr; vector++) {
+            ret = msix_set_notifier_for_vector(dev, vector);
+            if (ret < 0) {
+                goto undo;
+            }
+        }
+    }
+    return 0;
+
+undo:
+    while (--vector >= 0) {
+        msix_unset_notifier_for_vector(dev, vector);
+    }
+    dev->msix_vector_use_notifier = NULL;
+    dev->msix_vector_release_notifier = NULL;
+    return ret;
+}
+
+void msix_unset_vector_notifiers(PCIDevice *dev)
+{
+    int vector;
+
+    assert(dev->msix_vector_use_notifier &&
+           dev->msix_vector_release_notifier);
+
+    if ((dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &
+        (MSIX_ENABLE_MASK | MSIX_MASKALL_MASK)) == MSIX_ENABLE_MASK) {
+        for (vector = 0; vector < dev->msix_entries_nr; vector++) {
+            msix_unset_notifier_for_vector(dev, vector);
+        }
+    }
+    dev->msix_vector_use_notifier = NULL;
+    dev->msix_vector_release_notifier = NULL;
+}
diff --git a/hw/msix.h b/hw/msix.h
index 5aba22b858..50aee8221a 100644
--- a/hw/msix.h
+++ b/hw/msix.h
@@ -13,6 +13,8 @@ void msix_write_config(PCIDevice *pci_dev, uint32_t address,
 
 int msix_uninit(PCIDevice *d, MemoryRegion *bar);
 
+unsigned int msix_nr_vectors_allocated(const PCIDevice *dev);
+
 void msix_save(PCIDevice *dev, QEMUFile *f);
 void msix_load(PCIDevice *dev, QEMUFile *f);
 
@@ -29,4 +31,8 @@ void msix_notify(PCIDevice *dev, unsigned vector);
 
 void msix_reset(PCIDevice *dev);
 
+int msix_set_vector_notifiers(PCIDevice *dev,
+                              MSIVectorUseNotifier use_notifier,
+                              MSIVectorReleaseNotifier release_notifier);
+void msix_unset_vector_notifiers(PCIDevice *dev);
 #endif
diff --git a/hw/pc.c b/hw/pc.c
index e81a06c161..c790bcbfd7 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -912,15 +912,6 @@ static DeviceState *apic_init(void *env, uint8_t apic_id)
         apic_mapped = 1;
     }
 
-    /* KVM does not support MSI yet. */
-    if (!kvm_irqchip_in_kernel()) {
-        msi_supported = true;
-    }
-
-    if (xen_msi_support()) {
-        msi_supported = true;
-    }
-
     return dev;
 }
 
diff --git a/hw/pc_piix.c b/hw/pc_piix.c
index a7aad4b022..f49b0aaf89 100644
--- a/hw/pc_piix.c
+++ b/hw/pc_piix.c
@@ -56,31 +56,27 @@ static void kvm_piix3_setup_irq_routing(bool pci_enabled)
 {
 #ifdef CONFIG_KVM
     KVMState *s = kvm_state;
-    int ret, i;
+    int i;
 
     if (kvm_check_extension(s, KVM_CAP_IRQ_ROUTING)) {
         for (i = 0; i < 8; ++i) {
             if (i == 2) {
                 continue;
             }
-            kvm_irqchip_add_route(s, i, KVM_IRQCHIP_PIC_MASTER, i);
+            kvm_irqchip_add_irq_route(s, i, KVM_IRQCHIP_PIC_MASTER, i);
         }
         for (i = 8; i < 16; ++i) {
-            kvm_irqchip_add_route(s, i, KVM_IRQCHIP_PIC_SLAVE, i - 8);
+            kvm_irqchip_add_irq_route(s, i, KVM_IRQCHIP_PIC_SLAVE, i - 8);
         }
         if (pci_enabled) {
             for (i = 0; i < 24; ++i) {
                 if (i == 0) {
-                    kvm_irqchip_add_route(s, i, KVM_IRQCHIP_IOAPIC, 2);
+                    kvm_irqchip_add_irq_route(s, i, KVM_IRQCHIP_IOAPIC, 2);
                 } else if (i != 2) {
-                    kvm_irqchip_add_route(s, i, KVM_IRQCHIP_IOAPIC, i);
+                    kvm_irqchip_add_irq_route(s, i, KVM_IRQCHIP_IOAPIC, i);
                 }
             }
         }
-        ret = kvm_irqchip_commit_routes(s);
-        if (ret < 0) {
-            hw_error("KVM IRQ routing setup failed");
-        }
     }
 #endif /* CONFIG_KVM */
 }
diff --git a/hw/pci.h b/hw/pci.h
index 8d0aa498e5..c3cacce046 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -173,6 +173,10 @@ typedef struct PCIDeviceClass {
     const char *romfile;
 } PCIDeviceClass;
 
+typedef int (*MSIVectorUseNotifier)(PCIDevice *dev, unsigned int vector,
+                                      MSIMessage msg);
+typedef void (*MSIVectorReleaseNotifier)(PCIDevice *dev, unsigned int vector);
+
 struct PCIDevice {
     DeviceState qdev;
     /* PCI config space */
@@ -243,6 +247,10 @@ struct PCIDevice {
     bool has_rom;
     MemoryRegion rom;
     uint32_t rom_bar;
+
+    /* MSI-X notifiers */
+    MSIVectorUseNotifier msix_vector_use_notifier;
+    MSIVectorReleaseNotifier msix_vector_release_notifier;
 };
 
 void pci_register_bar(PCIDevice *pci_dev, int region_num,
diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
index 79b86f1aad..d08c1590d2 100644
--- a/hw/virtio-pci.c
+++ b/hw/virtio-pci.c
@@ -24,6 +24,7 @@
 #include "virtio-scsi.h"
 #include "pci.h"
 #include "qemu-error.h"
+#include "msi.h"
 #include "msix.h"
 #include "net.h"
 #include "loader.h"
@@ -539,6 +540,107 @@ static void virtio_pci_guest_notifier_read(void *opaque)
     }
 }
 
+static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy,
+                                        unsigned int queue_no,
+                                        unsigned int vector,
+                                        MSIMessage msg)
+{
+    VirtQueue *vq = virtio_get_queue(proxy->vdev, queue_no);
+    VirtIOIRQFD *irqfd = &proxy->vector_irqfd[vector];
+    int fd, ret;
+
+    fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vq));
+
+    if (irqfd->users == 0) {
+        ret = kvm_irqchip_add_msi_route(kvm_state, msg);
+        if (ret < 0) {
+            return ret;
+        }
+        irqfd->virq = ret;
+    }
+    irqfd->users++;
+
+    ret = kvm_irqchip_add_irqfd(kvm_state, fd, irqfd->virq);
+    if (ret < 0) {
+        if (--irqfd->users == 0) {
+            kvm_irqchip_release_virq(kvm_state, irqfd->virq);
+        }
+        return ret;
+    }
+
+    qemu_set_fd_handler(fd, NULL, NULL, NULL);
+
+    return 0;
+}
+
+static void kvm_virtio_pci_vq_vector_release(VirtIOPCIProxy *proxy,
+                                             unsigned int queue_no,
+                                             unsigned int vector)
+{
+    VirtQueue *vq = virtio_get_queue(proxy->vdev, queue_no);
+    VirtIOIRQFD *irqfd = &proxy->vector_irqfd[vector];
+    int fd, ret;
+
+    fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vq));
+
+    ret = kvm_irqchip_remove_irqfd(kvm_state, fd, irqfd->virq);
+    assert(ret == 0);
+
+    if (--irqfd->users == 0) {
+        kvm_irqchip_release_virq(kvm_state, irqfd->virq);
+    }
+
+    qemu_set_fd_handler(fd, virtio_pci_guest_notifier_read, NULL, vq);
+}
+
+static int kvm_virtio_pci_vector_use(PCIDevice *dev, unsigned vector,
+                                     MSIMessage msg)
+{
+    VirtIOPCIProxy *proxy = container_of(dev, VirtIOPCIProxy, pci_dev);
+    VirtIODevice *vdev = proxy->vdev;
+    int ret, queue_no;
+
+    for (queue_no = 0; queue_no < VIRTIO_PCI_QUEUE_MAX; queue_no++) {
+        if (!virtio_queue_get_num(vdev, queue_no)) {
+            break;
+        }
+        if (virtio_queue_vector(vdev, queue_no) != vector) {
+            continue;
+        }
+        ret = kvm_virtio_pci_vq_vector_use(proxy, queue_no, vector, msg);
+        if (ret < 0) {
+            goto undo;
+        }
+    }
+    return 0;
+
+undo:
+    while (--queue_no >= 0) {
+        if (virtio_queue_vector(vdev, queue_no) != vector) {
+            continue;
+        }
+        kvm_virtio_pci_vq_vector_release(proxy, queue_no, vector);
+    }
+    return ret;
+}
+
+static void kvm_virtio_pci_vector_release(PCIDevice *dev, unsigned vector)
+{
+    VirtIOPCIProxy *proxy = container_of(dev, VirtIOPCIProxy, pci_dev);
+    VirtIODevice *vdev = proxy->vdev;
+    int queue_no;
+
+    for (queue_no = 0; queue_no < VIRTIO_PCI_QUEUE_MAX; queue_no++) {
+        if (!virtio_queue_get_num(vdev, queue_no)) {
+            break;
+        }
+        if (virtio_queue_vector(vdev, queue_no) != vector) {
+            continue;
+        }
+        kvm_virtio_pci_vq_vector_release(proxy, queue_no, vector);
+    }
+}
+
 static int virtio_pci_set_guest_notifier(void *opaque, int n, bool assign)
 {
     VirtIOPCIProxy *proxy = opaque;
@@ -555,6 +657,9 @@ static int virtio_pci_set_guest_notifier(void *opaque, int n, bool assign)
     } else {
         qemu_set_fd_handler(event_notifier_get_fd(notifier),
                             NULL, NULL, NULL);
+        /* Test and clear notifier before closing it,
+         * in case poll callback didn't have time to run. */
+        virtio_pci_guest_notifier_read(vq);
         event_notifier_cleanup(notifier);
     }
 
@@ -573,6 +678,13 @@ static int virtio_pci_set_guest_notifiers(void *opaque, bool assign)
     VirtIODevice *vdev = proxy->vdev;
     int r, n;
 
+    /* Must unset vector notifier while guest notifier is still assigned */
+    if (kvm_irqchip_in_kernel() && !assign) {
+        msix_unset_vector_notifiers(&proxy->pci_dev);
+        g_free(proxy->vector_irqfd);
+        proxy->vector_irqfd = NULL;
+    }
+
     for (n = 0; n < VIRTIO_PCI_QUEUE_MAX; n++) {
         if (!virtio_queue_get_num(vdev, n)) {
             break;
@@ -584,10 +696,24 @@ static int virtio_pci_set_guest_notifiers(void *opaque, bool assign)
         }
     }
 
+    /* Must set vector notifier after guest notifier has been assigned */
+    if (kvm_irqchip_in_kernel() && assign) {
+        proxy->vector_irqfd =
+            g_malloc0(sizeof(*proxy->vector_irqfd) *
+                      msix_nr_vectors_allocated(&proxy->pci_dev));
+        r = msix_set_vector_notifiers(&proxy->pci_dev,
+                                      kvm_virtio_pci_vector_use,
+                                      kvm_virtio_pci_vector_release);
+        if (r < 0) {
+            goto assign_error;
+        }
+    }
+
     return 0;
 
 assign_error:
     /* We get here on assignment failure. Recover by undoing for VQs 0 .. n. */
+    assert(assign);
     while (--n >= 0) {
         virtio_pci_set_guest_notifier(opaque, n, !assign);
     }
diff --git a/hw/virtio-pci.h b/hw/virtio-pci.h
index 889e59e421..91b791ba9d 100644
--- a/hw/virtio-pci.h
+++ b/hw/virtio-pci.h
@@ -26,6 +26,11 @@
 #define VIRTIO_PCI_FLAG_USE_IOEVENTFD   (1 << VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT)
 
 typedef struct {
+    int virq;
+    unsigned int users;
+} VirtIOIRQFD;
+
+typedef struct {
     PCIDevice pci_dev;
     VirtIODevice *vdev;
     MemoryRegion bar;
@@ -44,6 +49,7 @@ typedef struct {
     VirtIOSCSIConf scsi;
     bool ioeventfd_disabled;
     bool ioeventfd_started;
+    VirtIOIRQFD *vector_irqfd;
 } VirtIOPCIProxy;
 
 void virtio_init_pci(VirtIOPCIProxy *proxy, VirtIODevice *vdev);
diff --git a/hw/xen.h b/hw/xen.h
index 3ae4cd0f5c..e5926b7b8a 100644
--- a/hw/xen.h
+++ b/hw/xen.h
@@ -57,14 +57,4 @@ void xen_register_framebuffer(struct MemoryRegion *mr);
 #  define HVM_MAX_VCPUS 32
 #endif
 
-static inline int xen_msi_support(void)
-{
-#if defined(CONFIG_XEN_CTRL_INTERFACE_VERSION) \
-    && CONFIG_XEN_CTRL_INTERFACE_VERSION >= 420
-    return xen_enabled();
-#else
-    return 0;
-#endif
-}
-
 #endif /* QEMU_HW_XEN_H */
diff --git a/hw/xen_apic.c b/hw/xen_apic.c
index 1725ff67dd..a9e101f315 100644
--- a/hw/xen_apic.c
+++ b/hw/xen_apic.c
@@ -40,6 +40,11 @@ static void xen_apic_init(APICCommonState *s)
 {
     memory_region_init_io(&s->io_memory, &xen_apic_io_ops, s, "xen-apic-msi",
                           MSI_SPACE_SIZE);
+
+#if defined(CONFIG_XEN_CTRL_INTERFACE_VERSION) \
+    && CONFIG_XEN_CTRL_INTERFACE_VERSION >= 420
+    msi_supported = true;
+#endif
 }
 
 static void xen_apic_set_base(APICCommonState *s, uint64_t val)
diff --git a/kvm-all.c b/kvm-all.c
index 9b73ccfbec..489ee53ad2 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -24,6 +24,7 @@
 #include "qemu-barrier.h"
 #include "sysemu.h"
 #include "hw/hw.h"
+#include "hw/msi.h"
 #include "gdbstub.h"
 #include "kvm.h"
 #include "bswap.h"
@@ -48,6 +49,8 @@
     do { } while (0)
 #endif
 
+#define KVM_MSI_HASHTAB_SIZE    256
+
 typedef struct KVMSlot
 {
     target_phys_addr_t start_addr;
@@ -59,6 +62,11 @@ typedef struct KVMSlot
 
 typedef struct kvm_dirty_log KVMDirtyLog;
 
+typedef struct KVMMSIRoute {
+    struct kvm_irq_routing_entry kroute;
+    QTAILQ_ENTRY(KVMMSIRoute) entry;
+} KVMMSIRoute;
+
 struct KVMState
 {
     KVMSlot slots[32];
@@ -86,7 +94,9 @@ struct KVMState
     struct kvm_irq_routing *irq_routes;
     int nr_allocated_irq_routes;
     uint32_t *used_gsi_bitmap;
-    unsigned int max_gsi;
+    unsigned int gsi_count;
+    QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
+    bool direct_msi;
 #endif
 };
 
@@ -859,14 +869,17 @@ int kvm_irqchip_set_irq(KVMState *s, int irq, int level)
 #ifdef KVM_CAP_IRQ_ROUTING
 static void set_gsi(KVMState *s, unsigned int gsi)
 {
-    assert(gsi < s->max_gsi);
-
     s->used_gsi_bitmap[gsi / 32] |= 1U << (gsi % 32);
 }
 
+static void clear_gsi(KVMState *s, unsigned int gsi)
+{
+    s->used_gsi_bitmap[gsi / 32] &= ~(1U << (gsi % 32));
+}
+
 static void kvm_init_irq_routing(KVMState *s)
 {
-    int gsi_count;
+    int gsi_count, i;
 
     gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING);
     if (gsi_count > 0) {
@@ -875,7 +888,7 @@ static void kvm_init_irq_routing(KVMState *s)
         /* Round up so we can search ints using ffs */
         gsi_bits = ALIGN(gsi_count, 32);
         s->used_gsi_bitmap = g_malloc0(gsi_bits / 8);
-        s->max_gsi = gsi_bits;
+        s->gsi_count = gsi_count;
 
         /* Mark any over-allocated bits as already in use */
         for (i = gsi_count; i < gsi_bits; i++) {
@@ -886,9 +899,24 @@ static void kvm_init_irq_routing(KVMState *s)
     s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
     s->nr_allocated_irq_routes = 0;
 
+    if (!s->direct_msi) {
+        for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) {
+            QTAILQ_INIT(&s->msi_hashtab[i]);
+        }
+    }
+
     kvm_arch_init_irq_routing(s);
 }
 
+static void kvm_irqchip_commit_routes(KVMState *s)
+{
+    int ret;
+
+    s->irq_routes->flags = 0;
+    ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
+    assert(ret == 0);
+}
+
 static void kvm_add_routing_entry(KVMState *s,
                                   struct kvm_irq_routing_entry *entry)
 {
@@ -914,12 +942,16 @@ static void kvm_add_routing_entry(KVMState *s,
     new->u = entry->u;
 
     set_gsi(s, entry->gsi);
+
+    kvm_irqchip_commit_routes(s);
 }
 
-void kvm_irqchip_add_route(KVMState *s, int irq, int irqchip, int pin)
+void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
 {
     struct kvm_irq_routing_entry e;
 
+    assert(pin < s->gsi_count);
+
     e.gsi = irq;
     e.type = KVM_IRQ_ROUTING_IRQCHIP;
     e.flags = 0;
@@ -928,10 +960,167 @@ void kvm_irqchip_add_route(KVMState *s, int irq, int irqchip, int pin)
     kvm_add_routing_entry(s, &e);
 }
 
-int kvm_irqchip_commit_routes(KVMState *s)
+void kvm_irqchip_release_virq(KVMState *s, int virq)
 {
-    s->irq_routes->flags = 0;
-    return kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
+    struct kvm_irq_routing_entry *e;
+    int i;
+
+    for (i = 0; i < s->irq_routes->nr; i++) {
+        e = &s->irq_routes->entries[i];
+        if (e->gsi == virq) {
+            s->irq_routes->nr--;
+            *e = s->irq_routes->entries[s->irq_routes->nr];
+        }
+    }
+    clear_gsi(s, virq);
+
+    kvm_irqchip_commit_routes(s);
+}
+
+static unsigned int kvm_hash_msi(uint32_t data)
+{
+    /* This is optimized for IA32 MSI layout. However, no other arch shall
+     * repeat the mistake of not providing a direct MSI injection API. */
+    return data & 0xff;
+}
+
+static void kvm_flush_dynamic_msi_routes(KVMState *s)
+{
+    KVMMSIRoute *route, *next;
+    unsigned int hash;
+
+    for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) {
+        QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) {
+            kvm_irqchip_release_virq(s, route->kroute.gsi);
+            QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry);
+            g_free(route);
+        }
+    }
+}
+
+static int kvm_irqchip_get_virq(KVMState *s)
+{
+    uint32_t *word = s->used_gsi_bitmap;
+    int max_words = ALIGN(s->gsi_count, 32) / 32;
+    int i, bit;
+    bool retry = true;
+
+again:
+    /* Return the lowest unused GSI in the bitmap */
+    for (i = 0; i < max_words; i++) {
+        bit = ffs(~word[i]);
+        if (!bit) {
+            continue;
+        }
+
+        return bit - 1 + i * 32;
+    }
+    if (!s->direct_msi && retry) {
+        retry = false;
+        kvm_flush_dynamic_msi_routes(s);
+        goto again;
+    }
+    return -ENOSPC;
+
+}
+
+static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg)
+{
+    unsigned int hash = kvm_hash_msi(msg.data);
+    KVMMSIRoute *route;
+
+    QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) {
+        if (route->kroute.u.msi.address_lo == (uint32_t)msg.address &&
+            route->kroute.u.msi.address_hi == (msg.address >> 32) &&
+            route->kroute.u.msi.data == msg.data) {
+            return route;
+        }
+    }
+    return NULL;
+}
+
+int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
+{
+    struct kvm_msi msi;
+    KVMMSIRoute *route;
+
+    if (s->direct_msi) {
+        msi.address_lo = (uint32_t)msg.address;
+        msi.address_hi = msg.address >> 32;
+        msi.data = msg.data;
+        msi.flags = 0;
+        memset(msi.pad, 0, sizeof(msi.pad));
+
+        return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
+    }
+
+    route = kvm_lookup_msi_route(s, msg);
+    if (!route) {
+        int virq;
+
+        virq = kvm_irqchip_get_virq(s);
+        if (virq < 0) {
+            return virq;
+        }
+
+        route = g_malloc(sizeof(KVMMSIRoute));
+        route->kroute.gsi = virq;
+        route->kroute.type = KVM_IRQ_ROUTING_MSI;
+        route->kroute.flags = 0;
+        route->kroute.u.msi.address_lo = (uint32_t)msg.address;
+        route->kroute.u.msi.address_hi = msg.address >> 32;
+        route->kroute.u.msi.data = msg.data;
+
+        kvm_add_routing_entry(s, &route->kroute);
+
+        QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route,
+                           entry);
+    }
+
+    assert(route->kroute.type == KVM_IRQ_ROUTING_MSI);
+
+    return kvm_irqchip_set_irq(s, route->kroute.gsi, 1);
+}
+
+int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg)
+{
+    struct kvm_irq_routing_entry kroute;
+    int virq;
+
+    if (!kvm_irqchip_in_kernel()) {
+        return -ENOSYS;
+    }
+
+    virq = kvm_irqchip_get_virq(s);
+    if (virq < 0) {
+        return virq;
+    }
+
+    kroute.gsi = virq;
+    kroute.type = KVM_IRQ_ROUTING_MSI;
+    kroute.flags = 0;
+    kroute.u.msi.address_lo = (uint32_t)msg.address;
+    kroute.u.msi.address_hi = msg.address >> 32;
+    kroute.u.msi.data = msg.data;
+
+    kvm_add_routing_entry(s, &kroute);
+
+    return virq;
+}
+
+static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign)
+{
+    struct kvm_irqfd irqfd = {
+        .fd = fd,
+        .gsi = virq,
+        .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
+    };
+
+    if (!kvm_irqchip_in_kernel()) {
+        return -ENOSYS;
+    }
+
+    return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd);
 }
 
 #else /* !KVM_CAP_IRQ_ROUTING */
@@ -939,8 +1128,33 @@ int kvm_irqchip_commit_routes(KVMState *s)
 static void kvm_init_irq_routing(KVMState *s)
 {
 }
+
+int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
+{
+    abort();
+}
+
+int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg)
+{
+    abort();
+}
+
+static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign)
+{
+    abort();
+}
 #endif /* !KVM_CAP_IRQ_ROUTING */
 
+int kvm_irqchip_add_irqfd(KVMState *s, int fd, int virq)
+{
+    return kvm_irqchip_assign_irqfd(s, fd, virq, true);
+}
+
+int kvm_irqchip_remove_irqfd(KVMState *s, int fd, int virq)
+{
+    return kvm_irqchip_assign_irqfd(s, fd, virq, false);
+}
+
 static int kvm_irqchip_create(KVMState *s)
 {
     QemuOptsList *list = qemu_find_opts("machine");
@@ -948,7 +1162,7 @@ static int kvm_irqchip_create(KVMState *s)
 
     if (QTAILQ_EMPTY(&list->head) ||
         !qemu_opt_get_bool(QTAILQ_FIRST(&list->head),
-                           "kernel_irqchip", false) ||
+                           "kernel_irqchip", true) ||
         !kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
         return 0;
     }
@@ -1072,6 +1286,8 @@ int kvm_init(void)
     s->pit_state2 = kvm_check_extension(s, KVM_CAP_PIT_STATE2);
 #endif
 
+    s->direct_msi = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0);
+
     ret = kvm_arch_init(s);
     if (ret < 0) {
         goto err;
diff --git a/kvm-stub.c b/kvm-stub.c
index 47c573d6f3..ec9a36454d 100644
--- a/kvm-stub.c
+++ b/kvm-stub.c
@@ -12,10 +12,14 @@
 
 #include "qemu-common.h"
 #include "hw/hw.h"
+#include "hw/msi.h"
 #include "cpu.h"
 #include "gdbstub.h"
 #include "kvm.h"
 
+KVMState *kvm_state;
+bool kvm_kernel_irqchip;
+
 int kvm_init_vcpu(CPUArchState *env)
 {
     return -ENOSYS;
@@ -128,3 +132,22 @@ int kvm_on_sigbus(int code, void *addr)
 {
     return 1;
 }
+
+int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg)
+{
+    return -ENOSYS;
+}
+
+void kvm_irqchip_release_virq(KVMState *s, int virq)
+{
+}
+
+int kvm_irqchip_add_irqfd(KVMState *s, int fd, int virq)
+{
+    return -ENOSYS;
+}
+
+int kvm_irqchip_remove_irqfd(KVMState *s, int fd, int virq)
+{
+    return -ENOSYS;
+}
diff --git a/kvm.h b/kvm.h
index 4ccae8c0c8..9c7b0ea6ae 100644
--- a/kvm.h
+++ b/kvm.h
@@ -44,6 +44,10 @@ typedef struct KVMCapabilityInfo {
 #define KVM_CAP_INFO(CAP) { "KVM_CAP_" stringify(CAP), KVM_CAP_##CAP }
 #define KVM_CAP_LAST_INFO { NULL, 0 }
 
+struct KVMState;
+typedef struct KVMState KVMState;
+extern KVMState *kvm_state;
+
 /* external API */
 
 int kvm_init(void);
@@ -88,10 +92,6 @@ int kvm_on_sigbus(int code, void *addr);
 
 /* internal API */
 
-struct KVMState;
-typedef struct KVMState KVMState;
-extern KVMState *kvm_state;
-
 int kvm_ioctl(KVMState *s, int type, ...);
 
 int kvm_vm_ioctl(KVMState *s, int type, ...);
@@ -132,9 +132,9 @@ int kvm_arch_on_sigbus(int code, void *addr);
 void kvm_arch_init_irq_routing(KVMState *s);
 
 int kvm_irqchip_set_irq(KVMState *s, int irq, int level);
+int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg);
 
-void kvm_irqchip_add_route(KVMState *s, int gsi, int irqchip, int pin);
-int kvm_irqchip_commit_routes(KVMState *s);
+void kvm_irqchip_add_irq_route(KVMState *s, int gsi, int irqchip, int pin);
 
 void kvm_put_apic_state(DeviceState *d, struct kvm_lapic_state *kapic);
 void kvm_get_apic_state(DeviceState *d, struct kvm_lapic_state *kapic);
@@ -212,4 +212,10 @@ int kvm_set_ioeventfd_mmio(int fd, uint32_t adr, uint32_t val, bool assign,
                            uint32_t size);
 
 int kvm_set_ioeventfd_pio_word(int fd, uint16_t adr, uint16_t val, bool assign);
+
+int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg);
+void kvm_irqchip_release_virq(KVMState *s, int virq);
+
+int kvm_irqchip_add_irqfd(KVMState *s, int fd, int virq);
+int kvm_irqchip_remove_irqfd(KVMState *s, int fd, int virq);
 #endif
diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
index ee7bd9cc32..c4426ec73d 100644
--- a/linux-headers/linux/kvm.h
+++ b/linux-headers/linux/kvm.h
@@ -449,6 +449,30 @@ struct kvm_ppc_pvinfo {
 	__u8  pad[108];
 };
 
+/* for KVM_PPC_GET_SMMU_INFO */
+#define KVM_PPC_PAGE_SIZES_MAX_SZ	8
+
+struct kvm_ppc_one_page_size {
+	__u32 page_shift;	/* Page shift (or 0) */
+	__u32 pte_enc;		/* Encoding in the HPTE (>>12) */
+};
+
+struct kvm_ppc_one_seg_page_size {
+	__u32 page_shift;	/* Base page shift of segment (or 0) */
+	__u32 slb_enc;		/* SLB encoding for BookS */
+	struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ];
+};
+
+#define KVM_PPC_PAGE_SIZES_REAL		0x00000001
+#define KVM_PPC_1T_SEGMENTS		0x00000002
+
+struct kvm_ppc_smmu_info {
+	__u64 flags;
+	__u32 slb_size;
+	__u32 pad;
+	struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ];
+};
+
 #define KVMIO 0xAE
 
 /* machine type bits, to be used as argument to KVM_CREATE_VM */
@@ -590,6 +614,8 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_SYNC_REGS 74
 #define KVM_CAP_PCI_2_3 75
 #define KVM_CAP_KVMCLOCK_CTRL 76
+#define KVM_CAP_SIGNAL_MSI 77
+#define KVM_CAP_PPC_GET_SMMU_INFO 78
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -715,6 +741,14 @@ struct kvm_one_reg {
 	__u64 addr;
 };
 
+struct kvm_msi {
+	__u32 address_lo;
+	__u32 address_hi;
+	__u32 data;
+	__u32 flags;
+	__u8  pad[16];
+};
+
 /*
  * ioctls for VM fds
  */
@@ -789,6 +823,10 @@ struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_PCI_2_3 */
 #define KVM_ASSIGN_SET_INTX_MASK  _IOW(KVMIO,  0xa4, \
 				       struct kvm_assigned_pci_dev)
+/* Available with KVM_CAP_SIGNAL_MSI */
+#define KVM_SIGNAL_MSI            _IOW(KVMIO,  0xa5, struct kvm_msi)
+/* Available with KVM_CAP_PPC_GET_SMMU_INFO */
+#define KVM_PPC_GET_SMMU_INFO	  _IOR(KVMIO,  0xa6, struct kvm_ppc_smmu_info)
 
 /*
  * ioctls for vcpu fds
diff --git a/qemu-common.h b/qemu-common.h
index cccfb42dd6..91e056296d 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -251,6 +251,7 @@ typedef struct PCIEAERLog PCIEAERLog;
 typedef struct PCIEAERErr PCIEAERErr;
 typedef struct PCIEPort PCIEPort;
 typedef struct PCIESlot PCIESlot;
+typedef struct MSIMessage MSIMessage;
 typedef struct SerialState SerialState;
 typedef struct IRQState *qemu_irq;
 typedef struct PCMCIACardState PCMCIACardState;
diff --git a/scripts/kvm/vmxcap b/scripts/kvm/vmxcap
index a74ce71917..cbe6440ba3 100755
--- a/scripts/kvm/vmxcap
+++ b/scripts/kvm/vmxcap
@@ -22,6 +22,7 @@ MSR_IA32_VMX_TRUE_PINBASED_CTLS = 0x48D
 MSR_IA32_VMX_TRUE_PROCBASED_CTLS = 0x48E
 MSR_IA32_VMX_TRUE_EXIT_CTLS = 0x48F
 MSR_IA32_VMX_TRUE_ENTRY_CTLS = 0x490
+MSR_IA32_VMX_VMFUNC = 0x491
 
 class msr(object):
     def __init__(self):
@@ -147,6 +148,9 @@ controls = [
             6: 'WBINVD exiting',
             7: 'Unrestricted guest',
             10: 'PAUSE-loop exiting',
+            11: 'RDRAND exiting',
+            12: 'Enable INVPCID',
+            13: 'Enable VM functions',
             },
         cap_msr = MSR_IA32_VMX_PROCBASED_CTLS2,
         ),
@@ -193,6 +197,7 @@ controls = [
             8: 'Wait-for-SIPI activity state',
             (16,24): 'Number of CR3-target values',
             (25,27): 'MSR-load/store count recommenation',
+            28: 'IA32_SMM_MONITOR_CTL[2] can be set to 1',
             (32,62): 'MSEG revision identifier',
             },
         msr = MSR_IA32_VMX_MISC_CTLS,
@@ -208,6 +213,7 @@ controls = [
             16: '2MB EPT pages',
             17: '1GB EPT pages',
             20: 'INVEPT supported',
+            21: 'EPT accessed and dirty flags',
             25: 'Single-context INVEPT',
             26: 'All-context INVEPT',
             32: 'INVVPID supported',
@@ -218,6 +224,13 @@ controls = [
             },
         msr = MSR_IA32_VMX_EPT_VPID_CAP,
         ),
+    Misc(
+        name = 'VM Functions',
+        bits = {
+            0: 'EPTP Switching',
+            },
+        msr = MSR_IA32_VMX_VMFUNC,
+        ),
     ]
 
 for c in controls: