diff options
Diffstat (limited to 'linux-headers/linux/vfio.h')
| -rw-r--r-- | linux-headers/linux/vfio.h | 278 |
1 files changed, 274 insertions, 4 deletions
diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index ede44b5572..c59692ce0b 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -819,12 +819,20 @@ struct vfio_device_feature { * VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P means that RUNNING_P2P * is supported in addition to the STOP_COPY states. * + * VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY means that + * PRE_COPY is supported in addition to the STOP_COPY states. + * + * VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P | VFIO_MIGRATION_PRE_COPY + * means that RUNNING_P2P, PRE_COPY and PRE_COPY_P2P are supported + * in addition to the STOP_COPY states. + * * Other combinations of flags have behavior to be defined in the future. */ struct vfio_device_feature_migration { __aligned_u64 flags; #define VFIO_MIGRATION_STOP_COPY (1 << 0) #define VFIO_MIGRATION_P2P (1 << 1) +#define VFIO_MIGRATION_PRE_COPY (1 << 2) }; #define VFIO_DEVICE_FEATURE_MIGRATION 1 @@ -875,8 +883,13 @@ struct vfio_device_feature_mig_state { * RESUMING - The device is stopped and is loading a new internal state * ERROR - The device has failed and must be reset * - * And 1 optional state to support VFIO_MIGRATION_P2P: + * And optional states to support VFIO_MIGRATION_P2P: * RUNNING_P2P - RUNNING, except the device cannot do peer to peer DMA + * And VFIO_MIGRATION_PRE_COPY: + * PRE_COPY - The device is running normally but tracking internal state + * changes + * And VFIO_MIGRATION_P2P | VFIO_MIGRATION_PRE_COPY: + * PRE_COPY_P2P - PRE_COPY, except the device cannot do peer to peer DMA * * The FSM takes actions on the arcs between FSM states. The driver implements * the following behavior for the FSM arcs: @@ -908,20 +921,48 @@ struct vfio_device_feature_mig_state { * * To abort a RESUMING session the device must be reset. * + * PRE_COPY -> RUNNING * RUNNING_P2P -> RUNNING * While in RUNNING the device is fully operational, the device may generate * interrupts, DMA, respond to MMIO, all vfio device regions are functional, * and the device may advance its internal state. * + * The PRE_COPY arc will terminate a data transfer session. + * + * PRE_COPY_P2P -> RUNNING_P2P * RUNNING -> RUNNING_P2P * STOP -> RUNNING_P2P * While in RUNNING_P2P the device is partially running in the P2P quiescent * state defined below. * + * The PRE_COPY_P2P arc will terminate a data transfer session. + * + * RUNNING -> PRE_COPY + * RUNNING_P2P -> PRE_COPY_P2P * STOP -> STOP_COPY - * This arc begin the process of saving the device state and will return a - * new data_fd. + * PRE_COPY, PRE_COPY_P2P and STOP_COPY form the "saving group" of states + * which share a data transfer session. Moving between these states alters + * what is streamed in session, but does not terminate or otherwise affect + * the associated fd. + * + * These arcs begin the process of saving the device state and will return a + * new data_fd. The migration driver may perform actions such as enabling + * dirty logging of device state when entering PRE_COPY or PER_COPY_P2P. + * + * Each arc does not change the device operation, the device remains + * RUNNING, P2P quiesced or in STOP. The STOP_COPY state is described below + * in PRE_COPY_P2P -> STOP_COPY. + * + * PRE_COPY -> PRE_COPY_P2P + * Entering PRE_COPY_P2P continues all the behaviors of PRE_COPY above. + * However, while in the PRE_COPY_P2P state, the device is partially running + * in the P2P quiescent state defined below, like RUNNING_P2P. + * + * PRE_COPY_P2P -> PRE_COPY + * This arc allows returning the device to a full RUNNING behavior while + * continuing all the behaviors of PRE_COPY. * + * PRE_COPY_P2P -> STOP_COPY * While in the STOP_COPY state the device has the same behavior as STOP * with the addition that the data transfers session continues to stream the * migration state. End of stream on the FD indicates the entire device @@ -939,6 +980,13 @@ struct vfio_device_feature_mig_state { * device state for this arc if required to prepare the device to receive the * migration data. * + * STOP_COPY -> PRE_COPY + * STOP_COPY -> PRE_COPY_P2P + * These arcs are not permitted and return error if requested. Future + * revisions of this API may define behaviors for these arcs, in this case + * support will be discoverable by a new flag in + * VFIO_DEVICE_FEATURE_MIGRATION. + * * any -> ERROR * ERROR cannot be specified as a device state, however any transition request * can be failed with an errno return and may then move the device_state into @@ -950,7 +998,7 @@ struct vfio_device_feature_mig_state { * The optional peer to peer (P2P) quiescent state is intended to be a quiescent * state for the device for the purposes of managing multiple devices within a * user context where peer-to-peer DMA between devices may be active. The - * RUNNING_P2P states must prevent the device from initiating + * RUNNING_P2P and PRE_COPY_P2P states must prevent the device from initiating * any new P2P DMA transactions. If the device can identify P2P transactions * then it can stop only P2P DMA, otherwise it must stop all DMA. The migration * driver must complete any such outstanding operations prior to completing the @@ -963,6 +1011,8 @@ struct vfio_device_feature_mig_state { * above FSM arcs. As there are multiple paths through the FSM arcs the path * should be selected based on the following rules: * - Select the shortest path. + * - The path cannot have saving group states as interior arcs, only + * starting/end states. * Refer to vfio_mig_get_next_state() for the result of the algorithm. * * The automatic transit through the FSM arcs that make up the combination @@ -976,6 +1026,9 @@ struct vfio_device_feature_mig_state { * support them. The user can discover if these states are supported by using * VFIO_DEVICE_FEATURE_MIGRATION. By using combination transitions the user can * avoid knowing about these optional states if the kernel driver supports them. + * + * Arcs touching PRE_COPY and PRE_COPY_P2P are removed if support for PRE_COPY + * is not present. */ enum vfio_device_mig_state { VFIO_DEVICE_STATE_ERROR = 0, @@ -984,8 +1037,225 @@ enum vfio_device_mig_state { VFIO_DEVICE_STATE_STOP_COPY = 3, VFIO_DEVICE_STATE_RESUMING = 4, VFIO_DEVICE_STATE_RUNNING_P2P = 5, + VFIO_DEVICE_STATE_PRE_COPY = 6, + VFIO_DEVICE_STATE_PRE_COPY_P2P = 7, +}; + +/** + * VFIO_MIG_GET_PRECOPY_INFO - _IO(VFIO_TYPE, VFIO_BASE + 21) + * + * This ioctl is used on the migration data FD in the precopy phase of the + * migration data transfer. It returns an estimate of the current data sizes + * remaining to be transferred. It allows the user to judge when it is + * appropriate to leave PRE_COPY for STOP_COPY. + * + * This ioctl is valid only in PRE_COPY states and kernel driver should + * return -EINVAL from any other migration state. + * + * The vfio_precopy_info data structure returned by this ioctl provides + * estimates of data available from the device during the PRE_COPY states. + * This estimate is split into two categories, initial_bytes and + * dirty_bytes. + * + * The initial_bytes field indicates the amount of initial precopy + * data available from the device. This field should have a non-zero initial + * value and decrease as migration data is read from the device. + * It is recommended to leave PRE_COPY for STOP_COPY only after this field + * reaches zero. Leaving PRE_COPY earlier might make things slower. + * + * The dirty_bytes field tracks device state changes relative to data + * previously retrieved. This field starts at zero and may increase as + * the internal device state is modified or decrease as that modified + * state is read from the device. + * + * Userspace may use the combination of these fields to estimate the + * potential data size available during the PRE_COPY phases, as well as + * trends relative to the rate the device is dirtying its internal + * state, but these fields are not required to have any bearing relative + * to the data size available during the STOP_COPY phase. + * + * Drivers have a lot of flexibility in when and what they transfer during the + * PRE_COPY phase, and how they report this from VFIO_MIG_GET_PRECOPY_INFO. + * + * During pre-copy the migration data FD has a temporary "end of stream" that is + * reached when both initial_bytes and dirty_byte are zero. For instance, this + * may indicate that the device is idle and not currently dirtying any internal + * state. When read() is done on this temporary end of stream the kernel driver + * should return ENOMSG from read(). Userspace can wait for more data (which may + * never come) by using poll. + * + * Once in STOP_COPY the migration data FD has a permanent end of stream + * signaled in the usual way by read() always returning 0 and poll always + * returning readable. ENOMSG may not be returned in STOP_COPY. + * Support for this ioctl is mandatory if a driver claims to support + * VFIO_MIGRATION_PRE_COPY. + * + * Return: 0 on success, -1 and errno set on failure. + */ +struct vfio_precopy_info { + __u32 argsz; + __u32 flags; + __aligned_u64 initial_bytes; + __aligned_u64 dirty_bytes; +}; + +#define VFIO_MIG_GET_PRECOPY_INFO _IO(VFIO_TYPE, VFIO_BASE + 21) + +/* + * Upon VFIO_DEVICE_FEATURE_SET, allow the device to be moved into a low power + * state with the platform-based power management. Device use of lower power + * states depends on factors managed by the runtime power management core, + * including system level support and coordinating support among dependent + * devices. Enabling device low power entry does not guarantee lower power + * usage by the device, nor is a mechanism provided through this feature to + * know the current power state of the device. If any device access happens + * (either from the host or through the vfio uAPI) when the device is in the + * low power state, then the host will move the device out of the low power + * state as necessary prior to the access. Once the access is completed, the + * device may re-enter the low power state. For single shot low power support + * with wake-up notification, see + * VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP below. Access to mmap'd + * device regions is disabled on LOW_POWER_ENTRY and may only be resumed after + * calling LOW_POWER_EXIT. + */ +#define VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY 3 + +/* + * This device feature has the same behavior as + * VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY with the exception that the user + * provides an eventfd for wake-up notification. When the device moves out of + * the low power state for the wake-up, the host will not allow the device to + * re-enter a low power state without a subsequent user call to one of the low + * power entry device feature IOCTLs. Access to mmap'd device regions is + * disabled on LOW_POWER_ENTRY_WITH_WAKEUP and may only be resumed after the + * low power exit. The low power exit can happen either through LOW_POWER_EXIT + * or through any other access (where the wake-up notification has been + * generated). The access to mmap'd device regions will not trigger low power + * exit. + * + * The notification through the provided eventfd will be generated only when + * the device has entered and is resumed from a low power state after + * calling this device feature IOCTL. A device that has not entered low power + * state, as managed through the runtime power management core, will not + * generate a notification through the provided eventfd on access. Calling the + * LOW_POWER_EXIT feature is optional in the case where notification has been + * signaled on the provided eventfd that a resume from low power has occurred. + */ +struct vfio_device_low_power_entry_with_wakeup { + __s32 wakeup_eventfd; + __u32 reserved; +}; + +#define VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP 4 + +/* + * Upon VFIO_DEVICE_FEATURE_SET, disallow use of device low power states as + * previously enabled via VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY or + * VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP device features. + * This device feature IOCTL may itself generate a wakeup eventfd notification + * in the latter case if the device had previously entered a low power state. + */ +#define VFIO_DEVICE_FEATURE_LOW_POWER_EXIT 5 + +/* + * Upon VFIO_DEVICE_FEATURE_SET start/stop device DMA logging. + * VFIO_DEVICE_FEATURE_PROBE can be used to detect if the device supports + * DMA logging. + * + * DMA logging allows a device to internally record what DMAs the device is + * initiating and report them back to userspace. It is part of the VFIO + * migration infrastructure that allows implementing dirty page tracking + * during the pre copy phase of live migration. Only DMA WRITEs are logged, + * and this API is not connected to VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE. + * + * When DMA logging is started a range of IOVAs to monitor is provided and the + * device can optimize its logging to cover only the IOVA range given. Each + * DMA that the device initiates inside the range will be logged by the device + * for later retrieval. + * + * page_size is an input that hints what tracking granularity the device + * should try to achieve. If the device cannot do the hinted page size then + * it's the driver choice which page size to pick based on its support. + * On output the device will return the page size it selected. + * + * ranges is a pointer to an array of + * struct vfio_device_feature_dma_logging_range. + * + * The core kernel code guarantees to support by minimum num_ranges that fit + * into a single kernel page. User space can try higher values but should give + * up if the above can't be achieved as of some driver limitations. + * + * A single call to start device DMA logging can be issued and a matching stop + * should follow at the end. Another start is not allowed in the meantime. + */ +struct vfio_device_feature_dma_logging_control { + __aligned_u64 page_size; + __u32 num_ranges; + __u32 __reserved; + __aligned_u64 ranges; }; +struct vfio_device_feature_dma_logging_range { + __aligned_u64 iova; + __aligned_u64 length; +}; + +#define VFIO_DEVICE_FEATURE_DMA_LOGGING_START 6 + +/* + * Upon VFIO_DEVICE_FEATURE_SET stop device DMA logging that was started + * by VFIO_DEVICE_FEATURE_DMA_LOGGING_START + */ +#define VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP 7 + +/* + * Upon VFIO_DEVICE_FEATURE_GET read back and clear the device DMA log + * + * Query the device's DMA log for written pages within the given IOVA range. + * During querying the log is cleared for the IOVA range. + * + * bitmap is a pointer to an array of u64s that will hold the output bitmap + * with 1 bit reporting a page_size unit of IOVA. The mapping of IOVA to bits + * is given by: + * bitmap[(addr - iova)/page_size] & (1ULL << (addr % 64)) + * + * The input page_size can be any power of two value and does not have to + * match the value given to VFIO_DEVICE_FEATURE_DMA_LOGGING_START. The driver + * will format its internal logging to match the reporting page size, possibly + * by replicating bits if the internal page size is lower than requested. + * + * The LOGGING_REPORT will only set bits in the bitmap and never clear or + * perform any initialization of the user provided bitmap. + * + * If any error is returned userspace should assume that the dirty log is + * corrupted. Error recovery is to consider all memory dirty and try to + * restart the dirty tracking, or to abort/restart the whole migration. + * + * If DMA logging is not enabled, an error will be returned. + * + */ +struct vfio_device_feature_dma_logging_report { + __aligned_u64 iova; + __aligned_u64 length; + __aligned_u64 page_size; + __aligned_u64 bitmap; +}; + +#define VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT 8 + +/* + * Upon VFIO_DEVICE_FEATURE_GET read back the estimated data length that will + * be required to complete stop copy. + * + * Note: Can be called on each device state. + */ + +struct vfio_device_feature_mig_data_size { + __aligned_u64 stop_copy_length; +}; + +#define VFIO_DEVICE_FEATURE_MIG_DATA_SIZE 9 + /* -------- API for Type1 VFIO IOMMU -------- */ /** |