diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index fea9487a263cd298a72832fd4b014d27f52eb7bd..9bdae309ff8e817e85b5bb6c82d040b34a03753e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1726,12 +1726,12 @@ By default, super page will be supported if Intel IOMMU has the capability. With this option, super page will not be supported. - ecs_off [Default Off] - By default, extended context tables will be supported if - the hardware advertises that it has support both for the - extended tables themselves, and also PASID support. With - this option set, extended tables will not be used even - on hardware which claims to support them. + sm_off [Default Off] + By default, scalable mode will be supported if the + hardware advertises that it has support for the scalable + mode translation. With this option set, scalable mode + will not be used even on hardware which claims to support + it. tboot_noforce [Default Off] Do not force the Intel IOMMU enabled under tboot. By default, tboot will force Intel IOMMU on, which diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index a1f417694a48d8b3925e67857d12ffe1add4edb2..c60e294dc3cb5078054ba9e9492a819bebb72ced 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -448,7 +448,7 @@ static int iommu_init_device(struct device *dev) * invalid address), we ignore the capability for the device so * it'll be forced to go into translation mode. */ - if ((iommu_pass_through || !amd_iommu_force_isolation) && + if ((iommu_default_passthrough() || !amd_iommu_force_isolation) && dev_is_pci(dev) && pci_iommuv2_capable(to_pci_dev(dev))) { struct amd_iommu *iommu; @@ -2226,7 +2226,7 @@ static int amd_iommu_add_device(struct device *dev) BUG_ON(!dev_data); - if (iommu_pass_through || dev_data->iommu_v2) + if (dev_data->iommu_v2) iommu_request_dm_for_dev(dev); /* Domains are initialized for this device - have a look what we ended up with */ @@ -2813,7 +2813,7 @@ int __init amd_iommu_init_api(void) int __init amd_iommu_init_dma_ops(void) { - swiotlb = (iommu_pass_through || sme_me_mask) ? 1 : 0; + swiotlb = (iommu_default_passthrough() || sme_me_mask) ? 1 : 0; iommu_detected = 1; /* diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 187812e35c99bf62bfb632fc61fbd08ffc5abb63..0ef86f70b77a0b4f69d212d0081fc90b8e33270b 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -333,6 +333,11 @@ static void dmar_pci_bus_del_dev(struct dmar_pci_notify_info *info) dmar_iommu_notify_scope_dev(info); } +static inline void vf_inherit_msi_domain(struct pci_dev *pdev) +{ + dev_set_msi_domain(&pdev->dev, dev_get_msi_domain(&pdev->physfn->dev)); +} + static int dmar_pci_bus_notifier(struct notifier_block *nb, unsigned long action, void *data) { @@ -342,8 +347,20 @@ static int dmar_pci_bus_notifier(struct notifier_block *nb, /* Only care about add/remove events for physical functions. * For VFs we actually do the lookup based on the corresponding * PF in device_to_iommu() anyway. */ - if (pdev->is_virtfn) + if (pdev->is_virtfn) { + /* + * Ensure that the VF device inherits the irq domain of the + * PF device. Ideally the device would inherit the domain + * from the bus, but DMAR can have multiple units per bus + * which makes this impossible. The VF 'bus' could inherit + * from the PF device, but that's yet another x86'sism to + * inflict on everybody else. + */ + if (action == BUS_NOTIFY_ADD_DEVICE) + vf_inherit_msi_domain(pdev); return NOTIFY_DONE; + } + if (action != BUS_NOTIFY_ADD_DEVICE && action != BUS_NOTIFY_REMOVED_DEVICE) return NOTIFY_DONE; @@ -1173,6 +1190,7 @@ static int qi_check_fault(struct intel_iommu *iommu, int index) int head, tail; struct q_inval *qi = iommu->qi; int wait_index = (index + 1) % QI_LENGTH; + int shift = qi_shift(iommu); if (qi->desc_status[wait_index] == QI_ABORT) return -EAGAIN; @@ -1186,13 +1204,19 @@ static int qi_check_fault(struct intel_iommu *iommu, int index) */ if (fault & DMA_FSTS_IQE) { head = readl(iommu->reg + DMAR_IQH_REG); - if ((head >> DMAR_IQ_SHIFT) == index) { - pr_err("VT-d detected invalid descriptor: " - "low=%llx, high=%llx\n", - (unsigned long long)qi->desc[index].low, - (unsigned long long)qi->desc[index].high); - memcpy(&qi->desc[index], &qi->desc[wait_index], - sizeof(struct qi_desc)); + if ((head >> shift) == index) { + struct qi_desc *desc = qi->desc + head; + + /* + * desc->qw2 and desc->qw3 are either reserved or + * used by software as private data. We won't print + * out these two qw's for security consideration. + */ + pr_err("VT-d detected invalid descriptor: qw0 = %llx, qw1 = %llx\n", + (unsigned long long)desc->qw0, + (unsigned long long)desc->qw1); + memcpy(desc, qi->desc + (wait_index << shift), + 1 << shift); writel(DMA_FSTS_IQE, iommu->reg + DMAR_FSTS_REG); return -EINVAL; } @@ -1204,10 +1228,10 @@ static int qi_check_fault(struct intel_iommu *iommu, int index) */ if (fault & DMA_FSTS_ITE) { head = readl(iommu->reg + DMAR_IQH_REG); - head = ((head >> DMAR_IQ_SHIFT) - 1 + QI_LENGTH) % QI_LENGTH; + head = ((head >> shift) - 1 + QI_LENGTH) % QI_LENGTH; head |= 1; tail = readl(iommu->reg + DMAR_IQT_REG); - tail = ((tail >> DMAR_IQ_SHIFT) - 1 + QI_LENGTH) % QI_LENGTH; + tail = ((tail >> shift) - 1 + QI_LENGTH) % QI_LENGTH; writel(DMA_FSTS_ITE, iommu->reg + DMAR_FSTS_REG); @@ -1235,15 +1259,14 @@ int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu) { int rc; struct q_inval *qi = iommu->qi; - struct qi_desc *hw, wait_desc; + int offset, shift, length; + struct qi_desc wait_desc; int wait_index, index; unsigned long flags; if (!qi) return 0; - hw = qi->desc; - restart: rc = 0; @@ -1256,16 +1279,21 @@ int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu) index = qi->free_head; wait_index = (index + 1) % QI_LENGTH; + shift = qi_shift(iommu); + length = 1 << shift; qi->desc_status[index] = qi->desc_status[wait_index] = QI_IN_USE; - hw[index] = *desc; - - wait_desc.low = QI_IWD_STATUS_DATA(QI_DONE) | + offset = index << shift; + memcpy(qi->desc + offset, desc, length); + wait_desc.qw0 = QI_IWD_STATUS_DATA(QI_DONE) | QI_IWD_STATUS_WRITE | QI_IWD_TYPE; - wait_desc.high = virt_to_phys(&qi->desc_status[wait_index]); + wait_desc.qw1 = virt_to_phys(&qi->desc_status[wait_index]); + wait_desc.qw2 = 0; + wait_desc.qw3 = 0; - hw[wait_index] = wait_desc; + offset = wait_index << shift; + memcpy(qi->desc + offset, &wait_desc, length); qi->free_head = (qi->free_head + 2) % QI_LENGTH; qi->free_cnt -= 2; @@ -1274,7 +1302,7 @@ int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu) * update the HW tail register indicating the presence of * new descriptors. */ - writel(qi->free_head << DMAR_IQ_SHIFT, iommu->reg + DMAR_IQT_REG); + writel(qi->free_head << shift, iommu->reg + DMAR_IQT_REG); while (qi->desc_status[wait_index] != QI_DONE) { /* @@ -1311,8 +1339,10 @@ void qi_global_iec(struct intel_iommu *iommu) { struct qi_desc desc; - desc.low = QI_IEC_TYPE; - desc.high = 0; + desc.qw0 = QI_IEC_TYPE; + desc.qw1 = 0; + desc.qw2 = 0; + desc.qw3 = 0; /* should never fail */ qi_submit_sync(&desc, iommu); @@ -1323,9 +1353,11 @@ void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, { struct qi_desc desc; - desc.low = QI_CC_FM(fm) | QI_CC_SID(sid) | QI_CC_DID(did) + desc.qw0 = QI_CC_FM(fm) | QI_CC_SID(sid) | QI_CC_DID(did) | QI_CC_GRAN(type) | QI_CC_TYPE; - desc.high = 0; + desc.qw1 = 0; + desc.qw2 = 0; + desc.qw3 = 0; qi_submit_sync(&desc, iommu); } @@ -1344,10 +1376,12 @@ void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, if (cap_read_drain(iommu->cap)) dr = 1; - desc.low = QI_IOTLB_DID(did) | QI_IOTLB_DR(dr) | QI_IOTLB_DW(dw) + desc.qw0 = QI_IOTLB_DID(did) | QI_IOTLB_DR(dr) | QI_IOTLB_DW(dw) | QI_IOTLB_GRAN(type) | QI_IOTLB_TYPE; - desc.high = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih) + desc.qw1 = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih) | QI_IOTLB_AM(size_order); + desc.qw2 = 0; + desc.qw3 = 0; qi_submit_sync(&desc, iommu); } @@ -1360,15 +1394,17 @@ void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, if (mask) { WARN_ON_ONCE(addr & ((1ULL << (VTD_PAGE_SHIFT + mask)) - 1)); addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1; - desc.high = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE; + desc.qw1 = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE; } else - desc.high = QI_DEV_IOTLB_ADDR(addr); + desc.qw1 = QI_DEV_IOTLB_ADDR(addr); if (qdep >= QI_DEV_IOTLB_MAX_INVS) qdep = 0; - desc.low = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) | + desc.qw0 = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) | QI_DIOTLB_TYPE | QI_DEV_IOTLB_PFSID(pfsid); + desc.qw2 = 0; + desc.qw3 = 0; qi_submit_sync(&desc, iommu); } @@ -1416,16 +1452,24 @@ static void __dmar_enable_qi(struct intel_iommu *iommu) u32 sts; unsigned long flags; struct q_inval *qi = iommu->qi; + u64 val = virt_to_phys(qi->desc); qi->free_head = qi->free_tail = 0; qi->free_cnt = QI_LENGTH; + /* + * Set DW=1 and QS=1 in IQA_REG when Scalable Mode capability + * is present. + */ + if (ecap_smts(iommu->ecap)) + val |= (1 << 11) | 1; + raw_spin_lock_irqsave(&iommu->register_lock, flags); /* write zero to the tail reg */ writel(0, iommu->reg + DMAR_IQT_REG); - dmar_writeq(iommu->reg + DMAR_IQA_REG, virt_to_phys(qi->desc)); + dmar_writeq(iommu->reg + DMAR_IQA_REG, val); iommu->gcmd |= DMA_GCMD_QIE; writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); @@ -1461,8 +1505,12 @@ int dmar_enable_qi(struct intel_iommu *iommu) qi = iommu->qi; - - desc_page = alloc_pages_node(iommu->node, GFP_ATOMIC | __GFP_ZERO, 0); + /* + * Need two pages to accommodate 256 descriptors of 256 bits each + * if the remapping hardware supports scalable mode translation. + */ + desc_page = alloc_pages_node(iommu->node, GFP_ATOMIC | __GFP_ZERO, + !!ecap_smts(iommu->ecap)); if (!desc_page) { kfree(qi); iommu->qi = NULL; diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index fb63abcf6592784e63924d069e14b1e56bb07f63..1813f38f0b7213e99c53ba433907639faae57ce8 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -315,49 +315,6 @@ static inline void context_clear_entry(struct context_entry *context) context->hi = 0; } -/* - * 0: readable - * 1: writable - * 2-6: reserved - * 7: super page - * 8-10: available - * 11: snoop behavior - * 12-63: Host physcial address - */ -struct dma_pte { - u64 val; -}; - -static inline void dma_clear_pte(struct dma_pte *pte) -{ - pte->val = 0; -} - -static inline u64 dma_pte_addr(struct dma_pte *pte) -{ -#ifdef CONFIG_64BIT - return pte->val & VTD_PAGE_MASK; -#else - /* Must have a full atomic 64-bit read */ - return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK; -#endif -} - -static inline bool dma_pte_present(struct dma_pte *pte) -{ - return (pte->val & 3) != 0; -} - -static inline bool dma_pte_superpage(struct dma_pte *pte) -{ - return (pte->val & DMA_PTE_LARGE_PAGE); -} - -static inline int first_pte_in_page(struct dma_pte *pte) -{ - return !((unsigned long)pte & ~VTD_PAGE_MASK); -} - /* * This domain is a statically identity mapping domain. * 1. This domain creats a static 1:1 mapping to all usable memory. @@ -367,14 +324,16 @@ static inline int first_pte_in_page(struct dma_pte *pte) static struct dmar_domain *si_domain; static int hw_pass_through = 1; +/* si_domain contains mulitple devices */ +#define DOMAIN_FLAG_STATIC_IDENTITY BIT(0) + /* - * Domain represents a virtual machine, more than one devices - * across iommus may be owned in one domain, e.g. kvm guest. + * This is a DMA domain allocated through the iommu domain allocation + * interface. But one or more devices belonging to this domain have + * been chosen to use a private domain. We should avoid to use the + * map/unmap/iova_to_phys APIs on it. */ -#define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 0) - -/* si_domain contains mulitple devices */ -#define DOMAIN_FLAG_STATIC_IDENTITY (1 << 1) +#define DOMAIN_FLAG_LOSE_CHILDREN BIT(1) #define for_each_domain_iommu(idx, domain) \ for (idx = 0; idx < g_num_of_iommus; idx++) \ @@ -408,13 +367,13 @@ static int g_num_of_iommus; static void domain_exit(struct dmar_domain *domain); static void domain_remove_dev_info(struct dmar_domain *domain); -static void dmar_remove_one_dev_info(struct dmar_domain *domain, - struct device *dev); +static void dmar_remove_one_dev_info(struct device *dev); static void __dmar_remove_one_dev_info(struct device_domain_info *info); static void domain_context_clear(struct intel_iommu *iommu, struct device *dev); static int domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); +static bool device_is_rmrr_locked(struct device *dev); #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON int dmar_disabled = 0; @@ -429,38 +388,16 @@ static int dmar_map_gfx = 1; static int dmar_forcedac; static int intel_iommu_strict; static int intel_iommu_superpage = 1; -static int intel_iommu_ecs = 1; -static int intel_iommu_pasid28; +static int intel_iommu_sm = 1; static int iommu_identity_mapping; #define IDENTMAP_ALL 1 #define IDENTMAP_GFX 2 #define IDENTMAP_AZALIA 4 -/* Broadwell and Skylake have broken ECS support — normal so-called "second - * level" translation of DMA requests-without-PASID doesn't actually happen - * unless you also set the NESTE bit in an extended context-entry. Which of - * course means that SVM doesn't work because it's trying to do nested - * translation of the physical addresses it finds in the process page tables, - * through the IOVA->phys mapping found in the "second level" page tables. - * - * The VT-d specification was retroactively changed to change the definition - * of the capability bits and pretend that Broadwell/Skylake never happened... - * but unfortunately the wrong bit was changed. It's ECS which is broken, but - * for some reason it was the PASID capability bit which was redefined (from - * bit 28 on BDW/SKL to bit 40 in future). - * - * So our test for ECS needs to eschew those implementations which set the old - * PASID capabiity bit 28, since those are the ones on which ECS is broken. - * Unless we are working around the 'pasid28' limitations, that is, by putting - * the device into passthrough mode for normal DMA and thus masking the bug. - */ -#define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \ - (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap))) -/* PASID support is thus enabled if ECS is enabled and *either* of the old - * or new capability bits are set. */ -#define pasid_enabled(iommu) (ecs_enabled(iommu) && \ - (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap))) +#define sm_supported(iommu) (intel_iommu_sm && ecap_smts((iommu)->ecap)) +#define pasid_supported(iommu) (sm_supported(iommu) && \ + ecap_pasid((iommu)->ecap)) int intel_iommu_gfx_mapped; EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped); @@ -471,22 +408,25 @@ static LIST_HEAD(device_domain_list); /* * Iterate over elements in device_domain_list and call the specified - * callback @fn against each element. This helper should only be used - * in the context where the device_domain_lock has already been holden. - */ + * callback @fn against each element. +*/ int for_each_device_domain(int (*fn)(struct device_domain_info *info, void *data), void *data) { int ret = 0; + unsigned long flags; struct device_domain_info *info; - assert_spin_locked(&device_domain_lock); + spin_lock_irqsave(&device_domain_lock, flags); list_for_each_entry(info, &device_domain_list, global) { ret = fn(info, data); - if (ret) - return ret; + if (ret) { + spin_unlock_irqrestore(&device_domain_lock, flags); + return ret; + } } - + spin_unlock_irqrestore(&device_domain_lock, flags); + return 0; } @@ -540,15 +480,9 @@ static int __init intel_iommu_setup(char *str) } else if (!strncmp(str, "sp_off", 6)) { pr_info("Disable supported super page\n"); intel_iommu_superpage = 0; - } else if (!strncmp(str, "ecs_off", 7)) { - printk(KERN_INFO - "Intel-IOMMU: disable extended context table support\n"); - intel_iommu_ecs = 0; - } else if (!strncmp(str, "pasid28", 7)) { - printk(KERN_INFO - "Intel-IOMMU: enable pre-production PASID support\n"); - intel_iommu_pasid28 = 1; - iommu_identity_mapping |= IDENTMAP_GFX; + } else if (!strncmp(str, "sm_off", 6)) { + pr_info("Intel-IOMMU: disable scalable mode support\n"); + intel_iommu_sm = 0; } else if (!strncmp(str, "tboot_noforce", 13)) { printk(KERN_INFO "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); @@ -632,22 +566,11 @@ static inline void free_devinfo_mem(void *vaddr) kmem_cache_free(iommu_devinfo_cache, vaddr); } -static inline int domain_type_is_vm(struct dmar_domain *domain) -{ - return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE; -} - static inline int domain_type_is_si(struct dmar_domain *domain) { return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY; } -static inline int domain_type_is_vm_or_si(struct dmar_domain *domain) -{ - return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE | - DOMAIN_FLAG_STATIC_IDENTITY); -} - static inline int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn) { @@ -695,7 +618,9 @@ struct intel_iommu *domain_get_iommu(struct dmar_domain *domain) int iommu_id; /* si_domain and vm domain should not get here. */ - BUG_ON(domain_type_is_vm_or_si(domain)); + if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA)) + return NULL; + for_each_domain_iommu(iommu_id, domain) break; @@ -795,7 +720,7 @@ static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu u64 *entry; entry = &root->lo; - if (ecs_enabled(iommu)) { + if (sm_supported(iommu)) { if (devfn >= 0x80) { devfn -= 0x80; entry = &root->hi; @@ -958,7 +883,7 @@ static void free_context_table(struct intel_iommu *iommu) if (context) free_pgtable_page(context); - if (!ecs_enabled(iommu)) + if (!sm_supported(iommu)) continue; context = iommu_context_addr(iommu, i, 0x80, 0); @@ -1310,8 +1235,6 @@ static void iommu_set_root_entry(struct intel_iommu *iommu) unsigned long flag; addr = virt_to_phys(iommu->root_entry); - if (ecs_enabled(iommu)) - addr |= DMA_RTADDR_RTT; raw_spin_lock_irqsave(&iommu->register_lock, flag); dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); @@ -1325,7 +1248,7 @@ static void iommu_set_root_entry(struct intel_iommu *iommu) raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } -static void iommu_flush_write_buffer(struct intel_iommu *iommu) +void iommu_flush_write_buffer(struct intel_iommu *iommu) { u32 val; unsigned long flag; @@ -1739,6 +1662,16 @@ static int iommu_init_domains(struct intel_iommu *iommu) */ set_bit(0, iommu->domain_ids); + /* + * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid + * entry for first-level or pass-through translation modes should + * be programmed with a domain id different from those used for + * second-level or nested translation. We reserve a domain id for + * this purpose. + */ + if (sm_supported(iommu)) + set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); + return 0; } @@ -1750,32 +1683,15 @@ static void disable_dmar_iommu(struct intel_iommu *iommu) if (!iommu->domains || !iommu->domain_ids) return; -again: spin_lock_irqsave(&device_domain_lock, flags); list_for_each_entry_safe(info, tmp, &device_domain_list, global) { - struct dmar_domain *domain; - if (info->iommu != iommu) continue; if (!info->dev || !info->domain) continue; - domain = info->domain; - __dmar_remove_one_dev_info(info); - - if (!domain_type_is_vm_or_si(domain)) { - /* - * The domain_exit() function can't be called under - * device_domain_lock, as it takes this lock itself. - * So release the lock here and re-run the loop - * afterwards. - */ - spin_unlock_irqrestore(&device_domain_lock, flags); - domain_exit(domain); - goto again; - } } spin_unlock_irqrestore(&device_domain_lock, flags); @@ -1803,7 +1719,7 @@ static void free_dmar_iommu(struct intel_iommu *iommu) free_context_table(iommu); #ifdef CONFIG_INTEL_IOMMU_SVM - if (pasid_enabled(iommu)) { + if (pasid_supported(iommu)) { if (ecap_prs(iommu->ecap)) intel_svm_finish_prq(iommu); intel_svm_exit(iommu); @@ -2006,8 +1922,6 @@ static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu, static void domain_exit(struct dmar_domain *domain) { - struct page *freelist = NULL; - /* Domain 0 is reserved, so dont process it */ if (!domain) return; @@ -2020,9 +1934,12 @@ static void domain_exit(struct dmar_domain *domain) /* destroy iovas */ put_iova_domain(&domain->iovad); - freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw)); + if (domain->pgd) { + struct page *freelist; - dma_free_pagelist(freelist); + freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw)); + dma_free_pagelist(freelist); + } free_domain_mem(domain); } @@ -2362,7 +2279,7 @@ static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, struct scatterlist *sg, unsigned long phys_pfn, unsigned long nr_pages, int prot) { - int ret; + int iommu_id, ret; struct intel_iommu *iommu; /* Do the real mapping first */ @@ -2370,17 +2287,8 @@ static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, if (ret) return ret; - /* Notify about the new mapping */ - if (domain_type_is_vm(domain)) { - /* VM typed domains can have more than one IOMMUs */ - int iommu_id; - for_each_domain_iommu(iommu_id, domain) { - iommu = g_iommus[iommu_id]; - __mapping_notify_one(iommu, domain, iov_pfn, nr_pages); - } - } else { - /* General domains only have one IOMMU */ - iommu = domain_get_iommu(domain); + for_each_domain_iommu(iommu_id, domain) { + iommu = g_iommus[iommu_id]; __mapping_notify_one(iommu, domain, iov_pfn, nr_pages); } @@ -2503,6 +2411,8 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, info->domain = domain; info->iommu = iommu; info->pasid_table = NULL; + info->auxd_enabled = 0; + INIT_LIST_HEAD(&info->auxiliary_domains); if (dev && dev_is_pci(dev)) { struct pci_dev *pdev = to_pci_dev(info->dev); @@ -2513,8 +2423,8 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, dmar_find_matched_atsr_unit(pdev)) info->ats_supported = 1; - if (ecs_enabled(iommu)) { - if (pasid_enabled(iommu)) { + if (sm_supported(iommu)) { + if (pasid_supported(iommu)) { int features = pci_pasid_features(pdev); if (features >= 0) info->pasid_supported = features | 1; @@ -2560,20 +2470,22 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, list_add(&info->global, &device_domain_list); if (dev) dev->archdata.iommu = info; + spin_unlock_irqrestore(&device_domain_lock, flags); - if (dev && dev_is_pci(dev) && info->pasid_supported) { + /* PASID table is mandatory for a PCI device in scalable mode. */ + if (dev && dev_is_pci(dev) && sm_supported(iommu)) { ret = intel_pasid_alloc_table(dev); if (ret) { - pr_warn("No pasid table for %s, pasid disabled\n", - dev_name(dev)); - info->pasid_supported = 0; + pr_err("PASID table allocation for %s failed\n", + dev_name(dev)); + dmar_remove_one_dev_info(dev); + return NULL; } } - spin_unlock_irqrestore(&device_domain_lock, flags); if (dev && domain_context_mapping(domain, dev)) { pr_err("Domain context map for %s failed\n", dev_name(dev)); - dmar_remove_one_dev_info(domain, dev); + dmar_remove_one_dev_info(dev); return NULL; } @@ -2688,7 +2600,6 @@ static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw) } out: - return domain; } @@ -2807,36 +2718,13 @@ static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr, rmrr->end_address); } -#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA -static inline void iommu_prepare_isa(void) -{ - struct pci_dev *pdev; - int ret; - - pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL); - if (!pdev) - return; - - pr_info("Prepare 0-16MiB unity mapping for LPC\n"); - ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1); - - if (ret) - pr_err("Failed to create 0-16MiB identity map - floppy might not work\n"); - - pci_dev_put(pdev); -} -#else -static inline void iommu_prepare_isa(void) -{ - return; -} -#endif /* !CONFIG_INTEL_IOMMU_FLPY_WA */ - static int md_domain_init(struct dmar_domain *domain, int guest_width); static int __init si_domain_init(int hw) { - int nid, ret = 0; + struct dmar_rmrr_unit *rmrr; + struct device *dev; + int i, nid, ret; si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY); if (!si_domain) @@ -2847,8 +2735,6 @@ static int __init si_domain_init(int hw) return -EFAULT; } - pr_debug("Identity mapping domain allocated\n"); - if (hw) return 0; @@ -2864,6 +2750,31 @@ static int __init si_domain_init(int hw) } } + /* + * Normally we use DMA domains for devices which have RMRRs. But we + * loose this requirement for graphic and usb devices. Identity map + * the RMRRs for graphic and USB devices so that they could use the + * si_domain. + */ + for_each_rmrr_units(rmrr) { + for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, + i, dev) { + unsigned long long start = rmrr->base_address; + unsigned long long end = rmrr->end_address; + + if (device_is_rmrr_locked(dev)) + continue; + + if (WARN_ON(end < start || + end >> agaw_to_width(si_domain->agaw))) + continue; + + ret = iommu_domain_identity_map(si_domain, start, end); + if (ret) + return ret; + } + } + return 0; } @@ -2871,9 +2782,6 @@ static int identity_mapping(struct device *dev) { struct device_domain_info *info; - if (likely(!iommu_identity_mapping)) - return 0; - info = dev->archdata.iommu; if (info && info != DUMMY_DEVICE_DOMAIN_INFO) return (info->domain == si_domain); @@ -2959,23 +2867,31 @@ static bool device_is_rmrr_locked(struct device *dev) return true; } -static int iommu_should_identity_map(struct device *dev, int startup) +/* + * Return the required default domain type for a specific device. + * + * @dev: the device in query + * @startup: true if this is during early boot + * + * Returns: + * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain + * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain + * - 0: both identity and dynamic domains work for this device + */ +static int device_def_domain_type(struct device *dev) { if (dev_is_pci(dev)) { struct pci_dev *pdev = to_pci_dev(dev); if (device_is_rmrr_locked(dev)) - return 0; + return IOMMU_DOMAIN_DMA; if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) - return 1; + return IOMMU_DOMAIN_IDENTITY; if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) - return 1; - - if (!(iommu_identity_mapping & IDENTMAP_ALL)) - return 0; + return IOMMU_DOMAIN_IDENTITY; /* * We want to start off with all devices in the 1:1 domain, and @@ -2996,93 +2912,23 @@ static int iommu_should_identity_map(struct device *dev, int startup) */ if (!pci_is_pcie(pdev)) { if (!pci_is_root_bus(pdev->bus)) - return 0; + return IOMMU_DOMAIN_DMA; if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI) - return 0; + return IOMMU_DOMAIN_DMA; } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE) - return 0; + return IOMMU_DOMAIN_DMA; } else { if (device_has_rmrr(dev)) - return 0; - } - - /* - * At boot time, we don't yet know if devices will be 64-bit capable. - * Assume that they will — if they turn out not to be, then we can - * take them out of the 1:1 domain later. - */ - if (!startup) { - /* - * If the device's dma_mask is less than the system's memory - * size then this is not a candidate for identity mapping. - */ - u64 dma_mask = *dev->dma_mask; - - if (dev->coherent_dma_mask && - dev->coherent_dma_mask < dma_mask) - dma_mask = dev->coherent_dma_mask; - - return dma_mask >= dma_get_required_mask(dev); + return IOMMU_DOMAIN_DMA; } - return 1; -} - -static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw) -{ - int ret; - - if (!iommu_should_identity_map(dev, 1)) - return 0; - - ret = domain_add_dev_info(si_domain, dev); - if (!ret) - pr_info("%s identity mapping for device %s\n", - hw ? "Hardware" : "Software", dev_name(dev)); - else if (ret == -ENODEV) - /* device not associated with an iommu */ - ret = 0; - - return ret; + return (iommu_identity_mapping & IDENTMAP_ALL) ? + IOMMU_DOMAIN_IDENTITY : 0; } - -static int __init iommu_prepare_static_identity_mapping(int hw) +static inline int iommu_should_identity_map(struct device *dev) { - struct pci_dev *pdev = NULL; - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu; - struct device *dev; - int i; - int ret = 0; - - for_each_pci_dev(pdev) { - ret = dev_prepare_static_identity_mapping(&pdev->dev, hw); - if (ret) - return ret; - } - - for_each_active_iommu(iommu, drhd) - for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) { - struct acpi_device_physical_node *pn; - struct acpi_device *adev; - - if (dev->bus != &acpi_bus_type) - continue; - - adev= to_acpi_device(dev); - mutex_lock(&adev->physical_node_lock); - list_for_each_entry(pn, &adev->physical_node_list, node) { - ret = dev_prepare_static_identity_mapping(pn->dev, hw); - if (ret) - break; - } - mutex_unlock(&adev->physical_node_lock); - if (ret) - return ret; - } - - return 0; + return device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY; } static void intel_iommu_init_qi(struct intel_iommu *iommu) @@ -3307,11 +3153,8 @@ static int copy_translation_tables(struct intel_iommu *iommu) static int __init init_dmars(void) { struct dmar_drhd_unit *drhd; - struct dmar_rmrr_unit *rmrr; - bool copied_tables = false; - struct device *dev; struct intel_iommu *iommu; - int i, ret; + int ret; /* * for each drhd @@ -3350,7 +3193,7 @@ static int __init init_dmars(void) * We need to ensure the system pasid table is no bigger * than the smallest supported. */ - if (pasid_enabled(iommu)) { + if (pasid_supported(iommu)) { u32 temp = 2 << ecap_pss(iommu->ecap); intel_pasid_max_id = min_t(u32, temp, @@ -3404,14 +3247,13 @@ static int __init init_dmars(void) } else { pr_info("Copied translation tables from previous kernel for %s\n", iommu->name); - copied_tables = true; } } if (!ecap_pass_through(iommu->ecap)) hw_pass_through = 0; #ifdef CONFIG_INTEL_IOMMU_SVM - if (pasid_enabled(iommu)) + if (pasid_supported(iommu)) intel_svm_init(iommu); #endif } @@ -3428,7 +3270,7 @@ static int __init init_dmars(void) iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); } - if (iommu_pass_through) + if (iommu_default_passthrough()) iommu_identity_mapping |= IDENTMAP_ALL; #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA @@ -3440,62 +3282,9 @@ static int __init init_dmars(void) check_tylersburg_isoch(); - if (iommu_identity_mapping) { - ret = si_domain_init(hw_pass_through); - if (ret) - goto free_iommu; - } - - - /* - * If we copied translations from a previous kernel in the kdump - * case, we can not assign the devices to domains now, as that - * would eliminate the old mappings. So skip this part and defer - * the assignment to device driver initialization time. - */ - if (copied_tables) - goto domains_done; - - /* - * If pass through is not set or not enabled, setup context entries for - * identity mappings for rmrr, gfx, and isa and may fall back to static - * identity mapping if iommu_identity_mapping is set. - */ - if (iommu_identity_mapping) { - ret = iommu_prepare_static_identity_mapping(hw_pass_through); - if (ret) { - pr_crit("Failed to setup IOMMU pass-through\n"); - goto free_iommu; - } - } - /* - * For each rmrr - * for each dev attached to rmrr - * do - * locate drhd for dev, alloc domain for dev - * allocate free domain - * allocate page table entries for rmrr - * if context not allocated for bus - * allocate and init context - * set present in root table for this bus - * init context with domain, translation etc - * endfor - * endfor - */ - pr_info("Setting RMRR:\n"); - for_each_rmrr_units(rmrr) { - /* some BIOS lists non-exist devices in DMAR table. */ - for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, - i, dev) { - ret = iommu_prepare_rmrr_dev(rmrr, dev); - if (ret) - pr_err("Mapping reserved region failed\n"); - } - } - - iommu_prepare_isa(); - -domains_done: + ret = si_domain_init(hw_pass_through); + if (ret) + goto free_iommu; /* * for each drhd @@ -3518,7 +3307,7 @@ static int __init init_dmars(void) iommu_flush_write_buffer(iommu); #ifdef CONFIG_INTEL_IOMMU_SVM - if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) { + if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { /* * Call dmar_alloc_hwirq() with dmar_global_lock held, * could cause possible lock race condition. @@ -3533,11 +3322,6 @@ static int __init init_dmars(void) ret = dmar_set_interrupt(iommu); if (ret) goto free_iommu; - - if (!translation_pre_enabled(iommu)) - iommu_enable_translation(iommu); - - iommu_disable_protect_mem_regions(iommu); } return 0; @@ -3588,16 +3372,17 @@ static unsigned long intel_alloc_iova(struct device *dev, return iova_pfn; } -struct dmar_domain *get_valid_domain_for_dev(struct device *dev) +static struct dmar_domain *get_private_domain_for_dev(struct device *dev) { struct dmar_domain *domain, *tmp; struct dmar_rmrr_unit *rmrr; struct device *i_dev; int i, ret; + /* Device shouldn't be attached by any domains. */ domain = find_domain(dev); if (domain) - goto out; + return NULL; domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH); if (!domain) @@ -3638,42 +3423,38 @@ struct dmar_domain *get_valid_domain_for_dev(struct device *dev) /* Check if the dev needs to go through non-identity map and unmap process.*/ static int iommu_no_mapping(struct device *dev) { - int found; + int ret; if (iommu_dummy(dev)) return 1; + ret = identity_mapping(dev); + if (ret) { + u64 dma_mask = *dev->dma_mask; + if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask) + dma_mask = dev->coherent_dma_mask; - if (!iommu_identity_mapping) - return 0; - - found = identity_mapping(dev); - if (found) { - if (iommu_should_identity_map(dev, 0)) + if (dma_mask >= dma_get_required_mask(dev)) return 1; - else { - /* - * 32 bit DMA is removed from si_domain and fall back - * to non-identity mapping. - */ - dmar_remove_one_dev_info(si_domain, dev); - pr_info("32bit %s uses non-identity mapping\n", - dev_name(dev)); - return 0; - } - } else { /* - * In case of a detached 64 bit DMA device from vm, the device - * is put into si_domain for identity mapping. + * 32 bit DMA is removed from si_domain and fall back + * to non-identity mapping. */ - if (iommu_should_identity_map(dev, 0)) { - int ret; - ret = domain_add_dev_info(si_domain, dev); - if (!ret) { - pr_info("64bit %s uses identity mapping\n", - dev_name(dev)); - return 1; + dmar_remove_one_dev_info(dev); + ret = iommu_request_dma_domain_for_dev(dev); + if (ret) { + struct iommu_domain *domain; + struct dmar_domain *dmar_domain; + + domain = iommu_get_domain_for_dev(dev); + if (domain) { + dmar_domain = to_dmar_domain(domain); + dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN; } + dmar_remove_one_dev_info(dev); + get_private_domain_for_dev(dev); } + + dev_info(dev, "32bit DMA uses non-identity mapping\n"); } return 0; @@ -3695,7 +3476,7 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr, if (iommu_no_mapping(dev)) return paddr; - domain = get_valid_domain_for_dev(dev); + domain = find_domain(dev); if (!domain) return 0; @@ -3907,7 +3688,7 @@ static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nele if (iommu_no_mapping(dev)) return intel_nontranslate_map_sg(dev, sglist, nelems, dir); - domain = get_valid_domain_for_dev(dev); + domain = find_domain(dev); if (!domain) return 0; @@ -4406,7 +4187,7 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru) goto out; #ifdef CONFIG_INTEL_IOMMU_SVM - if (pasid_enabled(iommu)) + if (pasid_supported(iommu)) intel_svm_init(iommu); #endif @@ -4423,7 +4204,7 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru) iommu_flush_write_buffer(iommu); #ifdef CONFIG_INTEL_IOMMU_SVM - if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) { + if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { ret = intel_svm_enable_prq(iommu); if (ret) goto disable_iommu; @@ -4620,12 +4401,9 @@ static int device_notifier(struct notifier_block *nb, if (!domain) return 0; - dmar_remove_one_dev_info(domain, dev); - if (!domain_type_is_vm_or_si(domain) && - list_empty(&domain->devices)) - domain_exit(domain); + dmar_remove_one_dev_info(dev); } else if (action == BUS_NOTIFY_ADD_DEVICE) { - if (iommu_should_identity_map(dev, 1)) + if (iommu_should_identity_map(dev)) domain_add_dev_info(si_domain, dev); } @@ -4911,7 +4689,6 @@ int __init intel_iommu_init(void) goto out_free_reserved_range; } up_write(&dmar_global_lock); - pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB) swiotlb = 0; @@ -4934,6 +4711,16 @@ int __init intel_iommu_init(void) register_memory_notifier(&intel_iommu_memory_nb); cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL, intel_iommu_cpu_dead); + + /* Finally, we enable the DMA remapping hardware. */ + for_each_iommu(iommu, drhd) { + if (!translation_pre_enabled(iommu)) + iommu_enable_translation(iommu); + + iommu_disable_protect_mem_regions(iommu); + } + pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); + intel_iommu_enabled = 1; return 0; @@ -4971,6 +4758,7 @@ static void domain_context_clear(struct intel_iommu *iommu, struct device *dev) static void __dmar_remove_one_dev_info(struct device_domain_info *info) { + struct dmar_domain *domain; struct intel_iommu *iommu; unsigned long flags; @@ -4980,6 +4768,7 @@ static void __dmar_remove_one_dev_info(struct device_domain_info *info) return; iommu = info->iommu; + domain = info->domain; if (info->dev) { iommu_disable_dev_iotlb(info); @@ -4990,21 +4779,26 @@ static void __dmar_remove_one_dev_info(struct device_domain_info *info) unlink_domain_info(info); spin_lock_irqsave(&iommu->lock, flags); - domain_detach_iommu(info->domain, iommu); + domain_detach_iommu(domain, iommu); spin_unlock_irqrestore(&iommu->lock, flags); + /* free the private domain */ + if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN && + !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) + domain_exit(info->domain); + free_devinfo_mem(info); } -static void dmar_remove_one_dev_info(struct dmar_domain *domain, - struct device *dev) +static void dmar_remove_one_dev_info(struct device *dev) { struct device_domain_info *info; unsigned long flags; spin_lock_irqsave(&device_domain_lock, flags); info = dev->archdata.iommu; - __dmar_remove_one_dev_info(info); + if (info) + __dmar_remove_one_dev_info(info); spin_unlock_irqrestore(&device_domain_lock, flags); } @@ -5038,63 +4832,184 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) struct dmar_domain *dmar_domain; struct iommu_domain *domain; - if (type != IOMMU_DOMAIN_UNMANAGED) - return NULL; + switch (type) { + case IOMMU_DOMAIN_DMA: + /* fallthrough */ + case IOMMU_DOMAIN_UNMANAGED: + dmar_domain = alloc_domain(0); + if (!dmar_domain) { + pr_err("Can't allocate dmar_domain\n"); + return NULL; + } + if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { + pr_err("Domain initialization failed\n"); + domain_exit(dmar_domain); + return NULL; + } - dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE); - if (!dmar_domain) { - pr_err("Can't allocate dmar_domain\n"); - return NULL; - } - if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { - pr_err("Domain initialization failed\n"); - domain_exit(dmar_domain); + if (type == IOMMU_DOMAIN_DMA && + init_iova_flush_queue(&dmar_domain->iovad, + iommu_flush_iova, iova_entry_free)) { + pr_warn("iova flush queue initialization failed\n"); + intel_iommu_strict = 1; + } + + domain_update_iommu_cap(dmar_domain); + domain = &dmar_domain->domain; + domain->geometry.aperture_start = 0; + domain->geometry.aperture_end = + __DOMAIN_MAX_ADDR(dmar_domain->gaw); + domain->geometry.force_aperture = true; + + return domain; + case IOMMU_DOMAIN_IDENTITY: + return &si_domain->domain; + default: return NULL; } - domain_update_iommu_cap(dmar_domain); - - domain = &dmar_domain->domain; - domain->geometry.aperture_start = 0; - domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw); - domain->geometry.force_aperture = true; - return domain; + return NULL; } static void intel_iommu_domain_free(struct iommu_domain *domain) { - domain_exit(to_dmar_domain(domain)); + if (domain != &si_domain->domain) + domain_exit(to_dmar_domain(domain)); } -static int intel_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) +/* + * Check whether a @domain could be attached to the @dev through the + * aux-domain attach/detach APIs. + */ +static inline bool +is_aux_domain(struct device *dev, struct iommu_domain *domain) { - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - struct intel_iommu *iommu; - int addr_width; - u8 bus, devfn; + struct device_domain_info *info = dev->archdata.iommu; - if (device_is_rmrr_locked(dev)) { - dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n"); - return -EPERM; - } + return info && info->auxd_enabled && + domain->type == IOMMU_DOMAIN_UNMANAGED; +} - /* normally dev is not mapped */ - if (unlikely(domain_context_mapped(dev))) { - struct dmar_domain *old_domain; +static void auxiliary_link_device(struct dmar_domain *domain, + struct device *dev) +{ + struct device_domain_info *info = dev->archdata.iommu; - old_domain = find_domain(dev); - if (old_domain) { - rcu_read_lock(); - dmar_remove_one_dev_info(old_domain, dev); - rcu_read_unlock(); + assert_spin_locked(&device_domain_lock); + if (WARN_ON(!info)) + return; + + domain->auxd_refcnt++; + list_add(&domain->auxd, &info->auxiliary_domains); +} + +static void auxiliary_unlink_device(struct dmar_domain *domain, + struct device *dev) +{ + struct device_domain_info *info = dev->archdata.iommu; + + assert_spin_locked(&device_domain_lock); + if (WARN_ON(!info)) + return; + + list_del(&domain->auxd); + domain->auxd_refcnt--; + + if (!domain->auxd_refcnt && domain->default_pasid > 0) + intel_pasid_free_id(domain->default_pasid); +} + +static int aux_domain_add_dev(struct dmar_domain *domain, + struct device *dev) +{ + int ret; + u8 bus, devfn; + unsigned long flags; + struct intel_iommu *iommu; + + iommu = device_to_iommu(dev, &bus, &devfn); + if (!iommu) + return -ENODEV; - if (!domain_type_is_vm_or_si(old_domain) && - list_empty(&old_domain->devices)) - domain_exit(old_domain); + if (domain->default_pasid <= 0) { + int pasid; + + pasid = intel_pasid_alloc_id(domain, PASID_MIN, + pci_max_pasids(to_pci_dev(dev)), + GFP_KERNEL); + if (pasid <= 0) { + pr_err("Can't allocate default pasid\n"); + return -ENODEV; } + domain->default_pasid = pasid; } + spin_lock_irqsave(&device_domain_lock, flags); + /* + * iommu->lock must be held to attach domain to iommu and setup the + * pasid entry for second level translation. + */ + spin_lock(&iommu->lock); + ret = domain_attach_iommu(domain, iommu); + if (ret) + goto attach_failed; + + /* Setup the PASID entry for mediated devices: */ + ret = intel_pasid_setup_second_level(iommu, domain, dev, + domain->default_pasid); + if (ret) + goto table_failed; + spin_unlock(&iommu->lock); + + auxiliary_link_device(domain, dev); + + spin_unlock_irqrestore(&device_domain_lock, flags); + + return 0; + +table_failed: + domain_detach_iommu(domain, iommu); +attach_failed: + spin_unlock(&iommu->lock); + spin_unlock_irqrestore(&device_domain_lock, flags); + if (!domain->auxd_refcnt && domain->default_pasid > 0) + intel_pasid_free_id(domain->default_pasid); + + return ret; +} + +static void aux_domain_remove_dev(struct dmar_domain *domain, + struct device *dev) +{ + struct device_domain_info *info; + struct intel_iommu *iommu; + unsigned long flags; + + if (!is_aux_domain(dev, &domain->domain)) + return; + + spin_lock_irqsave(&device_domain_lock, flags); + info = dev->archdata.iommu; + iommu = info->iommu; + + auxiliary_unlink_device(domain, dev); + + spin_lock(&iommu->lock); + intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid); + domain_detach_iommu(domain, iommu); + spin_unlock(&iommu->lock); + + spin_unlock_irqrestore(&device_domain_lock, flags); +} + +static int prepare_domain_attach_device(struct iommu_domain *domain, + struct device *dev) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct intel_iommu *iommu; + int addr_width; + u8 bus, devfn; + iommu = device_to_iommu(dev, &bus, &devfn); if (!iommu) return -ENODEV; @@ -5127,13 +5042,63 @@ static int intel_iommu_attach_device(struct iommu_domain *domain, dmar_domain->agaw--; } - return domain_add_dev_info(dmar_domain, dev); + return 0; +} + +static int intel_iommu_attach_device(struct iommu_domain *domain, + struct device *dev) +{ + int ret; + + if (device_is_rmrr_locked(dev)) { + dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n"); + return -EPERM; + } + + if (is_aux_domain(dev, domain)) + return -EPERM; + + /* normally dev is not mapped */ + if (unlikely(domain_context_mapped(dev))) { + struct dmar_domain *old_domain; + + old_domain = find_domain(dev); + if (old_domain) + dmar_remove_one_dev_info(dev); + } + + ret = prepare_domain_attach_device(domain, dev); + if (ret) + return ret; + + return domain_add_dev_info(to_dmar_domain(domain), dev); +} + +static int intel_iommu_aux_attach_device(struct iommu_domain *domain, + struct device *dev) +{ + int ret; + + if (!is_aux_domain(dev, domain)) + return -EPERM; + + ret = prepare_domain_attach_device(domain, dev); + if (ret) + return ret; + + return aux_domain_add_dev(to_dmar_domain(domain), dev); } static void intel_iommu_detach_device(struct iommu_domain *domain, struct device *dev) { - dmar_remove_one_dev_info(to_dmar_domain(domain), dev); + dmar_remove_one_dev_info(dev); +} + +static void intel_iommu_aux_detach_device(struct iommu_domain *domain, + struct device *dev) +{ + aux_domain_remove_dev(to_dmar_domain(domain), dev); } static int intel_iommu_map(struct iommu_domain *domain, @@ -5186,7 +5151,6 @@ static size_t intel_iommu_unmap(struct iommu_domain *domain, /* Cope with horrid API which requires us to unmap more than the size argument if it happens to be a large-page mapping. */ BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level)); - if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) size = VTD_PAGE_SIZE << level_to_offset_bits(level); @@ -5224,6 +5188,42 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, return phys; } +static inline bool scalable_mode_support(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + bool ret = true; + + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) { + if (!sm_supported(iommu)) { + ret = false; + break; + } + } + rcu_read_unlock(); + + return ret; +} + +static inline bool iommu_pasid_support(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + bool ret = true; + + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) { + if (!pasid_supported(iommu)) { + ret = false; + break; + } + } + rcu_read_unlock(); + + return ret; +} + static bool intel_iommu_capable(enum iommu_cap cap) { if (cap == IOMMU_CAP_CACHE_COHERENCY) @@ -5236,6 +5236,8 @@ static bool intel_iommu_capable(enum iommu_cap cap) static int intel_iommu_add_device(struct device *dev) { + struct dmar_domain *dmar_domain; + struct iommu_domain *domain; struct intel_iommu *iommu; struct iommu_group *group; u8 bus, devfn; @@ -5255,6 +5257,44 @@ static int intel_iommu_add_device(struct device *dev) } iommu_group_put(group); + + domain = iommu_get_domain_for_dev(dev); + dmar_domain = to_dmar_domain(domain); + if (domain->type == IOMMU_DOMAIN_DMA) { + if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) { + ret = iommu_request_dm_for_dev(dev); + if (ret) { + dmar_remove_one_dev_info(dev); + dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN; + domain_add_dev_info(si_domain, dev); + dev_info(dev, + "Device uses a private identity domain.\n"); + return 0; + } + + return -ENODEV; + } + } else { + if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) { + ret = iommu_request_dma_domain_for_dev(dev); + if (ret) { + dmar_remove_one_dev_info(dev); + dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN; + if (!get_private_domain_for_dev(dev)) { + dev_warn(dev, + "Failed to get a private domain.\n"); + return -ENOMEM; + } + + dev_info(dev, + "Device uses a private dma domain.\n"); + return 0; + } + + return -ENODEV; + } + } + return 0; unlink: @@ -5308,6 +5348,19 @@ static void intel_iommu_get_resv_regions(struct device *device, } up_read(&dmar_global_lock); +#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA + if (dev_is_pci(device)) { + struct pci_dev *pdev = to_pci_dev(device); + + if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { + reg = iommu_alloc_resv_region(0, 1UL << 24, 0, + IOMMU_RESV_DIRECT); + if (reg) + list_add_tail(®->list, head); + } + } +#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ + reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, 0, IOMMU_RESV_MSI); @@ -5325,21 +5378,7 @@ static void intel_iommu_put_resv_regions(struct device *dev, kfree(entry); } -#ifdef CONFIG_INTEL_IOMMU_SVM -#define MAX_NR_PASID_BITS (20) -static inline unsigned long intel_iommu_get_pts(struct device *dev) -{ - int pts, max_pasid; - - max_pasid = intel_pasid_get_dev_max_id(dev); - pts = find_first_bit((unsigned long *)&max_pasid, MAX_NR_PASID_BITS); - if (pts < 5) - return 0; - - return pts - 5; -} - -int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev) +int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev) { struct device_domain_info *info; struct context_entry *context; @@ -5348,7 +5387,7 @@ int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sd u64 ctx_lo; int ret; - domain = get_valid_domain_for_dev(sdev->dev); + domain = find_domain(dev); if (!domain) return -EINVAL; @@ -5356,7 +5395,7 @@ int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sd spin_lock(&iommu->lock); ret = -EINVAL; - info = sdev->dev->archdata.iommu; + info = dev->archdata.iommu; if (!info || !info->pasid_supported) goto out; @@ -5366,40 +5405,13 @@ int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sd ctx_lo = context[0].lo; - sdev->did = domain->iommu_did[iommu->seq_id]; - sdev->sid = PCI_DEVID(info->bus, info->devfn); - if (!(ctx_lo & CONTEXT_PASIDE)) { - if (iommu->pasid_state_table) - context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table); - context[1].lo = (u64)virt_to_phys(info->pasid_table->table) | - intel_iommu_get_pts(sdev->dev); - - wmb(); - /* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both - * extended to permit requests-with-PASID if the PASIDE bit - * is set. which makes sense. For CONTEXT_TT_PASS_THROUGH, - * however, the PASIDE bit is ignored and requests-with-PASID - * are unconditionally blocked. Which makes less sense. - * So convert from CONTEXT_TT_PASS_THROUGH to one of the new - * "guest mode" translation types depending on whether ATS - * is available or not. Annoyingly, we can't use the new - * modes *unless* PASIDE is set. */ - if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) { - ctx_lo &= ~CONTEXT_TT_MASK; - if (info->ats_supported) - ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2; - else - ctx_lo |= CONTEXT_TT_PT_PASID << 2; - } ctx_lo |= CONTEXT_PASIDE; - if (iommu->pasid_state_table) - ctx_lo |= CONTEXT_DINVE; - if (info->pri_supported) - ctx_lo |= CONTEXT_PRS; context[0].lo = ctx_lo; wmb(); - iommu->flush.flush_context(iommu, sdev->did, sdev->sid, + iommu->flush.flush_context(iommu, + domain->iommu_did[iommu->seq_id], + PCI_DEVID(info->bus, info->devfn), DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL); } @@ -5408,12 +5420,6 @@ int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sd if (!info->pasid_enabled) iommu_enable_dev_iotlb(info); - if (info->ats_enabled) { - sdev->dev_iotlb = 1; - sdev->qdep = info->ats_qdep; - if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS) - sdev->qdep = 0; - } ret = 0; out: @@ -5423,6 +5429,7 @@ int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sd return ret; } +#ifdef CONFIG_INTEL_IOMMU_SVM struct intel_iommu *intel_svm_device_to_iommu(struct device *dev) { struct intel_iommu *iommu; @@ -5444,12 +5451,142 @@ struct intel_iommu *intel_svm_device_to_iommu(struct device *dev) } #endif /* CONFIG_INTEL_IOMMU_SVM */ +static int intel_iommu_enable_auxd(struct device *dev) +{ + struct device_domain_info *info; + struct intel_iommu *iommu; + unsigned long flags; + u8 bus, devfn; + int ret; + + iommu = device_to_iommu(dev, &bus, &devfn); + if (!iommu || dmar_disabled) + return -EINVAL; + + if (!sm_supported(iommu) || !pasid_supported(iommu)) + return -EINVAL; + + ret = intel_iommu_enable_pasid(iommu, dev); + if (ret) + return -ENODEV; + + spin_lock_irqsave(&device_domain_lock, flags); + info = dev->archdata.iommu; + info->auxd_enabled = 1; + spin_unlock_irqrestore(&device_domain_lock, flags); + + return 0; +} + +static int intel_iommu_disable_auxd(struct device *dev) +{ + struct device_domain_info *info; + unsigned long flags; + + spin_lock_irqsave(&device_domain_lock, flags); + info = dev->archdata.iommu; + if (!WARN_ON(!info)) + info->auxd_enabled = 0; + spin_unlock_irqrestore(&device_domain_lock, flags); + + return 0; +} + +/* + * A PCI express designated vendor specific extended capability is defined + * in the section 3.7 of Intel scalable I/O virtualization technical spec + * for system software and tools to detect endpoint devices supporting the + * Intel scalable IO virtualization without host driver dependency. + * + * Returns the address of the matching extended capability structure within + * the device's PCI configuration space or 0 if the device does not support + * it. + */ +static int siov_find_pci_dvsec(struct pci_dev *pdev) +{ + int pos; + u16 vendor, id; + + pos = pci_find_next_ext_capability(pdev, 0, 0x23); + while (pos) { + pci_read_config_word(pdev, pos + 4, &vendor); + pci_read_config_word(pdev, pos + 8, &id); + if (vendor == PCI_VENDOR_ID_INTEL && id == 5) + return pos; + + pos = pci_find_next_ext_capability(pdev, pos, 0x23); + } + + return 0; +} + +static bool +intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat) +{ + if (feat == IOMMU_DEV_FEAT_AUX) { + int ret; + + if (!dev_is_pci(dev) || dmar_disabled || + !scalable_mode_support() || !iommu_pasid_support()) + return false; + + ret = pci_pasid_features(to_pci_dev(dev)); + if (ret < 0) + return false; + + return !!siov_find_pci_dvsec(to_pci_dev(dev)); + } + + return false; +} + +static int +intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) +{ + if (feat == IOMMU_DEV_FEAT_AUX) + return intel_iommu_enable_auxd(dev); + + return -ENODEV; +} + +static int +intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) +{ + if (feat == IOMMU_DEV_FEAT_AUX) + return intel_iommu_disable_auxd(dev); + + return -ENODEV; +} + +static bool +intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat) +{ + struct device_domain_info *info = dev->archdata.iommu; + + if (feat == IOMMU_DEV_FEAT_AUX) + return scalable_mode_support() && info && info->auxd_enabled; + + return false; +} + +static int +intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + + return dmar_domain->default_pasid > 0 ? + dmar_domain->default_pasid : -EINVAL; +} + const struct iommu_ops intel_iommu_ops = { .capable = intel_iommu_capable, .domain_alloc = intel_iommu_domain_alloc, .domain_free = intel_iommu_domain_free, .attach_dev = intel_iommu_attach_device, .detach_dev = intel_iommu_detach_device, + .aux_attach_dev = intel_iommu_aux_attach_device, + .aux_detach_dev = intel_iommu_aux_detach_device, + .aux_get_pasid = intel_iommu_aux_get_pasid, .map = intel_iommu_map, .unmap = intel_iommu_unmap, .iova_to_phys = intel_iommu_iova_to_phys, @@ -5458,6 +5595,10 @@ const struct iommu_ops intel_iommu_ops = { .get_resv_regions = intel_iommu_get_resv_regions, .put_resv_regions = intel_iommu_put_resv_regions, .device_group = pci_device_group, + .dev_has_feat = intel_iommu_dev_has_feat, + .dev_feat_enabled = intel_iommu_dev_feat_enabled, + .dev_enable_feat = intel_iommu_dev_enable_feat, + .dev_disable_feat = intel_iommu_dev_disable_feat, .pgsize_bitmap = INTEL_IOMMU_PGSIZES, }; diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index 1a82ee1c6e3d90bced9c86170ffc5159a169e2f4..a53a308efa20c5fd83a2bb592c4fc9e0cb0b37bf 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -9,6 +9,7 @@ #define pr_fmt(fmt) "DMAR: " fmt +#include #include #include #include @@ -123,12 +124,13 @@ int intel_pasid_alloc_table(struct device *dev) struct pasid_table *pasid_table; struct pasid_table_opaque data; struct page *pages; - size_t size, count; + int max_pasid = 0; int ret, order; + int size; + might_sleep(); info = dev->archdata.iommu; - if (WARN_ON(!info || !dev_is_pci(dev) || - !info->pasid_supported || info->pasid_table)) + if (WARN_ON(!info || !dev_is_pci(dev) || info->pasid_table)) return -EINVAL; /* DMA alias device already has a pasid table, use it: */ @@ -138,17 +140,19 @@ int intel_pasid_alloc_table(struct device *dev) if (ret) goto attach_out; - pasid_table = kzalloc(sizeof(*pasid_table), GFP_ATOMIC); + pasid_table = kzalloc(sizeof(*pasid_table), GFP_KERNEL); if (!pasid_table) return -ENOMEM; INIT_LIST_HEAD(&pasid_table->dev); - size = sizeof(struct pasid_entry); - count = min_t(int, pci_max_pasids(to_pci_dev(dev)), intel_pasid_max_id); - order = get_order(size * count); + if (info->pasid_supported) + max_pasid = min_t(int, pci_max_pasids(to_pci_dev(dev)), + intel_pasid_max_id); + + size = max_pasid >> (PASID_PDE_SHIFT - 3); + order = size ? get_order(size) : 0; pages = alloc_pages_node(info->iommu->node, - GFP_ATOMIC | __GFP_ZERO, - order); + GFP_KERNEL | __GFP_ZERO, order); if (!pages) { kfree(pasid_table); return -ENOMEM; @@ -156,7 +160,7 @@ int intel_pasid_alloc_table(struct device *dev) pasid_table->table = page_address(pages); pasid_table->order = order; - pasid_table->max_pasid = count; + pasid_table->max_pasid = 1 << (order + PAGE_SHIFT + 3); attach_out: device_attach_pasid_table(info, pasid_table); @@ -164,14 +168,33 @@ int intel_pasid_alloc_table(struct device *dev) return 0; } +/* Get PRESENT bit of a PASID directory entry. */ +static inline bool +pasid_pde_is_present(struct pasid_dir_entry *pde) +{ + return READ_ONCE(pde->val) & PASID_PTE_PRESENT; +} + +/* Get PASID table from a PASID directory entry. */ +static inline struct pasid_entry * +get_pasid_table_from_pde(struct pasid_dir_entry *pde) +{ + if (!pasid_pde_is_present(pde)) + return NULL; + + return phys_to_virt(READ_ONCE(pde->val) & PDE_PFN_MASK); +} + void intel_pasid_free_table(struct device *dev) { struct device_domain_info *info; struct pasid_table *pasid_table; + struct pasid_dir_entry *dir; + struct pasid_entry *table; + int i, max_pde; info = dev->archdata.iommu; - if (!info || !dev_is_pci(dev) || - !info->pasid_supported || !info->pasid_table) + if (!info || !dev_is_pci(dev) || !info->pasid_table) return; pasid_table = info->pasid_table; @@ -180,6 +203,14 @@ void intel_pasid_free_table(struct device *dev) if (!list_empty(&pasid_table->dev)) return; + /* Free scalable mode PASID directory tables: */ + dir = pasid_table->table; + max_pde = pasid_table->max_pasid >> PASID_PDE_SHIFT; + for (i = 0; i < max_pde; i++) { + table = get_pasid_table_from_pde(&dir[i]); + free_pgtable_page(table); + } + free_pages((unsigned long)pasid_table->table, pasid_table->order); kfree(pasid_table); } @@ -208,17 +239,37 @@ int intel_pasid_get_dev_max_id(struct device *dev) struct pasid_entry *intel_pasid_get_entry(struct device *dev, int pasid) { + struct device_domain_info *info; struct pasid_table *pasid_table; + struct pasid_dir_entry *dir; struct pasid_entry *entries; + int dir_index, index; pasid_table = intel_pasid_get_table(dev); if (WARN_ON(!pasid_table || pasid < 0 || pasid >= intel_pasid_get_dev_max_id(dev))) return NULL; - entries = pasid_table->table; + dir = pasid_table->table; + info = dev->archdata.iommu; + dir_index = pasid >> PASID_PDE_SHIFT; + index = pasid & PASID_PTE_MASK; + + spin_lock(&pasid_lock); + entries = get_pasid_table_from_pde(&dir[dir_index]); + if (!entries) { + entries = alloc_pgtable_page(info->iommu->node); + if (!entries) { + spin_unlock(&pasid_lock); + return NULL; + } + + WRITE_ONCE(dir[dir_index].val, + (u64)virt_to_phys(entries) | PASID_PTE_PRESENT); + } + spin_unlock(&pasid_lock); - return &entries[pasid]; + return &entries[index]; } /* @@ -226,10 +277,17 @@ struct pasid_entry *intel_pasid_get_entry(struct device *dev, int pasid) */ static inline void pasid_clear_entry(struct pasid_entry *pe) { - WRITE_ONCE(pe->val, 0); + WRITE_ONCE(pe->val[0], 0); + WRITE_ONCE(pe->val[1], 0); + WRITE_ONCE(pe->val[2], 0); + WRITE_ONCE(pe->val[3], 0); + WRITE_ONCE(pe->val[4], 0); + WRITE_ONCE(pe->val[5], 0); + WRITE_ONCE(pe->val[6], 0); + WRITE_ONCE(pe->val[7], 0); } -void intel_pasid_clear_entry(struct device *dev, int pasid) +static void intel_pasid_clear_entry(struct device *dev, int pasid) { struct pasid_entry *pe; @@ -239,3 +297,361 @@ void intel_pasid_clear_entry(struct device *dev, int pasid) pasid_clear_entry(pe); } + +static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits) +{ + u64 old; + + old = READ_ONCE(*ptr); + WRITE_ONCE(*ptr, (old & ~mask) | bits); +} + +/* + * Setup the DID(Domain Identifier) field (Bit 64~79) of scalable mode + * PASID entry. + */ +static inline void +pasid_set_domain_id(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[1], GENMASK_ULL(15, 0), value); +} + +/* + * Get domain ID value of a scalable mode PASID entry. + */ +static inline u16 +pasid_get_domain_id(struct pasid_entry *pe) +{ + return (u16)(READ_ONCE(pe->val[1]) & GENMASK_ULL(15, 0)); +} + +/* + * Setup the SLPTPTR(Second Level Page Table Pointer) field (Bit 12~63) + * of a scalable mode PASID entry. + */ +static inline void +pasid_set_slptr(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[0], VTD_PAGE_MASK, value); +} + +/* + * Setup the AW(Address Width) field (Bit 2~4) of a scalable mode PASID + * entry. + */ +static inline void +pasid_set_address_width(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[0], GENMASK_ULL(4, 2), value << 2); +} + +/* + * Setup the PGTT(PASID Granular Translation Type) field (Bit 6~8) + * of a scalable mode PASID entry. + */ +static inline void +pasid_set_translation_type(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[0], GENMASK_ULL(8, 6), value << 6); +} + +/* + * Enable fault processing by clearing the FPD(Fault Processing + * Disable) field (Bit 1) of a scalable mode PASID entry. + */ +static inline void pasid_set_fault_enable(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[0], 1 << 1, 0); +} + +/* + * Setup the SRE(Supervisor Request Enable) field (Bit 128) of a + * scalable mode PASID entry. + */ +static inline void pasid_set_sre(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 0, 1); +} + +/* + * Setup the P(Present) field (Bit 0) of a scalable mode PASID + * entry. + */ +static inline void pasid_set_present(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[0], 1 << 0, 1); +} + +/* + * Setup Page Walk Snoop bit (Bit 87) of a scalable mode PASID + * entry. + */ +static inline void pasid_set_page_snoop(struct pasid_entry *pe, bool value) +{ + pasid_set_bits(&pe->val[1], 1 << 23, value); +} + +/* + * Setup the First Level Page table Pointer field (Bit 140~191) + * of a scalable mode PASID entry. + */ +static inline void +pasid_set_flptr(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[2], VTD_PAGE_MASK, value); +} + +/* + * Setup the First Level Paging Mode field (Bit 130~131) of a + * scalable mode PASID entry. + */ +static inline void +pasid_set_flpm(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2); +} + +static void +pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu, + u16 did, int pasid) +{ + struct qi_desc desc; + + desc.qw0 = QI_PC_DID(did) | QI_PC_PASID_SEL | QI_PC_PASID(pasid); + desc.qw1 = 0; + desc.qw2 = 0; + desc.qw3 = 0; + + qi_submit_sync(&desc, iommu); +} + +static void +iotlb_invalidation_with_pasid(struct intel_iommu *iommu, u16 did, u32 pasid) +{ + struct qi_desc desc; + + desc.qw0 = QI_EIOTLB_PASID(pasid) | QI_EIOTLB_DID(did) | + QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | QI_EIOTLB_TYPE; + desc.qw1 = 0; + desc.qw2 = 0; + desc.qw3 = 0; + + qi_submit_sync(&desc, iommu); +} + +static void +devtlb_invalidation_with_pasid(struct intel_iommu *iommu, + struct device *dev, int pasid) +{ + struct device_domain_info *info; + u16 sid, qdep, pfsid; + + info = dev->archdata.iommu; + if (!info || !info->ats_enabled) + return; + + sid = info->bus << 8 | info->devfn; + qdep = info->ats_qdep; + pfsid = info->pfsid; + + qi_flush_dev_iotlb(iommu, sid, pfsid, qdep, 0, 64 - VTD_PAGE_SHIFT); +} + +void intel_pasid_tear_down_entry(struct intel_iommu *iommu, + struct device *dev, int pasid) +{ + struct pasid_entry *pte; + u16 did; + + pte = intel_pasid_get_entry(dev, pasid); + if (WARN_ON(!pte)) + return; + + intel_pasid_clear_entry(dev, pasid); + did = pasid_get_domain_id(pte); + + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(pte, sizeof(*pte)); + + pasid_cache_invalidation_with_pasid(iommu, did, pasid); + iotlb_invalidation_with_pasid(iommu, did, pasid); + + /* Device IOTLB doesn't need to be flushed in caching mode. */ + if (!cap_caching_mode(iommu->cap)) + devtlb_invalidation_with_pasid(iommu, dev, pasid); +} + +/* + * Set up the scalable mode pasid table entry for first only + * translation type. + */ +int intel_pasid_setup_first_level(struct intel_iommu *iommu, + struct device *dev, pgd_t *pgd, + int pasid, u16 did, int flags) +{ + struct pasid_entry *pte; + + if (!ecap_flts(iommu->ecap)) { + pr_err("No first level translation support on %s\n", + iommu->name); + return -EINVAL; + } + + pte = intel_pasid_get_entry(dev, pasid); + if (WARN_ON(!pte)) + return -EINVAL; + + pasid_clear_entry(pte); + + /* Setup the first level page table pointer: */ + pasid_set_flptr(pte, (u64)__pa(pgd)); + if (flags & PASID_FLAG_SUPERVISOR_MODE) { + if (!ecap_srs(iommu->ecap)) { + pr_err("No supervisor request support on %s\n", + iommu->name); + return -EINVAL; + } + pasid_set_sre(pte); + } + +#ifdef CONFIG_X86 + if (cpu_feature_enabled(X86_FEATURE_LA57)) + pasid_set_flpm(pte, 1); +#endif /* CONFIG_X86 */ + + pasid_set_domain_id(pte, did); + pasid_set_address_width(pte, iommu->agaw); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + + /* Setup Present and PASID Granular Transfer Type: */ + pasid_set_translation_type(pte, 1); + pasid_set_present(pte); + + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(pte, sizeof(*pte)); + + if (cap_caching_mode(iommu->cap)) { + pasid_cache_invalidation_with_pasid(iommu, did, pasid); + iotlb_invalidation_with_pasid(iommu, did, pasid); + } else { + iommu_flush_write_buffer(iommu); + } + + return 0; +} + +/* + * Set up the scalable mode pasid entry for second only translation type. + */ +int intel_pasid_setup_second_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, int pasid) +{ + struct pasid_entry *pte; + struct dma_pte *pgd; + u64 pgd_val; + int agaw; + u16 did; + + /* + * If hardware advertises no support for second level + * translation, return directly. + */ + if (!ecap_slts(iommu->ecap)) { + pr_err("No second level translation support on %s\n", + iommu->name); + return -EINVAL; + } + + /* + * Skip top levels of page tables for iommu which has less agaw + * than default. Unnecessary for PT mode. + */ + pgd = domain->pgd; + for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { + pgd = phys_to_virt(dma_pte_addr(pgd)); + if (!dma_pte_present(pgd)) { + dev_err(dev, "Invalid domain page table\n"); + return -EINVAL; + } + } + + pgd_val = virt_to_phys(pgd); + did = domain->iommu_did[iommu->seq_id]; + + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + dev_err(dev, "Failed to get pasid entry of PASID %d\n", pasid); + return -ENODEV; + } + + pasid_clear_entry(pte); + pasid_set_domain_id(pte, did); + pasid_set_slptr(pte, pgd_val); + pasid_set_address_width(pte, agaw); + pasid_set_translation_type(pte, 2); + pasid_set_fault_enable(pte); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + + /* + * Since it is a second level only translation setup, we should + * set SRE bit as well (addresses are expected to be GPAs). + */ + pasid_set_sre(pte); + pasid_set_present(pte); + + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(pte, sizeof(*pte)); + + if (cap_caching_mode(iommu->cap)) { + pasid_cache_invalidation_with_pasid(iommu, did, pasid); + iotlb_invalidation_with_pasid(iommu, did, pasid); + } else { + iommu_flush_write_buffer(iommu); + } + + return 0; +} + +/* + * Set up the scalable mode pasid entry for passthrough translation type. + */ +int intel_pasid_setup_pass_through(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, int pasid) +{ + u16 did = FLPT_DEFAULT_DID; + struct pasid_entry *pte; + + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + dev_err(dev, "Failed to get pasid entry of PASID %d\n", pasid); + return -ENODEV; + } + + pasid_clear_entry(pte); + pasid_set_domain_id(pte, did); + pasid_set_address_width(pte, iommu->agaw); + pasid_set_translation_type(pte, 4); + pasid_set_fault_enable(pte); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + + /* + * We should set SRE bit as well since the addresses are expected + * to be GPAs. + */ + pasid_set_sre(pte); + pasid_set_present(pte); + + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(pte, sizeof(*pte)); + + if (cap_caching_mode(iommu->cap)) { + pasid_cache_invalidation_with_pasid(iommu, did, pasid); + iotlb_invalidation_with_pasid(iommu, did, pasid); + } else { + iommu_flush_write_buffer(iommu); + } + + return 0; +} diff --git a/drivers/iommu/intel-pasid.h b/drivers/iommu/intel-pasid.h index 1fb5e12b029ac717379ec74544c83254ea0c0259..91e81280e2d7788b89a2dff9e6ca6b68144bc9cf 100644 --- a/drivers/iommu/intel-pasid.h +++ b/drivers/iommu/intel-pasid.h @@ -11,12 +11,34 @@ #define __INTEL_PASID_H #define PASID_MIN 0x1 -#define PASID_MAX 0x20000 +#define PASID_MAX 0x100000 +#define PASID_PTE_MASK 0x3F +#define PASID_PTE_PRESENT 1 +#define PDE_PFN_MASK PAGE_MASK +#define PASID_PDE_SHIFT 6 -struct pasid_entry { +/* + * The SUPERVISOR_MODE flag indicates a first level translation which + * can be used for access to kernel addresses. It is valid only for + * access to the kernel's static 1:1 mapping of physical memory — not + * to vmalloc or even module mappings. + */ +#define PASID_FLAG_SUPERVISOR_MODE BIT(0) + +/* + * Domain ID reserved for pasid entries programmed for first-level + * only and pass-through transfer modes. + */ +#define FLPT_DEFAULT_DID 1 + +struct pasid_dir_entry { u64 val; }; +struct pasid_entry { + u64 val[8]; +}; + /* The representative of a PASID table */ struct pasid_table { void *table; /* pasid table pointer */ @@ -34,6 +56,16 @@ void intel_pasid_free_table(struct device *dev); struct pasid_table *intel_pasid_get_table(struct device *dev); int intel_pasid_get_dev_max_id(struct device *dev); struct pasid_entry *intel_pasid_get_entry(struct device *dev, int pasid); -void intel_pasid_clear_entry(struct device *dev, int pasid); +int intel_pasid_setup_first_level(struct intel_iommu *iommu, + struct device *dev, pgd_t *pgd, + int pasid, u16 did, int flags); +int intel_pasid_setup_second_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, int pasid); +int intel_pasid_setup_pass_through(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, int pasid); +void intel_pasid_tear_down_entry(struct intel_iommu *iommu, + struct device *dev, int pasid); #endif /* __INTEL_PASID_H */ diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 5d284c00dd27ba394d4fb61f3f0625b63042bcb8..aff6538a2eb8bcfcc2d15c4477a57e227b295a70 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -29,10 +29,6 @@ #include "intel-pasid.h" -#define PASID_ENTRY_P BIT_ULL(0) -#define PASID_ENTRY_FLPM_5LP BIT_ULL(9) -#define PASID_ENTRY_SRE BIT_ULL(11) - static irqreturn_t prq_event_thread(int irq, void *d); struct pasid_state_entry { @@ -65,8 +61,6 @@ int intel_svm_init(struct intel_iommu *iommu) order = get_order(sizeof(struct pasid_entry) * iommu->pasid_max); if (ecap_dis(iommu->ecap)) { - /* Just making it explicit... */ - BUILD_BUG_ON(sizeof(struct pasid_entry) != sizeof(struct pasid_state_entry)); pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order); if (pages) iommu->pasid_state_table = page_address(pages); @@ -163,27 +157,40 @@ static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_d * because that's the only option the hardware gives us. Despite * the fact that they are actually only accessible through one. */ if (gl) - desc.low = QI_EIOTLB_PASID(svm->pasid) | QI_EIOTLB_DID(sdev->did) | - QI_EIOTLB_GRAN(QI_GRAN_ALL_ALL) | QI_EIOTLB_TYPE; + desc.qw0 = QI_EIOTLB_PASID(svm->pasid) | + QI_EIOTLB_DID(sdev->did) | + QI_EIOTLB_GRAN(QI_GRAN_ALL_ALL) | + QI_EIOTLB_TYPE; else - desc.low = QI_EIOTLB_PASID(svm->pasid) | QI_EIOTLB_DID(sdev->did) | - QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | QI_EIOTLB_TYPE; - desc.high = 0; + desc.qw0 = QI_EIOTLB_PASID(svm->pasid) | + QI_EIOTLB_DID(sdev->did) | + QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | + QI_EIOTLB_TYPE; + desc.qw1 = 0; } else { int mask = ilog2(__roundup_pow_of_two(pages)); - desc.low = QI_EIOTLB_PASID(svm->pasid) | QI_EIOTLB_DID(sdev->did) | - QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) | QI_EIOTLB_TYPE; - desc.high = QI_EIOTLB_ADDR(address) | QI_EIOTLB_GL(gl) | - QI_EIOTLB_IH(ih) | QI_EIOTLB_AM(mask); + desc.qw0 = QI_EIOTLB_PASID(svm->pasid) | + QI_EIOTLB_DID(sdev->did) | + QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) | + QI_EIOTLB_TYPE; + desc.qw1 = QI_EIOTLB_ADDR(address) | + QI_EIOTLB_GL(gl) | + QI_EIOTLB_IH(ih) | + QI_EIOTLB_AM(mask); } + desc.qw2 = 0; + desc.qw3 = 0; qi_submit_sync(&desc, svm->iommu); if (sdev->dev_iotlb) { - desc.low = QI_DEV_EIOTLB_PASID(svm->pasid) | QI_DEV_EIOTLB_SID(sdev->sid) | - QI_DEV_EIOTLB_QDEP(sdev->qdep) | QI_DEIOTLB_TYPE; + desc.qw0 = QI_DEV_EIOTLB_PASID(svm->pasid) | + QI_DEV_EIOTLB_SID(sdev->sid) | + QI_DEV_EIOTLB_QDEP(sdev->qdep) | + QI_DEIOTLB_TYPE; if (pages == -1) { - desc.high = QI_DEV_EIOTLB_ADDR(-1ULL >> 1) | QI_DEV_EIOTLB_SIZE; + desc.qw1 = QI_DEV_EIOTLB_ADDR(-1ULL >> 1) | + QI_DEV_EIOTLB_SIZE; } else if (pages > 1) { /* The least significant zero bit indicates the size. So, * for example, an "address" value of 0x12345f000 will @@ -191,10 +198,13 @@ static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_d unsigned long last = address + ((unsigned long)(pages - 1) << VTD_PAGE_SHIFT); unsigned long mask = __rounddown_pow_of_two(address ^ last); - desc.high = QI_DEV_EIOTLB_ADDR((address & ~mask) | (mask - 1)) | QI_DEV_EIOTLB_SIZE; + desc.qw1 = QI_DEV_EIOTLB_ADDR((address & ~mask) | + (mask - 1)) | QI_DEV_EIOTLB_SIZE; } else { - desc.high = QI_DEV_EIOTLB_ADDR(address); + desc.qw1 = QI_DEV_EIOTLB_ADDR(address); } + desc.qw2 = 0; + desc.qw3 = 0; qi_submit_sync(&desc, svm->iommu); } } @@ -234,17 +244,6 @@ static void intel_invalidate_range(struct mmu_notifier *mn, (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0, 0); } - -static void intel_flush_pasid_dev(struct intel_svm *svm, struct intel_svm_dev *sdev, int pasid) -{ - struct qi_desc desc; - - desc.high = 0; - desc.low = QI_PC_TYPE | QI_PC_DID(sdev->did) | QI_PC_PASID_SEL | QI_PC_PASID(pasid); - - qi_submit_sync(&desc, svm->iommu); -} - static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct intel_svm *svm = container_of(mn, struct intel_svm, notifier); @@ -264,8 +263,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) */ rcu_read_lock(); list_for_each_entry_rcu(sdev, &svm->devs, list) { - intel_pasid_clear_entry(sdev->dev, svm->pasid); - intel_flush_pasid_dev(svm, sdev, svm->pasid); + intel_pasid_tear_down_entry(svm->iommu, sdev->dev, svm->pasid); intel_flush_svm_range_dev(svm, sdev, 0, -1, 0, !svm->mm); } rcu_read_unlock(); @@ -285,11 +283,10 @@ static LIST_HEAD(global_svm_list); int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ops *ops) { struct intel_iommu *iommu = intel_svm_device_to_iommu(dev); - struct pasid_entry *entry; + struct device_domain_info *info; struct intel_svm_dev *sdev; struct intel_svm *svm = NULL; struct mm_struct *mm = NULL; - u64 pasid_entry_val; int pasid_max; int ret; @@ -350,13 +347,29 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ } sdev->dev = dev; - ret = intel_iommu_enable_pasid(iommu, sdev); + ret = intel_iommu_enable_pasid(iommu, dev); if (ret || !pasid) { /* If they don't actually want to assign a PASID, this is * just an enabling check/preparation. */ kfree(sdev); goto out; } + + info = dev->archdata.iommu; + if (!info || !info->pasid_supported) { + kfree(sdev); + goto out; + } + + sdev->did = FLPT_DEFAULT_DID; + sdev->sid = PCI_DEVID(info->bus, info->devfn); + if (info->ats_enabled) { + sdev->dev_iotlb = 1; + sdev->qdep = info->ats_qdep; + if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS) + sdev->qdep = 0; + } + /* Finish the setup now we know we're keeping it */ sdev->users = 1; sdev->ops = ops; @@ -398,25 +411,22 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ kfree(sdev); goto out; } - pasid_entry_val = (u64)__pa(mm->pgd) | PASID_ENTRY_P; - } else - pasid_entry_val = (u64)__pa(init_mm.pgd) | - PASID_ENTRY_P | PASID_ENTRY_SRE; - if (cpu_feature_enabled(X86_FEATURE_LA57)) - pasid_entry_val |= PASID_ENTRY_FLPM_5LP; - - entry = intel_pasid_get_entry(dev, svm->pasid); - entry->val = pasid_entry_val; - - wmb(); - - /* - * Flush PASID cache when a PASID table entry becomes - * present. - */ - if (cap_caching_mode(iommu->cap)) - intel_flush_pasid_dev(svm, sdev, svm->pasid); + } + spin_lock(&iommu->lock); + ret = intel_pasid_setup_first_level(iommu, dev, + mm ? mm->pgd : init_mm.pgd, + svm->pasid, FLPT_DEFAULT_DID, + mm ? 0 : PASID_FLAG_SUPERVISOR_MODE); + spin_unlock(&iommu->lock); + if (ret) { + if (mm) + mmu_notifier_unregister(&svm->notifier, mm); + intel_pasid_free_id(svm->pasid); + kfree(svm); + kfree(sdev); + goto out; + } list_add_tail(&svm->list, &global_svm_list); } list_add_rcu(&sdev->list, &svm->devs); @@ -461,10 +471,9 @@ int intel_svm_unbind_mm(struct device *dev, int pasid) * to use. We have a *shared* PASID table, because it's * large and has to be physically contiguous. So it's * hard to be as defensive as we might like. */ - intel_flush_pasid_dev(svm, sdev, svm->pasid); + intel_pasid_tear_down_entry(iommu, dev, svm->pasid); intel_flush_svm_range_dev(svm, sdev, 0, -1, 0, !svm->mm); kfree_rcu(sdev, rcu); - intel_pasid_clear_entry(dev, svm->pasid); if (list_empty(&svm->devs)) { intel_pasid_free_id(svm->pasid); @@ -673,24 +682,27 @@ static irqreturn_t prq_event_thread(int irq, void *d) no_pasid: if (req->lpig) { /* Page Group Response */ - resp.low = QI_PGRP_PASID(req->pasid) | + resp.qw0 = QI_PGRP_PASID(req->pasid) | QI_PGRP_DID((req->bus << 8) | req->devfn) | QI_PGRP_PASID_P(req->pasid_present) | QI_PGRP_RESP_TYPE; - resp.high = QI_PGRP_IDX(req->prg_index) | - QI_PGRP_PRIV(req->private) | QI_PGRP_RESP_CODE(result); - - qi_submit_sync(&resp, iommu); + resp.qw1 = QI_PGRP_IDX(req->prg_index) | + QI_PGRP_PRIV(req->private) | + QI_PGRP_RESP_CODE(result); } else if (req->srr) { /* Page Stream Response */ - resp.low = QI_PSTRM_IDX(req->prg_index) | - QI_PSTRM_PRIV(req->private) | QI_PSTRM_BUS(req->bus) | - QI_PSTRM_PASID(req->pasid) | QI_PSTRM_RESP_TYPE; - resp.high = QI_PSTRM_ADDR(address) | QI_PSTRM_DEVFN(req->devfn) | + resp.qw0 = QI_PSTRM_IDX(req->prg_index) | + QI_PSTRM_PRIV(req->private) | + QI_PSTRM_BUS(req->bus) | + QI_PSTRM_PASID(req->pasid) | + QI_PSTRM_RESP_TYPE; + resp.qw1 = QI_PSTRM_ADDR(address) | + QI_PSTRM_DEVFN(req->devfn) | QI_PSTRM_RESP_CODE(result); - - qi_submit_sync(&resp, iommu); } + resp.qw2 = 0; + resp.qw3 = 0; + qi_submit_sync(&resp, iommu); head = (head + sizeof(*req)) & PRQ_RING_MASK; } diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c index 967450bd421a171c97adec77faf50c92d46a5c67..916391f33ca675f401dc7123b642d336d47b3035 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel_irq_remapping.c @@ -145,9 +145,11 @@ static int qi_flush_iec(struct intel_iommu *iommu, int index, int mask) { struct qi_desc desc; - desc.low = QI_IEC_IIDEX(index) | QI_IEC_TYPE | QI_IEC_IM(mask) + desc.qw0 = QI_IEC_IIDEX(index) | QI_IEC_TYPE | QI_IEC_IM(mask) | QI_IEC_SELECTIVE; - desc.high = 0; + desc.qw1 = 0; + desc.qw2 = 0; + desc.qw3 = 0; return qi_submit_sync(&desc, iommu); } diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 84a59762b037ab30b383b8ec48de9fa17d4d75e1..de743904dec35f6439985610a832ef71306dca67 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -43,6 +43,8 @@ static unsigned int iommu_def_domain_type = IOMMU_DOMAIN_DMA; #endif static bool iommu_dma_strict __read_mostly; +static u32 iommu_cmd_line __read_mostly; + struct iommu_callback_data { const struct iommu_ops *ops; }; @@ -84,6 +86,18 @@ static const char * const iommu_group_resv_type_string[] = { [IOMMU_RESV_SW_MSI] = "msi", }; +#define IOMMU_CMD_LINE_DMA_API BIT(0) + +static void iommu_set_cmd_line_dma_api(void) +{ + iommu_cmd_line |= IOMMU_CMD_LINE_DMA_API; +} + +static bool __maybe_unused iommu_cmd_line_dma_api(void) +{ + return !!(iommu_cmd_line & IOMMU_CMD_LINE_DMA_API); +} + #define IOMMU_GROUP_ATTR(_name, _mode, _show, _store) \ struct iommu_group_attribute iommu_group_attr_##_name = \ __ATTR(_name, _mode, _show, _store) @@ -130,6 +144,8 @@ static int __init iommu_set_def_domain_type(char *str) if (ret) return ret; + iommu_set_cmd_line_dma_api(); + iommu_def_domain_type = pt ? IOMMU_DOMAIN_IDENTITY : IOMMU_DOMAIN_DMA; return 0; } @@ -2235,10 +2251,10 @@ struct iommu_resv_region *iommu_alloc_resv_region(phys_addr_t start, return region; } -/* Request that a device is direct mapped by the IOMMU */ -int iommu_request_dm_for_dev(struct device *dev) +static int +request_default_domain_for_dev(struct device *dev, unsigned long type) { - struct iommu_domain *dm_domain; + struct iommu_domain *domain; struct iommu_group *group; int ret; @@ -2251,8 +2267,7 @@ int iommu_request_dm_for_dev(struct device *dev) /* Check if the default domain is already direct mapped */ ret = 0; - if (group->default_domain && - group->default_domain->type == IOMMU_DOMAIN_IDENTITY) + if (group->default_domain && group->default_domain->type == type) goto out; /* Don't change mappings of existing devices */ @@ -2262,23 +2277,26 @@ int iommu_request_dm_for_dev(struct device *dev) /* Allocate a direct mapped domain */ ret = -ENOMEM; - dm_domain = __iommu_domain_alloc(dev->bus, IOMMU_DOMAIN_IDENTITY); - if (!dm_domain) + domain = __iommu_domain_alloc(dev->bus, type); + if (!domain) goto out; /* Attach the device to the domain */ - ret = __iommu_attach_group(dm_domain, group); + ret = __iommu_attach_group(domain, group); if (ret) { - iommu_domain_free(dm_domain); + iommu_domain_free(domain); goto out; } /* Make the direct mapped domain the default for this group */ if (group->default_domain) iommu_domain_free(group->default_domain); - group->default_domain = dm_domain; + group->default_domain = domain; + + iommu_group_create_direct_mappings(group, dev); - pr_info("Using direct mapping for device %s\n", dev_name(dev)); + dev_info(dev, "Using iommu %s mapping\n", + type == IOMMU_DOMAIN_DMA ? "dma" : "direct"); ret = 0; out: @@ -2289,6 +2307,40 @@ int iommu_request_dm_for_dev(struct device *dev) } EXPORT_SYMBOL_GPL(iommu_request_dm_for_dev); +/* Request that a device is direct mapped by the IOMMU */ +int iommu_request_dm_for_dev(struct device *dev) +{ + return request_default_domain_for_dev(dev, IOMMU_DOMAIN_IDENTITY); +} + +/* Request that a device can't be direct mapped by the IOMMU */ +int iommu_request_dma_domain_for_dev(struct device *dev) +{ + return request_default_domain_for_dev(dev, IOMMU_DOMAIN_DMA); +} + +void iommu_set_default_passthrough(bool cmd_line) +{ + if (cmd_line) + iommu_set_cmd_line_dma_api(); + + iommu_def_domain_type = IOMMU_DOMAIN_IDENTITY; +} + +void iommu_set_default_translated(bool cmd_line) +{ + if (cmd_line) + iommu_set_cmd_line_dma_api(); + + iommu_def_domain_type = IOMMU_DOMAIN_DMA; +} + +bool iommu_default_passthrough(void) +{ + return iommu_def_domain_type == IOMMU_DOMAIN_IDENTITY; +} +EXPORT_SYMBOL_GPL(iommu_default_passthrough); + const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode) { const struct iommu_ops *ops = NULL; @@ -2362,6 +2414,102 @@ int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids) } EXPORT_SYMBOL_GPL(iommu_fwspec_add_ids); +/* + * Per device IOMMU features. + */ +bool iommu_dev_has_feature(struct device *dev, enum iommu_dev_features feat) +{ + const struct iommu_ops *ops = dev->bus->iommu_ops; + + if (ops && ops->dev_has_feat) + return ops->dev_has_feat(dev, feat); + + return false; +} +EXPORT_SYMBOL_GPL(iommu_dev_has_feature); + +int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features feat) +{ + const struct iommu_ops *ops = dev->bus->iommu_ops; + + if (ops && ops->dev_enable_feat) + return ops->dev_enable_feat(dev, feat); + + return -ENODEV; +} +EXPORT_SYMBOL_GPL(iommu_dev_enable_feature); + +/* + * The device drivers should do the necessary cleanups before calling this. + * For example, before disabling the aux-domain feature, the device driver + * should detach all aux-domains. Otherwise, this will return -EBUSY. + */ +int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat) +{ + const struct iommu_ops *ops = dev->bus->iommu_ops; + + if (ops && ops->dev_disable_feat) + return ops->dev_disable_feat(dev, feat); + + return -EBUSY; +} +EXPORT_SYMBOL_GPL(iommu_dev_disable_feature); + +bool iommu_dev_feature_enabled(struct device *dev, enum iommu_dev_features feat) +{ + const struct iommu_ops *ops = dev->bus->iommu_ops; + + if (ops && ops->dev_feat_enabled) + return ops->dev_feat_enabled(dev, feat); + + return false; +} +EXPORT_SYMBOL_GPL(iommu_dev_feature_enabled); + +/* + * Aux-domain specific attach/detach. + * + * Only works if iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX) returns + * true. Also, as long as domains are attached to a device through this + * interface, any tries to call iommu_attach_device() should fail + * (iommu_detach_device() can't fail, so we fail when trying to re-attach). + * This should make us safe against a device being attached to a guest as a + * whole while there are still pasid users on it (aux and sva). + */ +int iommu_aux_attach_device(struct iommu_domain *domain, struct device *dev) +{ + int ret = -ENODEV; + + if (domain->ops->aux_attach_dev) + ret = domain->ops->aux_attach_dev(domain, dev); + + if (!ret) + trace_attach_device_to_domain(dev); + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_aux_attach_device); + +void iommu_aux_detach_device(struct iommu_domain *domain, struct device *dev) +{ + if (domain->ops->aux_detach_dev) { + domain->ops->aux_detach_dev(domain, dev); + trace_detach_device_from_domain(dev); + } +} +EXPORT_SYMBOL_GPL(iommu_aux_detach_device); + +int iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev) +{ + int ret = -ENODEV; + + if (domain->ops->aux_get_pasid) + ret = domain->ops->aux_get_pasid(domain, dev); + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_aux_get_pasid); + /** * iommu_sva_bind_device() - Bind a process address space to a device * @dev: the device diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index e052f62fdea7e8151cf4fd2b3d650805161da341..26de2a690cf1c81aa34f30bd3453b58646fe2ca9 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -266,13 +266,14 @@ EXPORT_SYMBOL(mdev_unregister_device); static void mdev_device_release(struct device *dev) { struct mdev_device *mdev = to_mdev_device(dev); + struct mdev_device_wapper *mdw = container_of(mdev, struct mdev_device_wapper, mdev); mutex_lock(&mdev_list_lock); list_del(&mdev->next); mutex_unlock(&mdev_list_lock); dev_dbg(&mdev->dev, "MDEV: destroying\n"); - kfree(mdev); + kfree(mdw); } int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid) @@ -281,6 +282,7 @@ int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid) struct mdev_device *mdev, *tmp; struct mdev_parent *parent; struct mdev_type *type = to_mdev_type(kobj); + struct mdev_device_wapper *mdw; parent = mdev_get_parent(type->parent); if (!parent) @@ -297,12 +299,13 @@ int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid) } } - mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); - if (!mdev) { + mdw = kzalloc(sizeof(*mdw), GFP_KERNEL); + if (!mdw) { mutex_unlock(&mdev_list_lock); ret = -ENOMEM; goto mdev_fail; } + mdev = &mdw->mdev; memcpy(&mdev->uuid, &uuid, sizeof(uuid_le)); list_add(&mdev->next, &mdev_list); @@ -389,6 +392,26 @@ int mdev_device_remove(struct device *dev, bool force_remove) return 0; } +int mdev_set_iommu_device(struct device *dev, struct device *iommu_device) +{ + struct mdev_device *mdev = to_mdev_device(dev); + struct mdev_device_wapper *mdw = container_of(mdev, struct mdev_device_wapper, mdev); + + mdw->iommu_device = iommu_device; + + return 0; +} +EXPORT_SYMBOL(mdev_set_iommu_device); + +struct device *mdev_get_iommu_device(struct device *dev) +{ + struct mdev_device *mdev = to_mdev_device(dev); + struct mdev_device_wapper *mdw = container_of(mdev, struct mdev_device_wapper, mdev); + + return mdw->iommu_device; +} +EXPORT_SYMBOL(mdev_get_iommu_device); + static int __init mdev_init(void) { return mdev_bus_register(); diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h index b5819b7d7ef7016f2467f07b57495be54ce33940..cf2c25ed9800771fad77b2981b731fc9a731db4a 100644 --- a/drivers/vfio/mdev/mdev_private.h +++ b/drivers/vfio/mdev/mdev_private.h @@ -36,6 +36,11 @@ struct mdev_device { bool active; }; +struct mdev_device_wapper { + struct mdev_device mdev; + struct device *iommu_device; +}; + #define to_mdev_device(dev) container_of(dev, struct mdev_device, dev) #define dev_is_mdev(d) ((d)->bus == &mdev_bus_type) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 5a106963dd088a7a707540fcac8fbc8983561154..c2c05098b256392f46ca006f9650de68d36b4c2e 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -101,6 +101,7 @@ struct vfio_group { struct iommu_group *iommu_group; struct list_head next; bool sva_enabled; + bool mdev_group; /* An mdev group */ }; struct vfio_mm { @@ -598,7 +599,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, mutex_lock(&iommu->lock); /* Fail if notifier list is empty */ - if ((!iommu->external_domain) || (!iommu->notifier.head)) { + if (!iommu->notifier.head) { ret = -EINVAL; goto pin_done; } @@ -681,11 +682,6 @@ static int vfio_iommu_type1_unpin_pages(void *iommu_data, mutex_lock(&iommu->lock); - if (!iommu->external_domain) { - mutex_unlock(&iommu->lock); - return -EINVAL; - } - do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu); for (i = 0; i < npage; i++) { struct vfio_dma *dma; @@ -1637,6 +1633,102 @@ static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base) return ret; } +static struct device *vfio_mdev_get_iommu_device(struct device *dev) +{ + struct device *(*fn)(struct device *dev); + struct device *iommu_device; + + fn = symbol_get(mdev_get_iommu_device); + if (fn) { + iommu_device = fn(dev); + symbol_put(mdev_get_iommu_device); + + return iommu_device; + } + + return NULL; +} + +static int vfio_mdev_attach_domain(struct device *dev, void *data) +{ + struct iommu_domain *domain = data; + struct device *iommu_device; + + iommu_device = vfio_mdev_get_iommu_device(dev); + if (iommu_device) { + if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX)) + return iommu_aux_attach_device(domain, iommu_device); + else + return iommu_attach_device(domain, iommu_device); + } + + return -EINVAL; +} + +static int vfio_mdev_detach_domain(struct device *dev, void *data) +{ + struct iommu_domain *domain = data; + struct device *iommu_device; + + iommu_device = vfio_mdev_get_iommu_device(dev); + if (iommu_device) { + if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX)) + iommu_aux_detach_device(domain, iommu_device); + else + iommu_detach_device(domain, iommu_device); + } + + return 0; +} + +static int vfio_iommu_attach_group(struct vfio_domain *domain, + struct vfio_group *group) +{ + if (group->mdev_group) + return iommu_group_for_each_dev(group->iommu_group, + domain->domain, + vfio_mdev_attach_domain); + else + return iommu_attach_group(domain->domain, group->iommu_group); +} + +static void vfio_iommu_detach_group(struct vfio_domain *domain, + struct vfio_group *group) +{ + if (group->mdev_group) + iommu_group_for_each_dev(group->iommu_group, domain->domain, + vfio_mdev_detach_domain); + else + iommu_detach_group(domain->domain, group->iommu_group); +} + +static bool vfio_bus_is_mdev(struct bus_type *bus) +{ + struct bus_type *mdev_bus; + bool ret = false; + + mdev_bus = symbol_get(mdev_bus_type); + if (mdev_bus) { + ret = (bus == mdev_bus); + symbol_put(mdev_bus_type); + } + + return ret; +} + +static int vfio_mdev_iommu_device(struct device *dev, void *data) +{ + struct device **old = data, *new; + + new = vfio_mdev_get_iommu_device(dev); + if (!new || (*old && *old != new)) + return -EINVAL; + + *old = new; + + return 0; +} + static int vfio_iommu_try_attach_group(struct vfio_iommu *iommu, struct vfio_group *group, struct vfio_domain *cur_domain, @@ -1681,7 +1773,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, struct vfio_iommu *iommu = iommu_data; struct vfio_group *group; struct vfio_domain *domain, *d; - struct bus_type *bus = NULL, *mdev_bus; + struct bus_type *bus = NULL; int ret; bool resv_msi, msi_remap; phys_addr_t resv_msi_base; @@ -1716,23 +1808,27 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, if (ret) goto out_free; - mdev_bus = symbol_get(mdev_bus_type); + if (vfio_bus_is_mdev(bus)) { + struct device *iommu_device = NULL; + group->mdev_group = true; - if (mdev_bus) { - if ((bus == mdev_bus) && !iommu_present(bus)) { - symbol_put(mdev_bus_type); + /* Determine the isolation type */ + ret = iommu_group_for_each_dev(iommu_group, &iommu_device, + vfio_mdev_iommu_device); + if (ret || !iommu_device) { if (!iommu->external_domain) { INIT_LIST_HEAD(&domain->group_list); iommu->external_domain = domain; - } else + } else { kfree(domain); + } list_add(&group->next, &iommu->external_domain->group_list); mutex_unlock(&iommu->lock); return 0; } - symbol_put(mdev_bus_type); + bus = iommu_device->bus; } domain->domain = iommu_domain_alloc(bus); @@ -1750,7 +1846,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, goto out_domain; } - ret = iommu_attach_group(domain->domain, iommu_group); + ret = vfio_iommu_attach_group(domain, group); if (ret) goto out_domain; @@ -1773,15 +1869,19 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, domain->prot |= IOMMU_CACHE; list_for_each_entry(d, &iommu->domain_list, next) { - ret = vfio_iommu_try_attach_group(iommu, group, domain, d); - if (ret < 0) { - goto out_domain; - } else if (!ret) { - list_add(&group->next, &d->group_list); - iommu_domain_free(domain->domain); - kfree(domain); - mutex_unlock(&iommu->lock); - return 0; + if (d->domain->ops == domain->domain->ops && + d->prot == domain->prot) { + vfio_iommu_detach_group(domain, group); + if (!vfio_iommu_attach_group(d, group)) { + list_add(&group->next, &d->group_list); + iommu_domain_free(domain->domain); + kfree(domain); + mutex_unlock(&iommu->lock); + return 0; + } + ret = vfio_iommu_attach_group(domain, group); + if (ret) + goto out_domain; } } @@ -1809,7 +1909,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, return 0; out_detach: - iommu_detach_group(domain->domain, iommu_group); + vfio_iommu_detach_group(domain, group); out_domain: iommu_domain_free(domain->domain); out_free: @@ -1902,7 +2002,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, if (!group) continue; - iommu_detach_group(domain->domain, iommu_group); + vfio_iommu_detach_group(domain, group); if (group->sva_enabled) { iommu_group_for_each_dev(iommu_group, NULL, vfio_iommu_sva_shutdown); @@ -1976,7 +2076,7 @@ static void vfio_release_domain(struct vfio_domain *domain, bool external) list_for_each_entry_safe(group, group_tmp, &domain->group_list, next) { if (!external) - iommu_detach_group(domain->domain, group->iommu_group); + vfio_iommu_detach_group(domain, group); list_del(&group->next); kfree(group); } diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h index 21b3e7d33d6809ee1f888ac8aef38203fe27a1c4..d46babf3c1c4f2834b8236526727309054f99e9d 100644 --- a/include/linux/dma_remapping.h +++ b/include/linux/dma_remapping.h @@ -28,7 +28,6 @@ #define CONTEXT_DINVE (1ULL << 8) #define CONTEXT_PRS (1ULL << 9) -#define CONTEXT_PASIDE (1ULL << 11) struct intel_iommu; struct dmar_domain; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index b1b4411b4c6b8d8f867a399f40a66552ca5b76cd..b88703a79ab3ef6d0bec8d7ea7f8b22a658d691d 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -36,6 +36,8 @@ #include #include +#define CONTEXT_PASIDE BIT_ULL(3) + /* * Intel IOMMU register specification per version 1.0 public spec. */ @@ -84,6 +86,10 @@ /* * Decoding Capability Register */ +#define ecap_smts(e) (((e) >> 43) & 0x1) +#define ecap_smpwc(e) (((e) >> 48) & 0x1) +#define ecap_flts(e) (((e) >> 47) & 0x1) +#define ecap_slts(e) (((e) >> 46) & 0x1) #define cap_5lp_support(c) (((c) >> 60) & 1) #define cap_pi_support(c) (((c) >> 59) & 1) #define cap_fl1gp_support(c) (((c) >> 56) & 1) @@ -340,13 +346,18 @@ enum { #define QI_GRAN_NONG_PASID 2 #define QI_GRAN_PSI_PASID 3 +#define qi_shift(iommu) (DMAR_IQ_SHIFT + !!ecap_smts((iommu)->ecap)) + struct qi_desc { - u64 low, high; + u64 qw0; + u64 qw1; + u64 qw2; + u64 qw3; }; struct q_inval { raw_spinlock_t q_lock; - struct qi_desc *desc; /* invalidation queue */ + void *desc; /* invalidation queue */ int *desc_status; /* desc status */ int free_head; /* first free entry */ int free_tail; /* last free entry */ @@ -402,9 +413,11 @@ struct dmar_domain { /* Domain ids per IOMMU. Use u16 since * domain ids are 16 bit wide according * to VT-d spec, section 9.3 */ + unsigned int auxd_refcnt; /* Refcount of auxiliary attaching */ bool has_iotlb_device; struct list_head devices; /* all devices' list */ + struct list_head auxd; /* link to device's auxiliary list */ struct iova_domain iovad; /* iova's that belong to this domain */ struct dma_pte *pgd; /* virtual address */ @@ -423,6 +436,11 @@ struct dmar_domain { 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */ u64 max_addr; /* maximum mapped address */ + int default_pasid; /* + * The default pasid used for non-SVM + * traffic on mediated devices. + */ + struct iommu_domain domain; /* generic domain data structure for iommu core */ }; @@ -479,6 +497,9 @@ struct device_domain_info { struct list_head link; /* link to domain siblings */ struct list_head global; /* link to global list */ struct list_head table; /* link to pasid table */ + struct list_head auxiliary_domains; /* auxiliary domains + * attached to this device + */ u8 bus; /* PCI bus number */ u8 devfn; /* PCI devfn number */ u16 pfsid; /* SRIOV physical function source ID */ @@ -488,6 +509,7 @@ struct device_domain_info { u8 pri_enabled:1; u8 ats_supported:1; u8 ats_enabled:1; + u8 auxd_enabled:1; /* Multiple domains per device */ u8 ats_qdep; struct device *dev; /* it's NULL for PCIe-to-PCI bridge */ struct intel_iommu *iommu; /* IOMMU used by this device */ @@ -502,6 +524,49 @@ static inline void __iommu_flush_cache( clflush_cache_range(addr, size); } +/* + * 0: readable + * 1: writable + * 2-6: reserved + * 7: super page + * 8-10: available + * 11: snoop behavior + * 12-63: Host physcial address + */ +struct dma_pte { + u64 val; +}; + +static inline void dma_clear_pte(struct dma_pte *pte) +{ + pte->val = 0; +} + +static inline u64 dma_pte_addr(struct dma_pte *pte) +{ +#ifdef CONFIG_64BIT + return pte->val & VTD_PAGE_MASK; +#else + /* Must have a full atomic 64-bit read */ + return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK; +#endif +} + +static inline bool dma_pte_present(struct dma_pte *pte) +{ + return (pte->val & 3) != 0; +} + +static inline bool dma_pte_superpage(struct dma_pte *pte) +{ + return (pte->val & DMA_PTE_LARGE_PAGE); +} + +static inline int first_pte_in_page(struct dma_pte *pte) +{ + return !((unsigned long)pte & ~VTD_PAGE_MASK); +} + extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev); extern int dmar_find_matched_atsr_unit(struct pci_dev *dev); @@ -520,12 +585,13 @@ extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); extern int dmar_ir_support(void); -struct dmar_domain *get_valid_domain_for_dev(struct device *dev); void *alloc_pgtable_page(int node); void free_pgtable_page(void *vaddr); struct intel_iommu *domain_get_iommu(struct dmar_domain *domain); int for_each_device_domain(int (*fn)(struct device_domain_info *info, void *data), void *data); +void iommu_flush_write_buffer(struct intel_iommu *iommu); +int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev); #ifdef CONFIG_INTEL_IOMMU_SVM int intel_svm_init(struct intel_iommu *iommu); @@ -556,7 +622,6 @@ struct intel_svm { struct list_head list; }; -extern int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev); extern struct intel_iommu *intel_svm_device_to_iommu(struct device *dev); #endif diff --git a/include/linux/iommu.h b/include/linux/iommu.h index d44f3a6762bef66d053f0ee2b92dac32b1e29765..3c690a40d7bde9cae10eb5d52a60ba2ec844c7bf 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -184,6 +184,11 @@ struct iommu_resv_region { enum iommu_resv_type type; }; +/* Per device IOMMU features */ +enum iommu_dev_features { + IOMMU_DEV_FEAT_AUX, /* Aux-domain feature */ +}; + /** * enum page_response_code - Return status of fault handlers, telling the IOMMU * driver how to proceed with the fault. @@ -361,6 +366,19 @@ struct iommu_ops { struct device *dev, struct tlb_invalidate_info *inv_info); int (*page_response)(struct device *dev, struct page_response_msg *msg); +#ifndef __GENKSYMS__ + /* Per device IOMMU features */ + bool (*dev_has_feat)(struct device *dev, enum iommu_dev_features f); + bool (*dev_feat_enabled)(struct device *dev, enum iommu_dev_features f); + int (*dev_enable_feat)(struct device *dev, enum iommu_dev_features f); + int (*dev_disable_feat)(struct device *dev, enum iommu_dev_features f); + + /* Aux-domain specific attach/detach entries */ + int (*aux_attach_dev)(struct iommu_domain *domain, struct device *dev); + void (*aux_detach_dev)(struct iommu_domain *domain, struct device *dev); + int (*aux_get_pasid)(struct iommu_domain *domain, struct device *dev); +#endif + unsigned long pgsize_bitmap; #ifdef CONFIG_SMMU_BYPASS_DEV @@ -559,6 +577,10 @@ extern void iommu_set_fault_handler(struct iommu_domain *domain, extern void iommu_get_resv_regions(struct device *dev, struct list_head *list); extern void iommu_put_resv_regions(struct device *dev, struct list_head *list); extern int iommu_request_dm_for_dev(struct device *dev); +extern int iommu_request_dma_domain_for_dev(struct device *dev); +extern void iommu_set_default_passthrough(bool cmd_line); +extern void iommu_set_default_translated(bool cmd_line); +extern bool iommu_default_passthrough(void); extern struct iommu_resv_region * iommu_alloc_resv_region(phys_addr_t start, size_t length, int prot, enum iommu_resv_type type); @@ -675,6 +697,14 @@ extern int iommu_sva_bind_device(struct device *dev, struct mm_struct *mm, int *pasid, unsigned long flags, void *drvdata); extern int iommu_sva_unbind_device(struct device *dev, int pasid); +bool iommu_dev_has_feature(struct device *dev, enum iommu_dev_features f); +int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features f); +int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features f); +bool iommu_dev_feature_enabled(struct device *dev, enum iommu_dev_features f); +int iommu_aux_attach_device(struct iommu_domain *domain, struct device *dev); +void iommu_aux_detach_device(struct iommu_domain *domain, struct device *dev); +int iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev); + #else /* CONFIG_IOMMU_API */ struct iommu_ops {}; @@ -804,6 +834,24 @@ static inline int iommu_request_dm_for_dev(struct device *dev) return -ENODEV; } +static inline int iommu_request_dma_domain_for_dev(struct device *dev) +{ + return -ENODEV; +} + +static inline void iommu_set_default_passthrough(bool cmd_line) +{ +} + +static inline void iommu_set_default_translated(bool cmd_line) +{ +} + +static inline bool iommu_default_passthrough(void) +{ + return true; +} + static inline int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group) { @@ -984,6 +1032,47 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode) return NULL; } +static inline bool +iommu_dev_has_feature(struct device *dev, enum iommu_dev_features feat) +{ + return false; +} + +static inline bool +iommu_dev_feature_enabled(struct device *dev, enum iommu_dev_features feat) +{ + return false; +} + +static inline int +iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features feat) +{ + return -ENODEV; +} + +static inline int +iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat) +{ + return -ENODEV; +} + +static inline int +iommu_aux_attach_device(struct iommu_domain *domain, struct device *dev) +{ + return -ENODEV; +} + +static inline void +iommu_aux_detach_device(struct iommu_domain *domain, struct device *dev) +{ +} + +static inline int +iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev) +{ + return -ENODEV; +} + static inline struct iommu_domain *iommu_group_share_domain(struct iommu_group *group) { diff --git a/include/linux/mdev.h b/include/linux/mdev.h index b6e048e1045f99868642b920a005d2382e6dc316..c3ab8a9cfcc72f00c9e045721f51a6cbecf70d90 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -15,6 +15,20 @@ struct mdev_device; +/* + * Called by the parent device driver to set the device which represents + * this mdev in iommu protection scope. By default, the iommu device is + * NULL, that indicates using vendor defined isolation. + * + * @dev: the mediated device that iommu will isolate. + * @iommu_device: a pci device which represents the iommu for @dev. + * + * Return 0 for success, otherwise negative error value. + */ +int mdev_set_iommu_device(struct device *dev, struct device *iommu_device); + +struct device *mdev_get_iommu_device(struct device *dev); + /** * struct mdev_parent_ops - Structure to be registered for each parent device to * register the device to mdev module.