Merge: KVM: Performance Enhanced Refresh PCI Translation

MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/7076

JIRA: https://issues.redhat.com/browse/RHEL-11431

Depends: !6968

Commits:
```
807c2743035446cf0484772a76e1c35ce27fd8e3
64af12c6ec3afd7d44bc8b2044eee59f98059087
0ed5967a0a63552e5ca0f2382ce362f84e83801c
d236843a6964dc5a55dbafa5cfae63bc99cf10f8
6d52cb738a98df6d5a91d96ce5bc557d24343964
81244074b518aeb90de5f68e7e825564c29c5c50
d5fbc5efbd98a6c49f9acfca8595efd0f1e59ad8
1fe3f3cad530981e8ff36cefc19c8534bfd401b3
a2392b8f1ffc26ac4d9fc5cbf081e91a2e39ca71
83c1aec21064c762a2ee475f0fd19f5241de25bc
```

Signed-off-by: Christoph Schlameuss <cschlame@redhat.com>

Approved-by: Thomas Huth <thuth@redhat.com>
Approved-by: Cornelia Huck <cohuck@redhat.com>
Approved-by: Jerry Snitselaar <jsnitsel@redhat.com>
Approved-by: CKI KWF Bot <cki-ci-bot+kwf-gitlab-com@redhat.com>

Merged-by: Augusto Caringi <acaringi@redhat.com>

commit da5db8f9a8 (Augusto Caringi, 2025-07-21 18:52:36 -03:00)
9 changed files with 479 additions and 100 deletions

View File

@@ -144,7 +144,7 @@ struct zpci_dev {
u8 util_str_avail : 1;
u8 irqs_registered : 1;
u8 tid_avail : 1;
u8 reserved : 1;
u8 rtr_avail : 1; /* Relaxed translation allowed */
unsigned int devfn; /* DEVFN part of the RID*/
u8 pfip[CLP_PFIP_NR_SEGMENTS]; /* pci function internal path */
@@ -217,6 +217,7 @@ extern struct airq_iv *zpci_aif_sbv;
struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state);
int zpci_add_device(struct zpci_dev *zdev);
int zpci_enable_device(struct zpci_dev *);
int zpci_reenable_device(struct zpci_dev *zdev);
int zpci_disable_device(struct zpci_dev *);
int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh);
int zpci_deconfigure_device(struct zpci_dev *zdev);
@@ -245,6 +246,7 @@ void update_uid_checking(bool new);
/* IOMMU Interface */
int zpci_init_iommu(struct zpci_dev *zdev);
void zpci_destroy_iommu(struct zpci_dev *zdev);
int zpci_iommu_register_ioat(struct zpci_dev *zdev, u8 *status);
#ifdef CONFIG_PCI
static inline bool zpci_use_mio(struct zpci_dev *zdev)

View File

@@ -156,7 +156,9 @@ struct clp_rsp_query_pci_grp {
u16 : 4;
u16 noi : 12; /* number of interrupts */
u8 version;
u8 : 6;
u8 : 2;
u8 rtr : 1; /* Relaxed translation requirement */
u8 : 3;
u8 frame : 1;
u8 refresh : 1; /* TLB refresh mode */
u16 : 3;

View File

@@ -25,6 +25,7 @@ enum zpci_ioat_dtype {
#define ZPCI_KEY (PAGE_DEFAULT_KEY << 5)
#define ZPCI_TABLE_SIZE_RT (1UL << 42)
#define ZPCI_TABLE_SIZE_RS (1UL << 53)
#define ZPCI_IOTA_STO_FLAG (ZPCI_IOTA_IOT_ENABLED | ZPCI_KEY | ZPCI_IOTA_DT_ST)
#define ZPCI_IOTA_RTTO_FLAG (ZPCI_IOTA_IOT_ENABLED | ZPCI_KEY | ZPCI_IOTA_DT_RT)
@@ -55,6 +56,8 @@ enum zpci_ioat_dtype {
#define ZPCI_PT_BITS 8
#define ZPCI_ST_SHIFT (ZPCI_PT_BITS + PAGE_SHIFT)
#define ZPCI_RT_SHIFT (ZPCI_ST_SHIFT + ZPCI_TABLE_BITS)
#define ZPCI_RS_SHIFT (ZPCI_RT_SHIFT + ZPCI_TABLE_BITS)
#define ZPCI_RF_SHIFT (ZPCI_RS_SHIFT + ZPCI_TABLE_BITS)
#define ZPCI_RTE_FLAG_MASK 0x3fffUL
#define ZPCI_RTE_ADDR_MASK (~ZPCI_RTE_FLAG_MASK)
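
The new ZPCI_RS_SHIFT/ZPCI_RF_SHIFT values line up with the table-size constants above: each extra region-table level extends the reach by another ZPCI_TABLE_BITS worth of index bits. A minimal standalone sketch of that arithmetic (not part of the diff), assuming the existing header values PAGE_SHIFT = 12, ZPCI_PT_BITS = 8 and ZPCI_TABLE_BITS = 11:
```
#include <stdio.h>

int main(void)
{
	unsigned int st_shift = 8 + 12;		/* ZPCI_ST_SHIFT = 20 */
	unsigned int rt_shift = st_shift + 11;	/* ZPCI_RT_SHIFT = 31 */
	unsigned int rs_shift = rt_shift + 11;	/* ZPCI_RS_SHIFT = 42 */
	unsigned int rf_shift = rs_shift + 11;	/* ZPCI_RF_SHIFT = 53 */

	/* A table at a given level spans 2^(shift + 11) bytes, hence
	 * ZPCI_TABLE_SIZE_RT = 1UL << 42 (4 TiB) and the newly added
	 * ZPCI_TABLE_SIZE_RS = 1UL << 53 (8 PiB); a region-first table
	 * covers the rest of the 64-bit space. */
	printf("RT spans %llu TiB, RS spans %llu PiB, RF shift is %u\n",
	       (1ULL << (rt_shift + 11)) >> 40,
	       (1ULL << (rs_shift + 11)) >> 50,
	       rf_shift);
	return 0;
}
```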

View File

@@ -433,7 +433,6 @@ static void kvm_s390_pci_dev_release(struct zpci_dev *zdev)
static int kvm_s390_pci_register_kvm(void *opaque, struct kvm *kvm)
{
struct zpci_dev *zdev = opaque;
u8 status;
int rc;
if (!zdev)
@@ -480,13 +479,7 @@ static int kvm_s390_pci_register_kvm(void *opaque, struct kvm *kvm)
*/
zdev->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa);
rc = zpci_enable_device(zdev);
if (rc)
goto clear_gisa;
/* Re-register the IOMMU that was already created */
rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
virt_to_phys(zdev->dma_table), &status);
rc = zpci_reenable_device(zdev);
if (rc)
goto clear_gisa;
@@ -516,7 +509,6 @@ static void kvm_s390_pci_unregister_kvm(void *opaque)
{
struct zpci_dev *zdev = opaque;
struct kvm *kvm;
u8 status;
if (!zdev)
return;
@@ -550,12 +542,7 @@ static void kvm_s390_pci_unregister_kvm(void *opaque)
goto out;
}
if (zpci_enable_device(zdev))
goto out;
/* Re-register the IOMMU that was already created */
zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
virt_to_phys(zdev->dma_table), &status);
zpci_reenable_device(zdev);
out:
spin_lock(&kvm->arch.kzdev_list_lock);

View File

@@ -134,14 +134,13 @@ int zpci_register_ioat(struct zpci_dev *zdev, u8 dmaas,
struct zpci_fib fib = {0};
u8 cc;
WARN_ON_ONCE(iota & 0x3fff);
fib.pba = base;
/* Work around off by one in ISM virt device */
if (zdev->pft == PCI_FUNC_TYPE_ISM && limit > base)
fib.pal = limit + (1 << 12);
else
fib.pal = limit;
fib.iota = iota | ZPCI_IOTA_RTTO_FLAG;
fib.iota = iota;
fib.gd = zdev->gisa;
cc = zpci_mod_fc(req, &fib, status);
if (cc)
@@ -700,6 +699,23 @@ int zpci_enable_device(struct zpci_dev *zdev)
}
EXPORT_SYMBOL_GPL(zpci_enable_device);
int zpci_reenable_device(struct zpci_dev *zdev)
{
u8 status;
int rc;
rc = zpci_enable_device(zdev);
if (rc)
return rc;
rc = zpci_iommu_register_ioat(zdev, &status);
if (rc)
zpci_disable_device(zdev);
return rc;
}
EXPORT_SYMBOL_GPL(zpci_reenable_device);
int zpci_disable_device(struct zpci_dev *zdev)
{
u32 fh = zdev->fh;
@@ -749,7 +765,6 @@ EXPORT_SYMBOL_GPL(zpci_disable_device);
*/
int zpci_hot_reset_device(struct zpci_dev *zdev)
{
u8 status;
int rc;
lockdep_assert_held(&zdev->state_lock);
@@ -768,19 +783,9 @@ int zpci_hot_reset_device(struct zpci_dev *zdev)
return rc;
}
rc = zpci_enable_device(zdev);
if (rc)
return rc;
rc = zpci_reenable_device(zdev);
if (zdev->dma_table)
rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
virt_to_phys(zdev->dma_table), &status);
if (rc) {
zpci_disable_device(zdev);
return rc;
}
return 0;
}
/**

View File

@@ -19,6 +19,7 @@
#include <linux/jump_label.h>
#include <linux/pci.h>
#include <linux/printk.h>
#include <linux/dma-direct.h>
#include <asm/pci_clp.h>
#include <asm/pci_dma.h>
@@ -283,10 +284,32 @@ static struct zpci_bus *zpci_bus_alloc(int topo, bool topo_is_tid)
return zbus;
}
static void pci_dma_range_setup(struct pci_dev *pdev)
{
struct zpci_dev *zdev = to_zpci(pdev);
u64 aligned_end, size;
dma_addr_t dma_start;
int ret;
dma_start = PAGE_ALIGN(zdev->start_dma);
aligned_end = PAGE_ALIGN_DOWN(zdev->end_dma + 1);
if (aligned_end >= dma_start)
size = aligned_end - dma_start;
else
size = 0;
WARN_ON_ONCE(size == 0);
ret = dma_direct_set_offset(&pdev->dev, 0, dma_start, size);
if (ret)
pr_err("Failed to allocate DMA range map for %s\n", pci_name(pdev));
}
void pcibios_bus_add_device(struct pci_dev *pdev)
{
struct zpci_dev *zdev = to_zpci(pdev);
pci_dma_range_setup(pdev);
/*
* With pdev->no_vf_scan the common PCI probing code does not
* perform PF/VF linking.
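
For orientation, dma_direct_set_offset() installs a single-entry bus_dma_region that maps CPU physical [0, size) to DMA [dma_start, dma_start + size), so with the relaxed-translation identity mapping the device-visible DMA addresses come out offset by start_dma. A minimal standalone sketch with hypothetical numbers (the 4 GiB window below is illustrative only, not taken from the diff):
```
#include <stdio.h>

int main(void)
{
	unsigned long long dma_start = 0x100000000ULL;	/* hypothetical PAGE_ALIGN(zdev->start_dma) */
	unsigned long long size      = 0x200000000ULL;	/* hypothetical aligned_end - dma_start */
	unsigned long long paddr     = 0x7654000ULL;	/* some buffer's CPU physical address */

	/* dma-direct adds (dma_start - cpu_start) for addresses inside the
	 * range map; cpu_start is 0 as passed by pci_dma_range_setup(). */
	if (paddr < size)
		printf("phys %#llx -> dma %#llx\n", paddr, paddr + dma_start);
	return 0;
}
```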

View File

@@ -108,6 +108,7 @@ static void clp_store_query_pci_fngrp(struct zpci_dev *zdev,
zdev->version = response->version;
zdev->maxstbl = response->maxstbl;
zdev->dtsm = response->dtsm;
zdev->rtr_avail = response->rtr;
switch (response->version) {
case 1:

View File

@@ -52,7 +52,6 @@ static DEVICE_ATTR_RO(mio_enabled);
static int _do_recover(struct pci_dev *pdev, struct zpci_dev *zdev)
{
u8 status;
int ret;
pci_stop_and_remove_bus_device(pdev);
@@ -70,16 +69,8 @@ static int _do_recover(struct pci_dev *pdev, struct zpci_dev *zdev)
return ret;
}
ret = zpci_enable_device(zdev);
if (ret)
return ret;
ret = zpci_reenable_device(zdev);
if (zdev->dma_table) {
ret = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
virt_to_phys(zdev->dma_table), &status);
if (ret)
zpci_disable_device(zdev);
}
return ret;
}

View File

@@ -16,7 +16,7 @@
#include "dma-iommu.h"
static const struct iommu_ops s390_iommu_ops;
static const struct iommu_ops s390_iommu_ops, s390_iommu_rtr_ops;
static struct kmem_cache *dma_region_table_cache;
static struct kmem_cache *dma_page_table_cache;
@@ -31,10 +31,21 @@ struct s390_domain {
unsigned long *dma_table;
spinlock_t list_lock;
struct rcu_head rcu;
u8 origin_type;
};
static struct iommu_domain blocking_domain;
static inline unsigned int calc_rfx(dma_addr_t ptr)
{
return ((unsigned long)ptr >> ZPCI_RF_SHIFT) & ZPCI_INDEX_MASK;
}
static inline unsigned int calc_rsx(dma_addr_t ptr)
{
return ((unsigned long)ptr >> ZPCI_RS_SHIFT) & ZPCI_INDEX_MASK;
}
static inline unsigned int calc_rtx(dma_addr_t ptr)
{
return ((unsigned long)ptr >> ZPCI_RT_SHIFT) & ZPCI_INDEX_MASK;
@@ -56,6 +67,20 @@ static inline void set_pt_pfaa(unsigned long *entry, phys_addr_t pfaa)
*entry |= (pfaa & ZPCI_PTE_ADDR_MASK);
}
static inline void set_rf_rso(unsigned long *entry, phys_addr_t rso)
{
*entry &= ZPCI_RTE_FLAG_MASK;
*entry |= (rso & ZPCI_RTE_ADDR_MASK);
*entry |= ZPCI_TABLE_TYPE_RFX;
}
static inline void set_rs_rto(unsigned long *entry, phys_addr_t rto)
{
*entry &= ZPCI_RTE_FLAG_MASK;
*entry |= (rto & ZPCI_RTE_ADDR_MASK);
*entry |= ZPCI_TABLE_TYPE_RSX;
}
static inline void set_rt_sto(unsigned long *entry, phys_addr_t sto)
{
*entry &= ZPCI_RTE_FLAG_MASK;
@@ -70,6 +95,22 @@ static inline void set_st_pto(unsigned long *entry, phys_addr_t pto)
*entry |= ZPCI_TABLE_TYPE_SX;
}
static inline void validate_rf_entry(unsigned long *entry)
{
*entry &= ~ZPCI_TABLE_VALID_MASK;
*entry &= ~ZPCI_TABLE_OFFSET_MASK;
*entry |= ZPCI_TABLE_VALID;
*entry |= ZPCI_TABLE_LEN_RFX;
}
static inline void validate_rs_entry(unsigned long *entry)
{
*entry &= ~ZPCI_TABLE_VALID_MASK;
*entry &= ~ZPCI_TABLE_OFFSET_MASK;
*entry |= ZPCI_TABLE_VALID;
*entry |= ZPCI_TABLE_LEN_RSX;
}
static inline void validate_rt_entry(unsigned long *entry)
{
*entry &= ~ZPCI_TABLE_VALID_MASK;
@@ -120,6 +161,22 @@ static inline int pt_entry_isvalid(unsigned long entry)
return (entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID;
}
static inline unsigned long *get_rf_rso(unsigned long entry)
{
if ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_RFX)
return phys_to_virt(entry & ZPCI_RTE_ADDR_MASK);
else
return NULL;
}
static inline unsigned long *get_rs_rto(unsigned long entry)
{
if ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_RSX)
return phys_to_virt(entry & ZPCI_RTE_ADDR_MASK);
else
return NULL;
}
static inline unsigned long *get_rt_sto(unsigned long entry)
{
if ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_RTX)
@@ -191,18 +248,59 @@ static void dma_free_seg_table(unsigned long entry)
dma_free_cpu_table(sto);
}
static void dma_cleanup_tables(unsigned long *table)
static void dma_free_rt_table(unsigned long entry)
{
unsigned long *rto = get_rs_rto(entry);
int rtx;
if (!table)
for (rtx = 0; rtx < ZPCI_TABLE_ENTRIES; rtx++)
if (reg_entry_isvalid(rto[rtx]))
dma_free_seg_table(rto[rtx]);
dma_free_cpu_table(rto);
}
static void dma_free_rs_table(unsigned long entry)
{
unsigned long *rso = get_rf_rso(entry);
int rsx;
for (rsx = 0; rsx < ZPCI_TABLE_ENTRIES; rsx++)
if (reg_entry_isvalid(rso[rsx]))
dma_free_rt_table(rso[rsx]);
dma_free_cpu_table(rso);
}
static void dma_cleanup_tables(struct s390_domain *domain)
{
int rtx, rsx, rfx;
if (!domain->dma_table)
return;
switch (domain->origin_type) {
case ZPCI_TABLE_TYPE_RFX:
for (rfx = 0; rfx < ZPCI_TABLE_ENTRIES; rfx++)
if (reg_entry_isvalid(domain->dma_table[rfx]))
dma_free_rs_table(domain->dma_table[rfx]);
break;
case ZPCI_TABLE_TYPE_RSX:
for (rsx = 0; rsx < ZPCI_TABLE_ENTRIES; rsx++)
if (reg_entry_isvalid(domain->dma_table[rsx]))
dma_free_rt_table(domain->dma_table[rsx]);
break;
case ZPCI_TABLE_TYPE_RTX:
for (rtx = 0; rtx < ZPCI_TABLE_ENTRIES; rtx++)
if (reg_entry_isvalid(table[rtx]))
dma_free_seg_table(table[rtx]);
if (reg_entry_isvalid(domain->dma_table[rtx]))
dma_free_seg_table(domain->dma_table[rtx]);
break;
default:
WARN_ONCE(1, "Invalid IOMMU table (%x)\n", domain->origin_type);
return;
}
dma_free_cpu_table(table);
dma_free_cpu_table(domain->dma_table);
}
static unsigned long *dma_alloc_page_table(gfp_t gfp)
@@ -218,6 +316,70 @@ static unsigned long *dma_alloc_page_table(gfp_t gfp)
return table;
}
static unsigned long *dma_walk_rs_table(unsigned long *rso,
dma_addr_t dma_addr, gfp_t gfp)
{
unsigned int rsx = calc_rsx(dma_addr);
unsigned long old_rse, rse;
unsigned long *rsep, *rto;
rsep = &rso[rsx];
rse = READ_ONCE(*rsep);
if (reg_entry_isvalid(rse)) {
rto = get_rs_rto(rse);
} else {
rto = dma_alloc_cpu_table(gfp);
if (!rto)
return NULL;
set_rs_rto(&rse, virt_to_phys(rto));
validate_rs_entry(&rse);
entry_clr_protected(&rse);
old_rse = cmpxchg(rsep, ZPCI_TABLE_INVALID, rse);
if (old_rse != ZPCI_TABLE_INVALID) {
/* Somone else was faster, use theirs */
dma_free_cpu_table(rto);
rto = get_rs_rto(old_rse);
}
}
return rto;
}
static unsigned long *dma_walk_rf_table(unsigned long *rfo,
dma_addr_t dma_addr, gfp_t gfp)
{
unsigned int rfx = calc_rfx(dma_addr);
unsigned long old_rfe, rfe;
unsigned long *rfep, *rso;
rfep = &rfo[rfx];
rfe = READ_ONCE(*rfep);
if (reg_entry_isvalid(rfe)) {
rso = get_rf_rso(rfe);
} else {
rso = dma_alloc_cpu_table(gfp);
if (!rso)
return NULL;
set_rf_rso(&rfe, virt_to_phys(rso));
validate_rf_entry(&rfe);
entry_clr_protected(&rfe);
old_rfe = cmpxchg(rfep, ZPCI_TABLE_INVALID, rfe);
if (old_rfe != ZPCI_TABLE_INVALID) {
/* Somone else was faster, use theirs */
dma_free_cpu_table(rso);
rso = get_rf_rso(old_rfe);
}
}
if (!rso)
return NULL;
return dma_walk_rs_table(rso, dma_addr, gfp);
}
static unsigned long *dma_get_seg_table_origin(unsigned long *rtep, gfp_t gfp)
{
unsigned long old_rte, rte;
@@ -271,11 +433,31 @@ static unsigned long *dma_get_page_table_origin(unsigned long *step, gfp_t gfp)
return pto;
}
static unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr, gfp_t gfp)
static unsigned long *dma_walk_region_tables(struct s390_domain *domain,
dma_addr_t dma_addr, gfp_t gfp)
{
unsigned long *sto, *pto;
switch (domain->origin_type) {
case ZPCI_TABLE_TYPE_RFX:
return dma_walk_rf_table(domain->dma_table, dma_addr, gfp);
case ZPCI_TABLE_TYPE_RSX:
return dma_walk_rs_table(domain->dma_table, dma_addr, gfp);
case ZPCI_TABLE_TYPE_RTX:
return domain->dma_table;
default:
return NULL;
}
}
static unsigned long *dma_walk_cpu_trans(struct s390_domain *domain,
dma_addr_t dma_addr, gfp_t gfp)
{
unsigned long *rto, *sto, *pto;
unsigned int rtx, sx, px;
rto = dma_walk_region_tables(domain, dma_addr, gfp);
if (!rto)
return NULL;
rtx = calc_rtx(dma_addr);
sto = dma_get_seg_table_origin(&rto[rtx], gfp);
if (!sto)
@@ -329,9 +511,25 @@ static bool s390_iommu_capable(struct device *dev, enum iommu_cap cap)
}
}
static inline u64 max_tbl_size(struct s390_domain *domain)
{
switch (domain->origin_type) {
case ZPCI_TABLE_TYPE_RTX:
return ZPCI_TABLE_SIZE_RT - 1;
case ZPCI_TABLE_TYPE_RSX:
return ZPCI_TABLE_SIZE_RS - 1;
case ZPCI_TABLE_TYPE_RFX:
return U64_MAX;
default:
return 0;
}
}
static struct iommu_domain *s390_domain_alloc_paging(struct device *dev)
{
struct zpci_dev *zdev = to_zpci_dev(dev);
struct s390_domain *s390_domain;
u64 aperture_size;
s390_domain = kzalloc(sizeof(*s390_domain), GFP_KERNEL);
if (!s390_domain)
@@ -342,9 +540,26 @@ static struct iommu_domain *s390_domain_alloc_paging(struct device *dev)
kfree(s390_domain);
return NULL;
}
aperture_size = min(s390_iommu_aperture,
zdev->end_dma - zdev->start_dma + 1);
if (aperture_size <= (ZPCI_TABLE_SIZE_RT - zdev->start_dma)) {
s390_domain->origin_type = ZPCI_TABLE_TYPE_RTX;
} else if (aperture_size <= (ZPCI_TABLE_SIZE_RS - zdev->start_dma) &&
(zdev->dtsm & ZPCI_IOTA_DT_RS)) {
s390_domain->origin_type = ZPCI_TABLE_TYPE_RSX;
} else if (zdev->dtsm & ZPCI_IOTA_DT_RF) {
s390_domain->origin_type = ZPCI_TABLE_TYPE_RFX;
} else {
/* Assume RTX available */
s390_domain->origin_type = ZPCI_TABLE_TYPE_RTX;
aperture_size = ZPCI_TABLE_SIZE_RT - zdev->start_dma;
}
zdev->end_dma = zdev->start_dma + aperture_size - 1;
s390_domain->domain.geometry.force_aperture = true;
s390_domain->domain.geometry.aperture_start = 0;
s390_domain->domain.geometry.aperture_end = ZPCI_TABLE_SIZE_RT - 1;
s390_domain->domain.geometry.aperture_end = max_tbl_size(s390_domain);
spin_lock_init(&s390_domain->list_lock);
INIT_LIST_HEAD_RCU(&s390_domain->devices);
@@ -356,7 +571,7 @@ static void s390_iommu_rcu_free_domain(struct rcu_head *head)
{
struct s390_domain *s390_domain = container_of(head, struct s390_domain, rcu);
dma_cleanup_tables(s390_domain->dma_table);
dma_cleanup_tables(s390_domain);
kfree(s390_domain);
}
@@ -381,6 +596,61 @@ static void zdev_s390_domain_update(struct zpci_dev *zdev,
spin_unlock_irqrestore(&zdev->dom_lock, flags);
}
static u64 get_iota_region_flag(struct s390_domain *domain)
{
switch (domain->origin_type) {
case ZPCI_TABLE_TYPE_RTX:
return ZPCI_IOTA_RTTO_FLAG;
case ZPCI_TABLE_TYPE_RSX:
return ZPCI_IOTA_RSTO_FLAG;
case ZPCI_TABLE_TYPE_RFX:
return ZPCI_IOTA_RFTO_FLAG;
default:
WARN_ONCE(1, "Invalid IOMMU table (%x)\n", domain->origin_type);
return 0;
}
}
static int s390_iommu_domain_reg_ioat(struct zpci_dev *zdev,
struct iommu_domain *domain, u8 *status)
{
struct s390_domain *s390_domain;
int rc = 0;
u64 iota;
switch (domain->type) {
case IOMMU_DOMAIN_IDENTITY:
rc = zpci_register_ioat(zdev, 0, zdev->start_dma,
zdev->end_dma, 0, status);
break;
case IOMMU_DOMAIN_BLOCKED:
/* Nothing to do in this case */
break;
default:
s390_domain = to_s390_domain(domain);
iota = virt_to_phys(s390_domain->dma_table) |
get_iota_region_flag(s390_domain);
rc = zpci_register_ioat(zdev, 0, zdev->start_dma,
zdev->end_dma, iota, status);
}
return rc;
}
int zpci_iommu_register_ioat(struct zpci_dev *zdev, u8 *status)
{
unsigned long flags;
int rc;
spin_lock_irqsave(&zdev->dom_lock, flags);
rc = s390_iommu_domain_reg_ioat(zdev, zdev->s390_domain, status);
spin_unlock_irqrestore(&zdev->dom_lock, flags);
return rc;
}
static int blocking_domain_attach_device(struct iommu_domain *domain,
struct device *dev)
{
@@ -392,9 +662,11 @@ static int blocking_domain_attach_device(struct iommu_domain *domain,
return 0;
s390_domain = to_s390_domain(zdev->s390_domain);
if (zdev->dma_table) {
spin_lock_irqsave(&s390_domain->list_lock, flags);
list_del_rcu(&zdev->iommu_list);
spin_unlock_irqrestore(&s390_domain->list_lock, flags);
}
zpci_unregister_ioat(zdev, 0);
zdev->dma_table = NULL;
@@ -422,8 +694,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
blocking_domain_attach_device(&blocking_domain, dev);
/* If we fail now DMA remains blocked via blocking domain */
cc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
virt_to_phys(s390_domain->dma_table), &status);
cc = s390_iommu_domain_reg_ioat(zdev, domain, &status);
if (cc && status != ZPCI_PCI_ST_FUNC_NOT_AVAIL)
return -EIO;
zdev->dma_table = s390_domain->dma_table;
@@ -441,6 +712,8 @@ static void s390_iommu_get_resv_regions(struct device *dev,
{
struct zpci_dev *zdev = to_zpci_dev(dev);
struct iommu_resv_region *region;
u64 max_size, end_resv;
unsigned long flags;
if (zdev->start_dma) {
region = iommu_alloc_resv_region(0, zdev->start_dma, 0,
@@ -450,10 +723,21 @@ static void s390_iommu_get_resv_regions(struct device *dev,
list_add_tail(&region->list, list);
}
if (zdev->end_dma < ZPCI_TABLE_SIZE_RT - 1) {
region = iommu_alloc_resv_region(zdev->end_dma + 1,
ZPCI_TABLE_SIZE_RT - zdev->end_dma - 1,
0, IOMMU_RESV_RESERVED, GFP_KERNEL);
spin_lock_irqsave(&zdev->dom_lock, flags);
if (zdev->s390_domain->type == IOMMU_DOMAIN_BLOCKED ||
zdev->s390_domain->type == IOMMU_DOMAIN_IDENTITY) {
spin_unlock_irqrestore(&zdev->dom_lock, flags);
return;
}
max_size = max_tbl_size(to_s390_domain(zdev->s390_domain));
spin_unlock_irqrestore(&zdev->dom_lock, flags);
if (zdev->end_dma < max_size) {
end_resv = max_size - zdev->end_dma;
region = iommu_alloc_resv_region(zdev->end_dma + 1, end_resv,
0, IOMMU_RESV_RESERVED,
GFP_KERNEL);
if (!region)
return;
list_add_tail(&region->list, list);
@@ -469,13 +753,9 @@ static struct iommu_device *s390_iommu_probe_device(struct device *dev)
zdev = to_zpci_dev(dev);
if (zdev->start_dma > zdev->end_dma ||
zdev->start_dma > ZPCI_TABLE_SIZE_RT - 1)
if (zdev->start_dma > zdev->end_dma)
return ERR_PTR(-EINVAL);
if (zdev->end_dma > ZPCI_TABLE_SIZE_RT - 1)
zdev->end_dma = ZPCI_TABLE_SIZE_RT - 1;
if (zdev->tlb_refresh)
dev->iommu->shadow_on_flush = 1;
@@ -565,8 +845,7 @@ static int s390_iommu_validate_trans(struct s390_domain *s390_domain,
int rc;
for (i = 0; i < nr_pages; i++) {
entry = dma_walk_cpu_trans(s390_domain->dma_table, dma_addr,
gfp);
entry = dma_walk_cpu_trans(s390_domain, dma_addr, gfp);
if (unlikely(!entry)) {
rc = -ENOMEM;
goto undo_cpu_trans;
@@ -581,8 +860,7 @@ static int s390_iommu_validate_trans(struct s390_domain *s390_domain,
undo_cpu_trans:
while (i-- > 0) {
dma_addr -= PAGE_SIZE;
entry = dma_walk_cpu_trans(s390_domain->dma_table,
dma_addr, gfp);
entry = dma_walk_cpu_trans(s390_domain, dma_addr, gfp);
if (!entry)
break;
dma_update_cpu_trans(entry, 0, ZPCI_PTE_INVALID);
@@ -599,8 +877,7 @@ static int s390_iommu_invalidate_trans(struct s390_domain *s390_domain,
int rc = 0;
for (i = 0; i < nr_pages; i++) {
entry = dma_walk_cpu_trans(s390_domain->dma_table, dma_addr,
GFP_ATOMIC);
entry = dma_walk_cpu_trans(s390_domain, dma_addr, GFP_ATOMIC);
if (unlikely(!entry)) {
rc = -EINVAL;
break;
@@ -644,6 +921,51 @@ static int s390_iommu_map_pages(struct iommu_domain *domain,
return rc;
}
static unsigned long *get_rso_from_iova(struct s390_domain *domain,
dma_addr_t iova)
{
unsigned long *rfo;
unsigned long rfe;
unsigned int rfx;
switch (domain->origin_type) {
case ZPCI_TABLE_TYPE_RFX:
rfo = domain->dma_table;
rfx = calc_rfx(iova);
rfe = READ_ONCE(rfo[rfx]);
if (!reg_entry_isvalid(rfe))
return NULL;
return get_rf_rso(rfe);
case ZPCI_TABLE_TYPE_RSX:
return domain->dma_table;
default:
return NULL;
}
}
static unsigned long *get_rto_from_iova(struct s390_domain *domain,
dma_addr_t iova)
{
unsigned long *rso;
unsigned long rse;
unsigned int rsx;
switch (domain->origin_type) {
case ZPCI_TABLE_TYPE_RFX:
case ZPCI_TABLE_TYPE_RSX:
rso = get_rso_from_iova(domain, iova);
rsx = calc_rsx(iova);
rse = READ_ONCE(rso[rsx]);
if (!reg_entry_isvalid(rse))
return NULL;
return get_rs_rto(rse);
case ZPCI_TABLE_TYPE_RTX:
return domain->dma_table;
default:
return NULL;
}
}
static phys_addr_t s390_iommu_iova_to_phys(struct iommu_domain *domain,
dma_addr_t iova)
{
@@ -657,10 +979,13 @@ static phys_addr_t s390_iommu_iova_to_phys(struct iommu_domain *domain,
iova > domain->geometry.aperture_end)
return 0;
rto = get_rto_from_iova(s390_domain, iova);
if (!rto)
return 0;
rtx = calc_rtx(iova);
sx = calc_sx(iova);
px = calc_px(iova);
rto = s390_domain->dma_table;
rte = READ_ONCE(rto[rtx]);
if (reg_entry_isvalid(rte)) {
@@ -715,7 +1040,6 @@ struct zpci_iommu_ctrs *zpci_get_iommu_ctrs(struct zpci_dev *zdev)
int zpci_init_iommu(struct zpci_dev *zdev)
{
u64 aperture_size;
int rc = 0;
rc = iommu_device_sysfs_add(&zdev->iommu_dev, NULL, NULL,
@@ -723,16 +1047,16 @@ int zpci_init_iommu(struct zpci_dev *zdev)
if (rc)
goto out_err;
rc = iommu_device_register(&zdev->iommu_dev, &s390_iommu_ops, NULL);
if (zdev->rtr_avail) {
rc = iommu_device_register(&zdev->iommu_dev,
&s390_iommu_rtr_ops, NULL);
} else {
rc = iommu_device_register(&zdev->iommu_dev, &s390_iommu_ops,
NULL);
}
if (rc)
goto out_sysfs;
zdev->start_dma = PAGE_ALIGN(zdev->start_dma);
aperture_size = min3(s390_iommu_aperture,
ZPCI_TABLE_SIZE_RT - zdev->start_dma,
zdev->end_dma - zdev->start_dma + 1);
zdev->end_dma = zdev->start_dma + aperture_size - 1;
return 0;
out_sysfs:
@@ -787,6 +1111,39 @@ static int __init s390_iommu_init(void)
}
subsys_initcall(s390_iommu_init);
static int s390_attach_dev_identity(struct iommu_domain *domain,
struct device *dev)
{
struct zpci_dev *zdev = to_zpci_dev(dev);
u8 status;
int cc;
blocking_domain_attach_device(&blocking_domain, dev);
/* If we fail now DMA remains blocked via blocking domain */
cc = s390_iommu_domain_reg_ioat(zdev, domain, &status);
/*
* If the device is undergoing error recovery the reset code
* will re-establish the new domain.
*/
if (cc && status != ZPCI_PCI_ST_FUNC_NOT_AVAIL)
return -EIO;
zdev_s390_domain_update(zdev, domain);
return 0;
}
static const struct iommu_domain_ops s390_identity_ops = {
.attach_dev = s390_attach_dev_identity,
};
static struct iommu_domain s390_identity_domain = {
.type = IOMMU_DOMAIN_IDENTITY,
.ops = &s390_identity_ops,
};
static struct iommu_domain blocking_domain = {
.type = IOMMU_DOMAIN_BLOCKED,
.ops = &(const struct iommu_domain_ops) {
@@ -794,23 +1151,31 @@ static struct iommu_domain blocking_domain = {
}
};
static const struct iommu_ops s390_iommu_ops = {
.blocked_domain = &blocking_domain,
.release_domain = &blocking_domain,
.capable = s390_iommu_capable,
.domain_alloc_paging = s390_domain_alloc_paging,
.probe_device = s390_iommu_probe_device,
.device_group = generic_device_group,
.pgsize_bitmap = SZ_4K,
.get_resv_regions = s390_iommu_get_resv_regions,
.default_domain_ops = &(const struct iommu_domain_ops) {
.attach_dev = s390_iommu_attach_device,
.map_pages = s390_iommu_map_pages,
.unmap_pages = s390_iommu_unmap_pages,
.flush_iotlb_all = s390_iommu_flush_iotlb_all,
.iotlb_sync = s390_iommu_iotlb_sync,
.iotlb_sync_map = s390_iommu_iotlb_sync_map,
.iova_to_phys = s390_iommu_iova_to_phys,
.free = s390_domain_free,
#define S390_IOMMU_COMMON_OPS() \
.blocked_domain = &blocking_domain, \
.release_domain = &blocking_domain, \
.capable = s390_iommu_capable, \
.domain_alloc_paging = s390_domain_alloc_paging, \
.probe_device = s390_iommu_probe_device, \
.device_group = generic_device_group, \
.pgsize_bitmap = SZ_4K, \
.get_resv_regions = s390_iommu_get_resv_regions, \
.default_domain_ops = &(const struct iommu_domain_ops) { \
.attach_dev = s390_iommu_attach_device, \
.map_pages = s390_iommu_map_pages, \
.unmap_pages = s390_iommu_unmap_pages, \
.flush_iotlb_all = s390_iommu_flush_iotlb_all, \
.iotlb_sync = s390_iommu_iotlb_sync, \
.iotlb_sync_map = s390_iommu_iotlb_sync_map, \
.iova_to_phys = s390_iommu_iova_to_phys, \
.free = s390_domain_free, \
}
static const struct iommu_ops s390_iommu_ops = {
S390_IOMMU_COMMON_OPS()
};
static const struct iommu_ops s390_iommu_rtr_ops = {
.identity_domain = &s390_identity_domain,
S390_IOMMU_COMMON_OPS()
};
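
As a worked illustration of the deeper walk added above, a minimal sketch (not part of the diff) of how calc_rfx()/calc_rsx()/calc_rtx()/calc_sx()/calc_px() slice one 64-bit IOVA, assuming PAGE_SHIFT = 12, ZPCI_PT_BITS = 8 and ZPCI_TABLE_BITS = 11 (so ZPCI_INDEX_MASK = 0x7ff and ZPCI_PT_MASK = 0xff):
```
#include <stdio.h>

int main(void)
{
	unsigned long long iova = 0x003fc0ffee012000ULL;	/* arbitrary example address */
	unsigned int idx_mask = 0x7ff;				/* ZPCI_INDEX_MASK */

	printf("rfx=%llu rsx=%llu rtx=%llu sx=%llu px=%llu\n",
	       (iova >> 53) & idx_mask,		/* calc_rfx(): ZPCI_RF_SHIFT = 53 */
	       (iova >> 42) & idx_mask,		/* calc_rsx(): ZPCI_RS_SHIFT = 42 */
	       (iova >> 31) & idx_mask,		/* calc_rtx(): ZPCI_RT_SHIFT = 31 */
	       (iova >> 20) & idx_mask,		/* calc_sx():  ZPCI_ST_SHIFT = 20 */
	       (iova >> 12) & 0xff);		/* calc_px():  ZPCI_PT_MASK  = 0xff */
	return 0;
}
```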