diff options
Diffstat (limited to 'drivers/infiniband/hw/mlx5')
| -rw-r--r-- | drivers/infiniband/hw/mlx5/Makefile | 1 | ||||
| -rw-r--r-- | drivers/infiniband/hw/mlx5/cmd.c | 21 | ||||
| -rw-r--r-- | drivers/infiniband/hw/mlx5/cmd.h | 2 | ||||
| -rw-r--r-- | drivers/infiniband/hw/mlx5/data_direct.c | 227 | ||||
| -rw-r--r-- | drivers/infiniband/hw/mlx5/data_direct.h | 23 | ||||
| -rw-r--r-- | drivers/infiniband/hw/mlx5/devx.c | 2 | ||||
| -rw-r--r-- | drivers/infiniband/hw/mlx5/ib_rep.c | 22 | ||||
| -rw-r--r-- | drivers/infiniband/hw/mlx5/main.c | 324 | ||||
| -rw-r--r-- | drivers/infiniband/hw/mlx5/mlx5_ib.h | 60 | ||||
| -rw-r--r-- | drivers/infiniband/hw/mlx5/mr.c | 418 | ||||
| -rw-r--r-- | drivers/infiniband/hw/mlx5/odp.c | 405 | ||||
| -rw-r--r-- | drivers/infiniband/hw/mlx5/std_types.c | 76 | ||||
| -rw-r--r-- | drivers/infiniband/hw/mlx5/umr.c | 96 | ||||
| -rw-r--r-- | drivers/infiniband/hw/mlx5/umr.h | 1 |
14 files changed, 1378 insertions, 300 deletions
diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index 72a526236c2e..b38961f5058e 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -6,6 +6,7 @@ mlx5_ib-y := ah.o \ cong.o \ counters.o \ cq.o \ + data_direct.o \ dm.o \ doorbell.o \ gsi.o \ diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index 895b62cc528d..7c08e3008927 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -245,3 +245,24 @@ int mlx5_cmd_uar_dealloc(struct mlx5_core_dev *dev, u32 uarn, u16 uid) MLX5_SET(dealloc_uar_in, in, uid, uid); return mlx5_cmd_exec_in(dev, dealloc_uar, in); } + +int mlx5_cmd_query_vuid(struct mlx5_core_dev *dev, bool data_direct, + char *out_vuid) +{ + u8 out[MLX5_ST_SZ_BYTES(query_vuid_out) + + MLX5_ST_SZ_BYTES(array1024_auto)] = {}; + u8 in[MLX5_ST_SZ_BYTES(query_vuid_in)] = {}; + char *vuid; + int err; + + MLX5_SET(query_vuid_in, in, opcode, MLX5_CMD_OPCODE_QUERY_VUID); + MLX5_SET(query_vuid_in, in, vhca_id, MLX5_CAP_GEN(dev, vhca_id)); + MLX5_SET(query_vuid_in, in, data_direct, data_direct); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + vuid = MLX5_ADDR_OF(query_vuid_out, out, vuid); + memcpy(out_vuid, vuid, MLX5_ST_SZ_BYTES(array1024_auto)); + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h index e5cd31270443..e6c88b6ebd0d 100644 --- a/drivers/infiniband/hw/mlx5/cmd.h +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -58,4 +58,6 @@ int mlx5_cmd_mad_ifc(struct mlx5_ib_dev *dev, const void *inb, void *outb, u16 opmod, u8 port); int mlx5_cmd_uar_alloc(struct mlx5_core_dev *dev, u32 *uarn, u16 uid); int mlx5_cmd_uar_dealloc(struct mlx5_core_dev *dev, u32 uarn, u16 uid); +int mlx5_cmd_query_vuid(struct mlx5_core_dev *dev, bool data_direct, + char *out_vuid); #endif /* MLX5_IB_CMD_H */ diff --git a/drivers/infiniband/hw/mlx5/data_direct.c b/drivers/infiniband/hw/mlx5/data_direct.c new file mode 100644 index 000000000000..b9ba84afaae2 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/data_direct.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#include "mlx5_ib.h" +#include "data_direct.h" + +static LIST_HEAD(mlx5_data_direct_dev_list); +static LIST_HEAD(mlx5_data_direct_reg_list); + +/* + * This mutex should be held when accessing either of the above lists + */ +static DEFINE_MUTEX(mlx5_data_direct_mutex); + +struct mlx5_data_direct_registration { + struct mlx5_ib_dev *ibdev; + char vuid[MLX5_ST_SZ_BYTES(array1024_auto) + 1]; + struct list_head list; +}; + +static const struct pci_device_id mlx5_data_direct_pci_table[] = { + { PCI_VDEVICE(MELLANOX, 0x2100) }, /* ConnectX-8 Data Direct */ + { 0, } +}; + +static int mlx5_data_direct_vpd_get_vuid(struct mlx5_data_direct_dev *dev) +{ + struct pci_dev *pdev = dev->pdev; + unsigned int vpd_size, kw_len; + u8 *vpd_data; + int start; + int ret; + + vpd_data = pci_vpd_alloc(pdev, &vpd_size); + if (IS_ERR(vpd_data)) { + pci_err(pdev, "Unable to read VPD, err=%ld\n", PTR_ERR(vpd_data)); + return PTR_ERR(vpd_data); + } + + start = pci_vpd_find_ro_info_keyword(vpd_data, vpd_size, "VU", &kw_len); + if (start < 0) { + ret = start; + pci_err(pdev, "VU keyword not found, err=%d\n", ret); + goto end; + } + + dev->vuid = kmemdup_nul(vpd_data + start, kw_len, GFP_KERNEL); + ret = dev->vuid ? 0 : -ENOMEM; + +end: + kfree(vpd_data); + return ret; +} + +static void mlx5_data_direct_shutdown(struct pci_dev *pdev) +{ + pci_disable_device(pdev); +} + +static int mlx5_data_direct_set_dma_caps(struct pci_dev *pdev) +{ + int err; + + err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, + "Warning: couldn't set 64-bit PCI DMA mask, err=%d\n", err); + err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Can't set PCI DMA mask, err=%d\n", err); + return err; + } + } + + dma_set_max_seg_size(&pdev->dev, SZ_2G); + return 0; +} + +int mlx5_data_direct_ib_reg(struct mlx5_ib_dev *ibdev, char *vuid) +{ + struct mlx5_data_direct_registration *reg; + struct mlx5_data_direct_dev *dev; + + reg = kzalloc(sizeof(*reg), GFP_KERNEL); + if (!reg) + return -ENOMEM; + + reg->ibdev = ibdev; + strcpy(reg->vuid, vuid); + + mutex_lock(&mlx5_data_direct_mutex); + list_for_each_entry(dev, &mlx5_data_direct_dev_list, list) { + if (strcmp(dev->vuid, vuid) == 0) { + mlx5_ib_data_direct_bind(ibdev, dev); + break; + } + } + + /* Add the registration to its global list, to be used upon bind/unbind + * of its affiliated data direct device + */ + list_add_tail(®->list, &mlx5_data_direct_reg_list); + mutex_unlock(&mlx5_data_direct_mutex); + return 0; +} + +void mlx5_data_direct_ib_unreg(struct mlx5_ib_dev *ibdev) +{ + struct mlx5_data_direct_registration *reg; + + mutex_lock(&mlx5_data_direct_mutex); + list_for_each_entry(reg, &mlx5_data_direct_reg_list, list) { + if (reg->ibdev == ibdev) { + list_del(®->list); + kfree(reg); + goto end; + } + } + + WARN_ON(true); +end: + mutex_unlock(&mlx5_data_direct_mutex); +} + +static void mlx5_data_direct_dev_reg(struct mlx5_data_direct_dev *dev) +{ + struct mlx5_data_direct_registration *reg; + + mutex_lock(&mlx5_data_direct_mutex); + list_for_each_entry(reg, &mlx5_data_direct_reg_list, list) { + if (strcmp(dev->vuid, reg->vuid) == 0) + mlx5_ib_data_direct_bind(reg->ibdev, dev); + } + + /* Add the data direct device to the global list, further IB devices may + * use it later as well + */ + list_add_tail(&dev->list, &mlx5_data_direct_dev_list); + mutex_unlock(&mlx5_data_direct_mutex); +} + +static void mlx5_data_direct_dev_unreg(struct mlx5_data_direct_dev *dev) +{ + struct mlx5_data_direct_registration *reg; + + mutex_lock(&mlx5_data_direct_mutex); + /* Prevent any further affiliations */ + list_del(&dev->list); + list_for_each_entry(reg, &mlx5_data_direct_reg_list, list) { + if (strcmp(dev->vuid, reg->vuid) == 0) + mlx5_ib_data_direct_unbind(reg->ibdev); + } + mutex_unlock(&mlx5_data_direct_mutex); +} + +static int mlx5_data_direct_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct mlx5_data_direct_dev *dev; + int err; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + + dev->device = &pdev->dev; + dev->pdev = pdev; + + pci_set_drvdata(dev->pdev, dev); + err = pci_enable_device(pdev); + if (err) { + dev_err(dev->device, "Cannot enable PCI device, err=%d\n", err); + goto err; + } + + pci_set_master(pdev); + err = mlx5_data_direct_set_dma_caps(pdev); + if (err) + goto err_disable; + + if (pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP32) && + pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP64) && + pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP128)) + dev_dbg(dev->device, "Enabling pci atomics failed\n"); + + err = mlx5_data_direct_vpd_get_vuid(dev); + if (err) + goto err_disable; + + mlx5_data_direct_dev_reg(dev); + return 0; + +err_disable: + pci_disable_device(pdev); +err: + kfree(dev); + return err; +} + +static void mlx5_data_direct_remove(struct pci_dev *pdev) +{ + struct mlx5_data_direct_dev *dev = pci_get_drvdata(pdev); + + mlx5_data_direct_dev_unreg(dev); + pci_disable_device(pdev); + kfree(dev->vuid); + kfree(dev); +} + +static struct pci_driver mlx5_data_direct_driver = { + .name = KBUILD_MODNAME, + .id_table = mlx5_data_direct_pci_table, + .probe = mlx5_data_direct_probe, + .remove = mlx5_data_direct_remove, + .shutdown = mlx5_data_direct_shutdown, +}; + +int mlx5_data_direct_driver_register(void) +{ + return pci_register_driver(&mlx5_data_direct_driver); +} + +void mlx5_data_direct_driver_unregister(void) +{ + pci_unregister_driver(&mlx5_data_direct_driver); +} diff --git a/drivers/infiniband/hw/mlx5/data_direct.h b/drivers/infiniband/hw/mlx5/data_direct.h new file mode 100644 index 000000000000..2fd2bdbe8f69 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/data_direct.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#ifndef _MLX5_IB_DATA_DIRECT_H +#define _MLX5_IB_DATA_DIRECT_H + +struct mlx5_ib_dev; + +struct mlx5_data_direct_dev { + struct device *device; + struct pci_dev *pdev; + char *vuid; + struct list_head list; +}; + +int mlx5_data_direct_ib_reg(struct mlx5_ib_dev *ibdev, char *vuid); +void mlx5_data_direct_ib_unreg(struct mlx5_ib_dev *ibdev); +int mlx5_data_direct_driver_register(void); +void mlx5_data_direct_driver_unregister(void); + +#endif diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 253fea374a72..69999d8d24f3 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -2673,7 +2673,6 @@ static const struct file_operations devx_async_cmd_event_fops = { .read = devx_async_cmd_event_read, .poll = devx_async_cmd_event_poll, .release = uverbs_uobject_fd_release, - .llseek = no_llseek, }; static ssize_t devx_async_event_read(struct file *filp, char __user *buf, @@ -2788,7 +2787,6 @@ static const struct file_operations devx_async_event_fops = { .read = devx_async_event_read, .poll = devx_async_event_poll, .release = uverbs_uobject_fd_release, - .llseek = no_llseek, }; static void devx_async_cmd_event_destroy_uobj(struct ib_uobject *uobj, diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index c7a4ee896121..49af1cfbe6d1 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -13,6 +13,7 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, int vport_index) { struct mlx5_ib_dev *ibdev; + struct net_device *ndev; ibdev = mlx5_eswitch_uplink_get_proto_dev(dev->priv.eswitch, REP_IB); if (!ibdev) @@ -20,12 +21,9 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, ibdev->port[vport_index].rep = rep; rep->rep_data[REP_IB].priv = ibdev; - write_lock(&ibdev->port[vport_index].roce.netdev_lock); - ibdev->port[vport_index].roce.netdev = - mlx5_ib_get_rep_netdev(rep->esw, rep->vport); - write_unlock(&ibdev->port[vport_index].roce.netdev_lock); + ndev = mlx5_ib_get_rep_netdev(rep->esw, rep->vport); - return 0; + return ib_device_set_netdev(&ibdev->ib_dev, ndev, vport_index + 1); } static void mlx5_ib_register_peer_vport_reps(struct mlx5_core_dev *mdev); @@ -104,10 +102,15 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) ibdev->is_rep = true; vport_index = rep->vport_index; ibdev->port[vport_index].rep = rep; - ibdev->port[vport_index].roce.netdev = - mlx5_ib_get_rep_netdev(lag_master->priv.eswitch, rep->vport); ibdev->mdev = lag_master; ibdev->num_ports = num_ports; + ibdev->ib_dev.phys_port_cnt = num_ports; + ret = ib_device_set_netdev(&ibdev->ib_dev, + mlx5_ib_get_rep_netdev(lag_master->priv.eswitch, + rep->vport), + vport_index + 1); + if (ret) + goto fail_add; ret = __mlx5_ib_add(ibdev, profile); if (ret) @@ -160,9 +163,8 @@ mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep) } port = &dev->port[vport_index]; - write_lock(&port->roce.netdev_lock); - port->roce.netdev = NULL; - write_unlock(&port->roce.netdev_lock); + + ib_device_set_netdev(&dev->ib_dev, NULL, vport_index + 1); rep->rep_data[REP_IB].priv = NULL; port->rep = NULL; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 6048b9ad13bb..4999239c8f41 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -48,6 +48,7 @@ #include <rdma/mlx5_user_ioctl_verbs.h> #include <rdma/mlx5_user_ioctl_cmds.h> #include "macsec.h" +#include "data_direct.h" #define UVERBS_MODULE_NAME mlx5_ib #include <rdma/uverbs_named_ioctl.h> @@ -146,16 +147,52 @@ static struct mlx5_roce *mlx5_get_rep_roce(struct mlx5_ib_dev *dev, if (upper && port->rep->vport == MLX5_VPORT_UPLINK) continue; - - read_lock(&port->roce.netdev_lock); - rep_ndev = mlx5_ib_get_rep_netdev(port->rep->esw, - port->rep->vport); - if (rep_ndev == ndev) { - read_unlock(&port->roce.netdev_lock); + rep_ndev = ib_device_get_netdev(&dev->ib_dev, i + 1); + if (rep_ndev && rep_ndev == ndev) { + dev_put(rep_ndev); *port_num = i + 1; return &port->roce; } - read_unlock(&port->roce.netdev_lock); + + dev_put(rep_ndev); + } + + return NULL; +} + +static bool mlx5_netdev_send_event(struct mlx5_ib_dev *dev, + struct net_device *ndev, + struct net_device *upper, + struct net_device *ib_ndev) +{ + if (!dev->ib_active) + return false; + + /* Event is about our upper device */ + if (upper == ndev) + return true; + + /* RDMA device is not in lag and not in switchdev */ + if (!dev->is_rep && !upper && ndev == ib_ndev) + return true; + + /* RDMA devie is in switchdev */ + if (dev->is_rep && ndev == ib_ndev) + return true; + + return false; +} + +static struct net_device *mlx5_ib_get_rep_uplink_netdev(struct mlx5_ib_dev *ibdev) +{ + struct mlx5_ib_port *port; + int i; + + for (i = 0; i < ibdev->num_ports; i++) { + port = &ibdev->port[i]; + if (port->rep && port->rep->vport == MLX5_VPORT_UPLINK) { + return ib_device_get_netdev(&ibdev->ib_dev, i + 1); + } } return NULL; @@ -167,6 +204,7 @@ static int mlx5_netdev_event(struct notifier_block *this, struct mlx5_roce *roce = container_of(this, struct mlx5_roce, nb); struct net_device *ndev = netdev_notifier_info_to_dev(ptr); u32 port_num = roce->native_port_num; + struct net_device *ib_ndev = NULL; struct mlx5_core_dev *mdev; struct mlx5_ib_dev *ibdev; @@ -180,47 +218,63 @@ static int mlx5_netdev_event(struct notifier_block *this, /* Should already be registered during the load */ if (ibdev->is_rep) break; - write_lock(&roce->netdev_lock); + + ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num); + /* Exit if already registered */ + if (ib_ndev) + goto put_ndev; + if (ndev->dev.parent == mdev->device) - roce->netdev = ndev; - write_unlock(&roce->netdev_lock); + ib_device_set_netdev(&ibdev->ib_dev, ndev, port_num); break; case NETDEV_UNREGISTER: /* In case of reps, ib device goes away before the netdevs */ - write_lock(&roce->netdev_lock); - if (roce->netdev == ndev) - roce->netdev = NULL; - write_unlock(&roce->netdev_lock); - break; + if (ibdev->is_rep) + break; + ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num); + if (ib_ndev == ndev) + ib_device_set_netdev(&ibdev->ib_dev, NULL, port_num); + goto put_ndev; case NETDEV_CHANGE: case NETDEV_UP: case NETDEV_DOWN: { - struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(mdev); struct net_device *upper = NULL; - if (lag_ndev) { - upper = netdev_master_upper_dev_get(lag_ndev); - dev_put(lag_ndev); + if (mlx5_lag_is_roce(mdev) || mlx5_lag_is_sriov(mdev)) { + struct net_device *lag_ndev; + + if(mlx5_lag_is_roce(mdev)) + lag_ndev = ib_device_get_netdev(&ibdev->ib_dev, 1); + else /* sriov lag */ + lag_ndev = mlx5_ib_get_rep_uplink_netdev(ibdev); + + if (lag_ndev) { + upper = netdev_master_upper_dev_get(lag_ndev); + dev_put(lag_ndev); + } else { + goto done; + } } if (ibdev->is_rep) roce = mlx5_get_rep_roce(ibdev, ndev, upper, &port_num); if (!roce) return NOTIFY_DONE; - if ((upper == ndev || - ((!upper || ibdev->is_rep) && ndev == roce->netdev)) && - ibdev->ib_active) { + + ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num); + + if (mlx5_netdev_send_event(ibdev, ndev, upper, ib_ndev)) { struct ib_event ibev = { }; enum ib_port_state port_state; if (get_port_state(&ibdev->ib_dev, port_num, &port_state)) - goto done; + goto put_ndev; if (roce->last_port_state == port_state) - goto done; + goto put_ndev; roce->last_port_state = port_state; ibev.device = &ibdev->ib_dev; @@ -229,7 +283,7 @@ static int mlx5_netdev_event(struct notifier_block *this, else if (port_state == IB_PORT_ACTIVE) ibev.event = IB_EVENT_PORT_ACTIVE; else - goto done; + goto put_ndev; ibev.element.port_num = port_num; ib_dispatch_event(&ibev); @@ -240,38 +294,13 @@ static int mlx5_netdev_event(struct notifier_block *this, default: break; } +put_ndev: + dev_put(ib_ndev); done: mlx5_ib_put_native_port_mdev(ibdev, port_num); return NOTIFY_DONE; } -static struct net_device *mlx5_ib_get_netdev(struct ib_device *device, - u32 port_num) -{ - struct mlx5_ib_dev *ibdev = to_mdev(device); - struct net_device *ndev; - struct mlx5_core_dev *mdev; - - mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL); - if (!mdev) - return NULL; - - ndev = mlx5_lag_get_roce_netdev(mdev); - if (ndev) - goto out; - - /* Ensure ndev does not disappear before we invoke dev_hold() - */ - read_lock(&ibdev->port[port_num - 1].roce.netdev_lock); - ndev = ibdev->port[port_num - 1].roce.netdev; - dev_hold(ndev); - read_unlock(&ibdev->port[port_num - 1].roce.netdev_lock); - -out: - mlx5_ib_put_native_port_mdev(ibdev, port_num); - return ndev; -} - struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev, u32 ib_port_num, u32 *native_port_num) @@ -546,11 +575,11 @@ static int mlx5_query_port_roce(struct ib_device *device, u32 port_num, if (!put_mdev) goto out; - ndev = mlx5_ib_get_netdev(device, port_num); + ndev = ib_device_get_netdev(device, port_num); if (!ndev) goto out; - if (dev->lag_active) { + if (mlx5_lag_is_roce(mdev) || mlx5_lag_is_sriov(mdev)) { rcu_read_lock(); upper = netdev_master_upper_dev_get_rcu(ndev); if (upper) { @@ -3024,6 +3053,59 @@ static void mlx5_ib_dev_res_cleanup(struct mlx5_ib_dev *dev) mutex_destroy(&devr->srq_lock); } +static int +mlx5_ib_create_data_direct_resources(struct mlx5_ib_dev *dev) +{ + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + struct mlx5_core_dev *mdev = dev->mdev; + void *mkc; + u32 mkey; + u32 pdn; + u32 *in; + int err; + + err = mlx5_core_alloc_pd(mdev, &pdn); + if (err) + return err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err; + } + + MLX5_SET(create_mkey_in, in, data_direct, 1); + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, rw, 1); + MLX5_SET(mkc, mkc, rr, 1); + MLX5_SET(mkc, mkc, a, 1); + MLX5_SET(mkc, mkc, pd, pdn); + MLX5_SET(mkc, mkc, length64, 1); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + err = mlx5_core_create_mkey(mdev, &mkey, in, inlen); + kvfree(in); + if (err) + goto err; + + dev->ddr.mkey = mkey; + dev->ddr.pdn = pdn; + return 0; + +err: + mlx5_core_dealloc_pd(mdev, pdn); + return err; +} + +static void +mlx5_ib_free_data_direct_resources(struct mlx5_ib_dev *dev) +{ + mlx5_core_destroy_mkey(dev->mdev, dev->ddr.mkey); + mlx5_core_dealloc_pd(dev->mdev, dev->ddr.pdn); +} + static u32 get_core_cap_flags(struct ib_device *ibdev, struct mlx5_hca_vport_context *rep) { @@ -3124,6 +3206,60 @@ static void get_dev_fw_str(struct ib_device *ibdev, char *str) fw_rev_sub(dev->mdev)); } +static int lag_event(struct notifier_block *nb, unsigned long event, void *data) +{ + struct mlx5_ib_dev *dev = container_of(nb, struct mlx5_ib_dev, + lag_events); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_port *port; + struct net_device *ndev; + int i, err; + int portnum; + + portnum = 0; + switch (event) { + case MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE: + ndev = data; + if (ndev) { + if (!mlx5_lag_is_roce(mdev)) { + // sriov lag + for (i = 0; i < dev->num_ports; i++) { + port = &dev->port[i]; + if (port->rep && port->rep->vport == + MLX5_VPORT_UPLINK) { + portnum = i; + break; + } + } + } + err = ib_device_set_netdev(&dev->ib_dev, ndev, + portnum + 1); + dev_put(ndev); + if (err) + return err; + /* Rescan gids after new netdev assignment */ + rdma_roce_rescan_device(&dev->ib_dev); + } + break; + default: + return NOTIFY_DONE; + } + return NOTIFY_OK; +} + +static void mlx5e_lag_event_register(struct mlx5_ib_dev *dev) +{ + dev->lag_events.notifier_call = lag_event; + blocking_notifier_chain_register(&dev->mdev->priv.lag_nh, + &dev->lag_events); +} + +static void mlx5e_lag_event_unregister(struct mlx5_ib_dev *dev) +{ + blocking_notifier_chain_unregister(&dev->mdev->priv.lag_nh, + &dev->lag_events); +} + static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev) { struct mlx5_core_dev *mdev = dev->mdev; @@ -3145,6 +3281,7 @@ static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev) goto err_destroy_vport_lag; } + mlx5e_lag_event_register(dev); dev->flow_db->lag_demux_ft = ft; dev->lag_ports = mlx5_lag_get_num_ports(mdev); dev->lag_active = true; @@ -3162,6 +3299,7 @@ static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev) if (dev->lag_active) { dev->lag_active = false; + mlx5e_lag_event_unregister(dev); mlx5_destroy_flow_table(dev->flow_db->lag_demux_ft); dev->flow_db->lag_demux_ft = NULL; @@ -3420,6 +3558,41 @@ unbind: return false; } +static int mlx5_ib_data_direct_init(struct mlx5_ib_dev *dev) +{ + char vuid[MLX5_ST_SZ_BYTES(array1024_auto) + 1] = {}; + int ret; + + if (!MLX5_CAP_GEN(dev->mdev, data_direct) || + !MLX5_CAP_GEN_2(dev->mdev, query_vuid)) + return 0; + + ret = mlx5_cmd_query_vuid(dev->mdev, true, vuid); + if (ret) + return ret; + + ret = mlx5_ib_create_data_direct_resources(dev); + if (ret) + return ret; + + INIT_LIST_HEAD(&dev->data_direct_mr_list); + ret = mlx5_data_direct_ib_reg(dev, vuid); + if (ret) + mlx5_ib_free_data_direct_resources(dev); + + return ret; +} + +static void mlx5_ib_data_direct_cleanup(struct mlx5_ib_dev *dev) +{ + if (!MLX5_CAP_GEN(dev->mdev, data_direct) || + !MLX5_CAP_GEN_2(dev->mdev, query_vuid)) + return; + + mlx5_data_direct_ib_unreg(dev); + mlx5_ib_free_data_direct_resources(dev); +} + static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev) { u32 port_num = mlx5_core_native_port_num(dev->mdev) - 1; @@ -3796,6 +3969,14 @@ ADD_UVERBS_ATTRIBUTES_SIMPLE( dump_fill_mkey), UA_MANDATORY)); +ADD_UVERBS_ATTRIBUTES_SIMPLE( + mlx5_ib_reg_dmabuf_mr, + UVERBS_OBJECT_MR, + UVERBS_METHOD_REG_DMABUF_MR, + UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS, + enum mlx5_ib_uapi_reg_dmabuf_flags, + UA_OPTIONAL)); + static const struct uapi_definition mlx5_ib_defs[] = { UAPI_DEF_CHAIN(mlx5_ib_devx_defs), UAPI_DEF_CHAIN(mlx5_ib_flow_defs), @@ -3805,6 +3986,7 @@ static const struct uapi_definition mlx5_ib_defs[] = { UAPI_DEF_CHAIN(mlx5_ib_create_cq_defs), UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DEVICE, &mlx5_ib_query_context), + UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_MR, &mlx5_ib_reg_dmabuf_mr), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_VAR, UAPI_DEF_IS_OBJ_SUPPORTED(var_is_supported)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_UAR), @@ -3813,6 +3995,7 @@ static const struct uapi_definition mlx5_ib_defs[] = { static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) { + mlx5_ib_data_direct_cleanup(dev); mlx5_ib_cleanup_multiport_master(dev); WARN_ON(!xa_empty(&dev->odp_mkeys)); mutex_destroy(&dev->cap_mask_mutex); @@ -3828,13 +4011,11 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) dev->ib_dev.node_type = RDMA_NODE_IB_CA; dev->ib_dev.local_dma_lkey = 0 /* not supported for now */; - dev->ib_dev.phys_port_cnt = dev->num_ports; dev->ib_dev.dev.parent = mdev->device; dev->ib_dev.lag_flags = RDMA_LAG_FLAGS_HASH_ALL_SLAVES; for (i = 0; i < dev->num_ports; i++) { spin_lock_init(&dev->port[i].mp.mpi_lock); - rwlock_init(&dev->port[i].roce.netdev_lock); dev->port[i].roce.dev = dev; dev->port[i].roce.native_port_num = i + 1; dev->port[i].roce.last_port_state = IB_PORT_DOWN; @@ -3866,6 +4047,7 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) dev->ib_dev.num_comp_vectors = mlx5_comp_vectors_max(mdev); mutex_init(&dev->cap_mask_mutex); + mutex_init(&dev->data_direct_lock); INIT_LIST_HEAD(&dev->qp_list); spin_lock_init(&dev->reset_flow_resource_lock); xa_init(&dev->odp_mkeys); @@ -3874,6 +4056,10 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) spin_lock_init(&dev->dm.lock); dev->dm.dev = mdev; + err = mlx5_ib_data_direct_init(dev); + if (err) + goto err_mp; + return 0; err_mp: mlx5_ib_cleanup_multiport_master(dev); @@ -4094,7 +4280,6 @@ static const struct ib_device_ops mlx5_ib_dev_common_roce_ops = { .create_wq = mlx5_ib_create_wq, .destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table, .destroy_wq = mlx5_ib_destroy_wq, - .get_netdev = mlx5_ib_get_netdev, .modify_wq = mlx5_ib_modify_wq, INIT_RDMA_OBJ_SIZE(ib_rwq_ind_table, mlx5_ib_rwq_ind_table, @@ -4293,6 +4478,22 @@ static void mlx5_ib_stage_dev_notifier_cleanup(struct mlx5_ib_dev *dev) mlx5_notifier_unregister(dev->mdev, &dev->mdev_events); } +void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev, + struct mlx5_data_direct_dev *dev) +{ + mutex_lock(&ibdev->data_direct_lock); + ibdev->data_direct_dev = dev; + mutex_unlock(&ibdev->data_direct_lock); +} + +void mlx5_ib_data_direct_unbind(struct mlx5_ib_dev *ibdev) +{ + mutex_lock(&ibdev->data_direct_lock); + mlx5_ib_revoke_data_direct_mrs(ibdev); + ibdev->data_direct_dev = NULL; + mutex_unlock(&ibdev->data_direct_lock); +} + void __mlx5_ib_remove(struct mlx5_ib_dev *dev, const struct mlx5_ib_profile *profile, int stage) @@ -4522,6 +4723,7 @@ static struct ib_device *mlx5_ib_add_sub_dev(struct ib_device *parent, mplane->mdev = mparent->mdev; mplane->num_ports = mparent->num_plane; mplane->sub_dev_name = name; + mplane->ib_dev.phys_port_cnt = mplane->num_ports; ret = __mlx5_ib_add(mplane, &plane_profile); if (ret) @@ -4638,6 +4840,7 @@ static int mlx5r_probe(struct auxiliary_device *adev, dev->mdev = mdev; dev->num_ports = num_ports; + dev->ib_dev.phys_port_cnt = num_ports; if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_get_roce_state(mdev)) profile = &raw_eth_profile; @@ -4715,17 +4918,23 @@ static int __init mlx5_ib_init(void) ret = mlx5r_rep_init(); if (ret) goto rep_err; + ret = mlx5_data_direct_driver_register(); + if (ret) + goto dd_err; ret = auxiliary_driver_register(&mlx5r_mp_driver); if (ret) goto mp_err; ret = auxiliary_driver_register(&mlx5r_driver); if (ret) goto drv_err; + return 0; drv_err: auxiliary_driver_unregister(&mlx5r_mp_driver); mp_err: + mlx5_data_direct_driver_unregister(); +dd_err: mlx5r_rep_cleanup(); rep_err: mlx5_ib_qp_event_cleanup(); @@ -4737,6 +4946,7 @@ qp_event_err: static void __exit mlx5_ib_cleanup(void) { + mlx5_data_direct_driver_unregister(); auxiliary_driver_unregister(&mlx5r_driver); auxiliary_driver_unregister(&mlx5r_mp_driver); mlx5r_rep_cleanup(); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index d5eb1b726675..23fd72f7f63d 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -63,17 +63,6 @@ __mlx5_log_page_size_to_bitmap(unsigned int log_pgsz_bits, return GENMASK(largest_pg_shift, pgsz_shift); } -/* - * For mkc users, instead of a page_offset the command has a start_iova which - * specifies both the page_offset and the on-the-wire IOVA - */ -#define mlx5_umem_find_best_pgsz(umem, typ, log_pgsz_fld, pgsz_shift, iova) \ - ib_umem_find_best_pgsz(umem, \ - __mlx5_log_page_size_to_bitmap( \ - __mlx5_bit_sz(typ, log_pgsz_fld), \ - pgsz_shift), \ - iova) - static __always_inline unsigned long __mlx5_page_offset_to_bitmask(unsigned int page_offset_bits, unsigned int offset_shift) @@ -640,6 +629,8 @@ enum mlx5_mkey_type { MLX5_MKEY_MR = 1, MLX5_MKEY_MW, MLX5_MKEY_INDIRECT_DEVX, + MLX5_MKEY_NULL, + MLX5_MKEY_IMPLICIT_CHILD, }; struct mlx5r_cache_rb_key { @@ -682,6 +673,8 @@ struct mlx5_ib_mr { struct mlx5_ib_mkey mmkey; struct ib_umem *umem; + /* The mr is data direct related */ + u8 data_direct :1; union { /* Used only by kernel MRs (umem == NULL) */ @@ -719,6 +712,11 @@ struct mlx5_ib_mr { } odp_destroy; struct ib_odp_counters odp_stats; bool is_odp_implicit; + /* The affilated data direct crossed mr */ + struct mlx5_ib_mr *dd_crossed_mr; + struct list_head dd_node; + u8 revoked :1; + struct mlx5_ib_mkey null_mmkey; }; }; }; @@ -796,6 +794,7 @@ struct mlx5_cache_ent { u8 is_tmp:1; u8 disabled:1; u8 fill_to_high_water:1; + u8 tmp_cleanup_scheduled:1; /* * - limit is the low water mark for stored mkeys, 2* limit is the @@ -827,7 +826,6 @@ struct mlx5_mkey_cache { struct mutex rb_lock; struct dentry *fs_root; unsigned long last_add; - struct delayed_work remove_ent_dwork; }; struct mlx5_ib_port_resources { @@ -835,6 +833,11 @@ struct mlx5_ib_port_resources { struct work_struct pkey_change_work; }; +struct mlx5_data_direct_resources { + u32 pdn; + u32 mkey; +}; + struct mlx5_ib_resources { struct ib_cq *c0; struct mutex cq_lock; @@ -885,8 +888,6 @@ struct mlx5_roce { /* Protect mlx5_ib_get_netdev from invoking dev_hold() with a NULL * netdev pointer */ - rwlock_t netdev_lock; - struct net_device *netdev; struct notifier_block nb; struct netdev_net_notifier nn; struct notifier_block mdev_nb; @@ -1131,7 +1132,11 @@ struct mlx5_macsec { struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; + struct mlx5_data_direct_dev *data_direct_dev; + /* protect accessing data_direct_dev */ + struct mutex data_direct_lock; struct notifier_block mdev_events; + struct notifier_block lag_events; int num_ports; /* serialize update of capability mask */ @@ -1161,6 +1166,7 @@ struct mlx5_ib_dev { /* protect resources needed as part of reset flow */ spinlock_t reset_flow_resource_lock; struct list_head qp_list; + struct list_head data_direct_mr_list; /* Array with num_ports elements */ struct mlx5_ib_port *port; struct mlx5_sq_bfreg bfreg; @@ -1185,6 +1191,7 @@ struct mlx5_ib_dev { u16 pkey_table_len; u8 lag_ports; struct mlx5_special_mkeys mkeys; + struct mlx5_data_direct_resources ddr; #ifdef CONFIG_MLX5_MACSEC struct mlx5_macsec macsec; @@ -1345,7 +1352,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int fd, int access_flags, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int mlx5_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, u32 flags, @@ -1356,7 +1363,6 @@ int mlx5_ib_alloc_mw(struct ib_mw *mw, struct ib_udata *udata); int mlx5_ib_dealloc_mw(struct ib_mw *mw); struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, int access_flags); -void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr); void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr); struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, u64 length, u64 virt_addr, int access_flags, @@ -1425,6 +1431,10 @@ int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, struct ib_dm_mr_attr *attr, struct uverbs_attr_bundle *attrs); +void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev, + struct mlx5_data_direct_dev *dev); +void mlx5_ib_data_direct_unbind(struct mlx5_ib_dev *ibdev); +void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev); #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); @@ -1633,8 +1643,6 @@ static inline void mlx5r_deref_wait_odp_mkey(struct mlx5_ib_mkey *mmkey) wait_event(mmkey->wait, refcount_read(&mmkey->usecount) == 0); } -int mlx5_ib_test_wc(struct mlx5_ib_dev *dev); - static inline bool mlx5_ib_lag_should_assign_affinity(struct mlx5_ib_dev *dev) { /* @@ -1707,4 +1715,20 @@ static inline u32 smi_to_native_portnum(struct mlx5_ib_dev *dev, u32 port) return (port - 1) / dev->num_ports + 1; } +/* + * For mkc users, instead of a page_offset the command has a start_iova which + * specifies both the page_offset and the on-the-wire IOVA + */ +static __always_inline unsigned long +mlx5_umem_mkc_find_best_pgsz(struct mlx5_ib_dev *dev, struct ib_umem *umem, + u64 iova) +{ + int page_size_bits = + MLX5_CAP_GEN_2(dev->mdev, umr_log_entity_size_5) ? 6 : 5; + unsigned long bitmap = + __mlx5_log_page_size_to_bitmap(page_size_bits, 0); + + return ib_umem_find_best_pgsz(umem, bitmap, iova); +} + #endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 98bd8eaa393e..45d9dc9c6c8f 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -43,18 +43,22 @@ #include "dm.h" #include "mlx5_ib.h" #include "umr.h" +#include "data_direct.h" enum { MAX_PENDING_REG_MR = 8, }; +#define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4 #define MLX5_UMR_ALIGN 2048 static void create_mkey_callback(int status, struct mlx5_async_work *context); static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, u64 iova, int access_flags, - unsigned int page_size, bool populate); + unsigned int page_size, bool populate, + int access_mode); +static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr); static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr, struct ib_pd *pd) @@ -211,9 +215,9 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context) spin_lock_irqsave(&ent->mkeys_queue.lock, flags); push_mkey_locked(ent, mkey_out->mkey); + ent->pending--; /* If we are doing fill_to_high_water then keep going. */ queue_adjust_cache_locked(ent); - ent->pending--; spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags); kfree(mkey_out); } @@ -527,6 +531,21 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent) } } +static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent) +{ + u32 mkey; + + spin_lock_irq(&ent->mkeys_queue.lock); + while (ent->mkeys_queue.ci) { + mkey = pop_mkey_locked(ent); + spin_unlock_irq(&ent->mkeys_queue.lock); + mlx5_core_destroy_mkey(dev->mdev, mkey); + spin_lock_irq(&ent->mkeys_queue.lock); + } + ent->tmp_cleanup_scheduled = false; + spin_unlock_irq(&ent->mkeys_queue.lock); +} + static void __cache_work_func(struct mlx5_cache_ent *ent) { struct mlx5_ib_dev *dev = ent->dev; @@ -598,7 +617,11 @@ static void delayed_cache_work_func(struct work_struct *work) struct mlx5_cache_ent *ent; ent = container_of(work, struct mlx5_cache_ent, dwork.work); - __cache_work_func(ent); + /* temp entries are never filled, only cleaned */ + if (ent->is_tmp) + clean_keys(ent->dev, ent); + else + __cache_work_func(ent); } static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1, @@ -659,6 +682,7 @@ mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev, { struct rb_node *node = dev->cache.rb_root.rb_node; struct mlx5_cache_ent *cur, *smallest = NULL; + u64 ndescs_limit; int cmp; /* @@ -677,10 +701,18 @@ mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev, return cur; } + /* + * Limit the usage of mkeys larger than twice the required size while + * also allowing the usage of smallest cache entry for small MRs. + */ + ndescs_limit = max_t(u64, rb_key.ndescs * 2, + MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS); + return (smallest && smallest->rb_key.access_mode == rb_key.access_mode && smallest->rb_key.access_flags == rb_key.access_flags && - smallest->rb_key.ats == rb_key.ats) ? + smallest->rb_key.ats == rb_key.ats && + smallest->rb_key.ndescs <= ndescs_limit) ? smallest : NULL; } @@ -765,21 +797,6 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, return _mlx5_mr_cache_alloc(dev, ent, access_flags); } -static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent) -{ - u32 mkey; - - cancel_delayed_work(&ent->dwork); - spin_lock_irq(&ent->mkeys_queue.lock); - while (ent->mkeys_queue.ci) { - mkey = pop_mkey_locked(ent); - spin_unlock_irq(&ent->mkeys_queue.lock); - mlx5_core_destroy_mkey(dev->mdev, mkey); - spin_lock_irq(&ent->mkeys_queue.lock); - } - spin_unlock_irq(&ent->mkeys_queue.lock); -} - static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) { if (!mlx5_debugfs_root || dev->is_rep) @@ -892,10 +909,6 @@ mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev, ent->limit = 0; mlx5_mkey_cache_debugfs_add_ent(dev, ent); - } else { - mod_delayed_work(ent->dev->cache.wq, - &ent->dev->cache.remove_ent_dwork, - msecs_to_jiffies(30 * 1000)); } return ent; @@ -906,35 +919,6 @@ mkeys_err: return ERR_PTR(ret); } -static void remove_ent_work_func(struct work_struct *work) -{ - struct mlx5_mkey_cache *cache; - struct mlx5_cache_ent *ent; - struct rb_node *cur; - - cache = container_of(work, struct mlx5_mkey_cache, - remove_ent_dwork.work); - mutex_lock(&cache->rb_lock); - cur = rb_last(&cache->rb_root); - while (cur) { - ent = rb_entry(cur, struct mlx5_cache_ent, node); - cur = rb_prev(cur); - mutex_unlock(&cache->rb_lock); - - spin_lock_irq(&ent->mkeys_queue.lock); - if (!ent->is_tmp) { - spin_unlock_irq(&ent->mkeys_queue.lock); - mutex_lock(&cache->rb_lock); - continue; - } - spin_unlock_irq(&ent->mkeys_queue.lock); - - clean_keys(ent->dev, ent); - mutex_lock(&cache->rb_lock); - } - mutex_unlock(&cache->rb_lock); -} - int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) { struct mlx5_mkey_cache *cache = &dev->cache; @@ -950,7 +934,6 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) mutex_init(&dev->slow_path_mutex); mutex_init(&dev->cache.rb_lock); dev->cache.rb_root = RB_ROOT; - INIT_DELAYED_WORK(&dev->cache.remove_ent_dwork, remove_ent_work_func); cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM); if (!cache->wq) { mlx5_ib_warn(dev, "failed to create work queue\n"); @@ -962,7 +945,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) mlx5_mkey_cache_debugfs_init(dev); mutex_lock(&cache->rb_lock); for (i = 0; i <= mkey_cache_max_order(dev); i++) { - rb_key.ndescs = 1 << (i + 2); + rb_key.ndescs = MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS << i; ent = mlx5r_cache_create_ent_locked(dev, rb_key, true); if (IS_ERR(ent)) { ret = PTR_ERR(ent); @@ -1001,7 +984,6 @@ void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev) return; mutex_lock(&dev->cache.rb_lock); - cancel_delayed_work(&dev->cache.remove_ent_dwork); for (node = rb_first(root); node; node = rb_next(node)) { ent = rb_entry(node, struct mlx5_cache_ent, node); spin_lock_irq(&ent->mkeys_queue.lock); @@ -1062,6 +1044,7 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) MLX5_SET(mkc, mkc, length64, 1); set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0, pd); + MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats)); err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); if (err) @@ -1126,12 +1109,10 @@ static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem, static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, struct ib_umem *umem, u64 iova, - int access_flags) + int access_flags, int access_mode) { - struct mlx5r_cache_rb_key rb_key = { - .access_mode = MLX5_MKC_ACCESS_MODE_MTT, - }; struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5r_cache_rb_key rb_key = {}; struct mlx5_cache_ent *ent; struct mlx5_ib_mr *mr; unsigned int page_size; @@ -1139,11 +1120,11 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, if (umem->is_dmabuf) page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova); else - page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size, - 0, iova); + page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova); if (WARN_ON(!page_size)) return ERR_PTR(-EINVAL); + rb_key.access_mode = access_mode; rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size); rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags); rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags); @@ -1154,7 +1135,7 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, */ if (!ent) { mutex_lock(&dev->slow_path_mutex); - mr = reg_create(pd, umem, iova, access_flags, page_size, false); + mr = reg_create(pd, umem, iova, access_flags, page_size, false, access_mode); mutex_unlock(&dev->slow_path_mutex); if (IS_ERR(mr)) return mr; @@ -1175,13 +1156,71 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, return mr; } +static struct ib_mr * +reg_create_crossing_vhca_mr(struct ib_pd *pd, u64 iova, u64 length, int access_flags, + u32 crossed_lkey) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + int access_mode = MLX5_MKC_ACCESS_MODE_CROSSING; + struct mlx5_ib_mr *mr; + void *mkc; + int inlen; + u32 *in; + int err; + + if (!MLX5_CAP_GEN(dev->mdev, crossing_vhca_mkey)) + return ERR_PTR(-EOPNOTSUPP); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_1; + } + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, crossing_target_vhca_id, + MLX5_CAP_GEN(dev->mdev, vhca_id)); + MLX5_SET(mkc, mkc, translations_octword_size, crossed_lkey); + MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3); + MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7); + + /* for this crossing mkey IOVA should be 0 and len should be IOVA + len */ + set_mkc_access_pd_addr_fields(mkc, access_flags, 0, pd); + MLX5_SET64(mkc, mkc, len, iova + length); + + MLX5_SET(mkc, mkc, free, 0); + MLX5_SET(mkc, mkc, umr_en, 0); + err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); + if (err) + goto err_2; + + mr->mmkey.type = MLX5_MKEY_MR; + set_mr_fields(dev, mr, length, access_flags, iova); + mr->ibmr.pd = pd; + kvfree(in); + mlx5_ib_dbg(dev, "crossing mkey = 0x%x\n", mr->mmkey.key); + + return &mr->ibmr; +err_2: + kvfree(in); +err_1: + kfree(mr); + return ERR_PTR(err); +} + /* * If ibmr is NULL it will be allocated by reg_create. * Else, the given ibmr will be used. */ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, u64 iova, int access_flags, - unsigned int page_size, bool populate) + unsigned int page_size, bool populate, + int access_mode) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_mr *mr; @@ -1190,7 +1229,9 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, int inlen; u32 *in; int err; - bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)); + bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)) && + (access_mode == MLX5_MKC_ACCESS_MODE_MTT); + bool ksm_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM); if (!page_size) return ERR_PTR(-EINVAL); @@ -1213,7 +1254,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, } pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); if (populate) { - if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) { + if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND || ksm_mode)) { err = -EINVAL; goto err_2; } @@ -1229,14 +1270,22 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); set_mkc_access_pd_addr_fields(mkc, access_flags, iova, populate ? pd : dev->umrc.pd); + /* In case a data direct flow, overwrite the pdn field by its internal kernel PD */ + if (umem->is_dmabuf && ksm_mode) + MLX5_SET(mkc, mkc, pd, dev->ddr.pdn); + MLX5_SET(mkc, mkc, free, !populate); - MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); + MLX5_SET(mkc, mkc, access_mode_1_0, access_mode); MLX5_SET(mkc, mkc, umr_en, 1); MLX5_SET64(mkc, mkc, len, umem->length); MLX5_SET(mkc, mkc, bsf_octword_size, 0); - MLX5_SET(mkc, mkc, translations_octword_size, - get_octo_len(iova, umem->length, mr->page_shift)); + if (ksm_mode) + MLX5_SET(mkc, mkc, translations_octword_size, + get_octo_len(iova, umem->length, mr->page_shift) * 2); + else + MLX5_SET(mkc, mkc, translations_octword_size, + get_octo_len(iova, umem->length, mr->page_shift)); MLX5_SET(mkc, mkc, log_page_size, mr->page_shift); if (mlx5_umem_needs_ats(dev, umem, access_flags)) MLX5_SET(mkc, mkc, ma_translation_mode, 1); @@ -1373,13 +1422,15 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem, xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length); if (xlt_with_umr) { - mr = alloc_cacheable_mr(pd, umem, iova, access_flags); + mr = alloc_cacheable_mr(pd, umem, iova, access_flags, + MLX5_MKC_ACCESS_MODE_MTT); } else { - unsigned int page_size = mlx5_umem_find_best_pgsz( - umem, mkc, log_page_size, 0, iova); + unsigned int page_size = + mlx5_umem_mkc_find_best_pgsz(dev, umem, iova); mutex_lock(&dev->slow_path_mutex); - mr = reg_create(pd, umem, iova, access_flags, page_size, true); + mr = reg_create(pd, umem, iova, access_flags, page_size, + true, MLX5_MKC_ACCESS_MODE_MTT); mutex_unlock(&dev->slow_path_mutex); } if (IS_ERR(mr)) { @@ -1442,7 +1493,8 @@ static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length, if (IS_ERR(odp)) return ERR_CAST(odp); - mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags); + mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags, + MLX5_MKC_ACCESS_MODE_MTT); if (IS_ERR(mr)) { ib_umem_release(&odp->umem); return ERR_CAST(mr); @@ -1510,35 +1562,31 @@ static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = { .move_notify = mlx5_ib_dmabuf_invalidate_cb, }; -struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, - u64 length, u64 virt_addr, - int fd, int access_flags, - struct ib_udata *udata) +static struct ib_mr * +reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device, + u64 offset, u64 length, u64 virt_addr, + int fd, int access_flags, int access_mode) { + bool pinned_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM); struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_mr *mr = NULL; struct ib_umem_dmabuf *umem_dmabuf; int err; - if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || - !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) - return ERR_PTR(-EOPNOTSUPP); - - mlx5_ib_dbg(dev, - "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n", - offset, virt_addr, length, fd, access_flags); - err = mlx5r_umr_resource_init(dev); if (err) return ERR_PTR(err); - /* dmabuf requires xlt update via umr to work. */ - if (!mlx5r_umr_can_load_pas(dev, length)) - return ERR_PTR(-EINVAL); + if (!pinned_mode) + umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, + offset, length, fd, + access_flags, + &mlx5_ib_dmabuf_attach_ops); + else + umem_dmabuf = ib_umem_dmabuf_get_pinned_with_dma_device(&dev->ib_dev, + dma_device, offset, length, + fd, access_flags); - umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd, - access_flags, - &mlx5_ib_dmabuf_attach_ops); if (IS_ERR(umem_dmabuf)) { mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n", PTR_ERR(umem_dmabuf)); @@ -1546,7 +1594,7 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, } mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr, - access_flags); + access_flags, access_mode); if (IS_ERR(mr)) { ib_umem_release(&umem_dmabuf->umem); return ERR_CAST(mr); @@ -1556,9 +1604,13 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages); umem_dmabuf->private = mr; - err = mlx5r_store_odp_mkey(dev, &mr->mmkey); - if (err) - goto err_dereg_mr; + if (!pinned_mode) { + err = mlx5r_store_odp_mkey(dev, &mr->mmkey); + if (err) + goto err_dereg_mr; + } else { + mr->data_direct = true; + } err = mlx5_ib_init_dmabuf_mr(mr); if (err) @@ -1566,10 +1618,101 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, return &mr->ibmr; err_dereg_mr: - mlx5_ib_dereg_mr(&mr->ibmr, NULL); + __mlx5_ib_dereg_mr(&mr->ibmr); return ERR_PTR(err); } +static struct ib_mr * +reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset, + u64 length, u64 virt_addr, + int fd, int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_data_direct_dev *data_direct_dev; + struct ib_mr *crossing_mr; + struct ib_mr *crossed_mr; + int ret = 0; + + /* As of HW behaviour the IOVA must be page aligned in KSM mode */ + if (!PAGE_ALIGNED(virt_addr) || (access_flags & IB_ACCESS_ON_DEMAND)) + return ERR_PTR(-EOPNOTSUPP); + + mutex_lock(&dev->data_direct_lock); + data_direct_dev = dev->data_direct_dev; + if (!data_direct_dev) { + ret = -EINVAL; + goto end; + } + + /* The device's 'data direct mkey' was created without RO flags to + * simplify things and allow for a single mkey per device. + * Since RO is not a must, mask it out accordingly. + */ + access_flags &= ~IB_ACCESS_RELAXED_ORDERING; + crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev, + offset, length, virt_addr, fd, + access_flags, MLX5_MKC_ACCESS_MODE_KSM); + if (IS_ERR(crossed_mr)) { + ret = PTR_ERR(crossed_mr); + goto end; + } + + mutex_lock(&dev->slow_path_mutex); + crossing_mr = reg_create_crossing_vhca_mr(pd, virt_addr, length, access_flags, + crossed_mr->lkey); + mutex_unlock(&dev->slow_path_mutex); + if (IS_ERR(crossing_mr)) { + __mlx5_ib_dereg_mr(crossed_mr); + ret = PTR_ERR(crossing_mr); + goto end; + } + + list_add_tail(&to_mmr(crossed_mr)->dd_node, &dev->data_direct_mr_list); + to_mmr(crossing_mr)->dd_crossed_mr = to_mmr(crossed_mr); + to_mmr(crossing_mr)->data_direct = true; +end: + mutex_unlock(&dev->data_direct_lock); + return ret ? ERR_PTR(ret) : crossing_mr; +} + +struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, + u64 length, u64 virt_addr, + int fd, int access_flags, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + int mlx5_access_flags = 0; + int err; + + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || + !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) + return ERR_PTR(-EOPNOTSUPP); + + if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS)) { + err = uverbs_get_flags32(&mlx5_access_flags, attrs, + MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS, + MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT); + if (err) + return ERR_PTR(err); + } + + mlx5_ib_dbg(dev, + "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x, mlx5_access_flags 0x%x\n", + offset, virt_addr, length, fd, access_flags, mlx5_access_flags); + + /* dmabuf requires xlt update via umr to work. */ + if (!mlx5r_umr_can_load_pas(dev, length)) + return ERR_PTR(-EINVAL); + + if (mlx5_access_flags & MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT) + return reg_user_mr_dmabuf_by_data_direct(pd, offset, length, virt_addr, + fd, access_flags); + + return reg_user_mr_dmabuf(pd, pd->device->dma_device, + offset, length, virt_addr, + fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT); +} + /* * True if the change in access flags can be done via UMR, only some access * flags can be updated. @@ -1601,8 +1744,7 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr, if (!mlx5r_umr_can_load_pas(dev, new_umem->length)) return false; - *page_size = - mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova); + *page_size = mlx5_umem_mkc_find_best_pgsz(dev, new_umem, iova); if (WARN_ON(!*page_size)) return false; return (mr->mmkey.cache_ent->rb_key.ndescs) >= @@ -1665,7 +1807,7 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, struct mlx5_ib_mr *mr = to_mmr(ib_mr); int err; - if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct) return ERR_PTR(-EOPNOTSUPP); mlx5_ib_dbg( @@ -1793,7 +1935,7 @@ err: static void mlx5_free_priv_descs(struct mlx5_ib_mr *mr) { - if (!mr->umem && mr->descs) { + if (!mr->umem && !mr->data_direct && mr->descs) { struct ib_device *device = mr->ibmr.device; int size = mr->max_descs * mr->desc_size; struct mlx5_ib_dev *dev = to_mdev(device); @@ -1847,13 +1989,51 @@ end: return ret; } +static int mlx5_ib_revoke_data_direct_mr(struct mlx5_ib_mr *mr) +{ + struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); + struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem); + int err; + + lockdep_assert_held(&dev->data_direct_lock); + mr->revoked = true; + err = mlx5r_umr_revoke_mr(mr); + if (WARN_ON(err)) + return err; + + ib_umem_dmabuf_revoke(umem_dmabuf); + return 0; +} + +void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev) +{ + struct mlx5_ib_mr *mr, *next; + + lockdep_assert_held(&dev->data_direct_lock); + + list_for_each_entry_safe(mr, next, &dev->data_direct_mr_list, dd_node) { + list_del(&mr->dd_node); + mlx5_ib_revoke_data_direct_mr(mr); + } +} + static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) { struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); struct mlx5_cache_ent *ent = mr->mmkey.cache_ent; - if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) + if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) { + ent = mr->mmkey.cache_ent; + /* upon storing to a clean temp entry - schedule its cleanup */ + spin_lock_irq(&ent->mkeys_queue.lock); + if (ent->is_tmp && !ent->tmp_cleanup_scheduled) { + mod_delayed_work(ent->dev->cache.wq, &ent->dwork, + msecs_to_jiffies(30 * 1000)); + ent->tmp_cleanup_scheduled = true; + } + spin_unlock_irq(&ent->mkeys_queue.lock); return 0; + } if (ent) { spin_lock_irq(&ent->mkeys_queue.lock); @@ -1864,7 +2044,7 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) return destroy_mkey(dev, mr); } -int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr) { struct mlx5_ib_mr *mr = to_mmr(ibmr); struct mlx5_ib_dev *dev = to_mdev(ibmr->device); @@ -1931,9 +2111,40 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) return 0; } +static int dereg_crossing_data_direct_mr(struct mlx5_ib_dev *dev, + struct mlx5_ib_mr *mr) +{ + struct mlx5_ib_mr *dd_crossed_mr = mr->dd_crossed_mr; + int ret; + + ret = __mlx5_ib_dereg_mr(&mr->ibmr); + if (ret) + return ret; + + mutex_lock(&dev->data_direct_lock); + if (!dd_crossed_mr->revoked) + list_del(&dd_crossed_mr->dd_node); + + ret = __mlx5_ib_dereg_mr(&dd_crossed_mr->ibmr); + mutex_unlock(&dev->data_direct_lock); + return ret; +} + +int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); + + if (mr->data_direct) + return dereg_crossing_data_direct_mr(dev, mr); + + return __mlx5_ib_dereg_mr(ibmr); +} + static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs, int access_mode, int page_shift) { + struct mlx5_ib_dev *dev = to_mdev(pd->device); void *mkc; mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); @@ -1946,6 +2157,9 @@ static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs, MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7); MLX5_SET(mkc, mkc, umr_en, 1); MLX5_SET(mkc, mkc, log_page_size, page_shift); + if (access_mode == MLX5_MKC_ACCESS_MODE_PA || + access_mode == MLX5_MKC_ACCESS_MODE_MTT) + MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats)); } static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index a524181f34df..4b37446758fd 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -45,7 +45,7 @@ /* Contains the details of a pagefault. */ struct mlx5_pagefault { u32 bytes_committed; - u32 token; + u64 token; u8 event_subtype; u8 type; union { @@ -74,6 +74,14 @@ struct mlx5_pagefault { u32 rdma_op_len; u64 rdma_va; } rdma; + struct { + u64 va; + u32 mkey; + u32 fault_byte_count; + u32 prefetch_before_byte_count; + u32 prefetch_after_byte_count; + u8 flags; + } memory; }; struct mlx5_ib_pf_eq *eq; @@ -99,13 +107,20 @@ static u64 mlx5_imr_ksm_entries; static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, struct mlx5_ib_mr *imr, int flags) { + struct mlx5_core_dev *dev = mr_to_mdev(imr)->mdev; struct mlx5_klm *end = pklm + nentries; + int step = MLX5_CAP_ODP(dev, mem_page_fault) ? MLX5_IMR_MTT_SIZE : 0; + __be32 key = MLX5_CAP_ODP(dev, mem_page_fault) ? + cpu_to_be32(imr->null_mmkey.key) : + mr_to_mdev(imr)->mkeys.null_mkey; + u64 va = + MLX5_CAP_ODP(dev, mem_page_fault) ? idx * MLX5_IMR_MTT_SIZE : 0; if (flags & MLX5_IB_UPD_XLT_ZAP) { - for (; pklm != end; pklm++, idx++) { + for (; pklm != end; pklm++, idx++, va += step) { pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); - pklm->key = mr_to_mdev(imr)->mkeys.null_mkey; - pklm->va = 0; + pklm->key = key; + pklm->va = cpu_to_be64(va); } return; } @@ -129,7 +144,7 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, */ lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex); - for (; pklm != end; pklm++, idx++) { + for (; pklm != end; pklm++, idx++, va += step) { struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx); pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); @@ -137,8 +152,8 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, pklm->key = cpu_to_be32(mtt->ibmr.lkey); pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE); } else { - pklm->key = mr_to_mdev(imr)->mkeys.null_mkey; - pklm->va = 0; + pklm->key = key; + pklm->va = cpu_to_be64(va); } } } @@ -217,6 +232,9 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr) return; xa_erase(&imr->implicit_children, idx); + if (MLX5_CAP_ODP(mr_to_mdev(mr)->mdev, mem_page_fault)) + xa_erase(&mr_to_mdev(mr)->odp_mkeys, + mlx5_base_mkey(mr->mmkey.key)); /* Freeing a MR is a sleeping operation, so bounce to a work queue */ INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work); @@ -332,46 +350,46 @@ static void internal_fill_odp_caps(struct mlx5_ib_dev *dev) else dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT); - if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, ud_odp_caps.send)) caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND; - if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.srq_receive)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, ud_odp_caps.srq_receive)) caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.send)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.receive)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.write)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.read)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.atomic)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.srq_receive)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.srq_receive)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.send)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.send)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SEND; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.receive)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.receive)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_RECV; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.write)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.write)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_WRITE; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.read)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.read)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_READ; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.atomic)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.atomic)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.srq_receive)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.srq_receive)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) && @@ -388,13 +406,29 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ? pfault->wqe.wq_num : pfault->token; u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = {}; + void *info; int err; MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME); - MLX5_SET(page_fault_resume_in, in, page_fault_type, pfault->type); - MLX5_SET(page_fault_resume_in, in, token, pfault->token); - MLX5_SET(page_fault_resume_in, in, wq_number, wq_num); - MLX5_SET(page_fault_resume_in, in, error, !!error); + + if (pfault->event_subtype == MLX5_PFAULT_SUBTYPE_MEMORY) { + info = MLX5_ADDR_OF(page_fault_resume_in, in, + page_fault_info.mem_page_fault_info); + MLX5_SET(mem_page_fault_info, info, fault_token_31_0, + pfault->token & 0xffffffff); + MLX5_SET(mem_page_fault_info, info, fault_token_47_32, + (pfault->token >> 32) & 0xffff); + MLX5_SET(mem_page_fault_info, info, error, !!error); + } else { + info = MLX5_ADDR_OF(page_fault_resume_in, in, + page_fault_info.trans_page_fault_info); + MLX5_SET(trans_page_fault_info, info, page_fault_type, + pfault->type); + MLX5_SET(trans_page_fault_info, info, fault_token, + pfault->token); + MLX5_SET(trans_page_fault_info, info, wq_number, wq_num); + MLX5_SET(trans_page_fault_info, info, error, !!error); + } err = mlx5_cmd_exec_in(dev->mdev, page_fault_resume, in); if (err) @@ -468,6 +502,16 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, } xa_unlock(&imr->implicit_children); + if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) { + ret = xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key), + &mr->mmkey, GFP_KERNEL); + if (xa_is_err(ret)) { + ret = ERR_PTR(xa_err(ret)); + xa_erase(&imr->implicit_children, idx); + goto out_mr; + } + mr->mmkey.type = MLX5_MKEY_IMPLICIT_CHILD; + } mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr); return mr; @@ -478,6 +522,57 @@ out_mr: return ret; } +/* + * When using memory scheme ODP, implicit MRs can't use the reserved null mkey + * and each implicit MR needs to assign a private null mkey to get the page + * faults on. + * The null mkey is created with the properties to enable getting the page + * fault for every time it is accessed and having all relevant access flags. + */ +static int alloc_implicit_mr_null_mkey(struct mlx5_ib_dev *dev, + struct mlx5_ib_mr *imr, + struct mlx5_ib_pd *pd) +{ + size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + 64; + void *mkc; + u32 *in; + int err; + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(create_mkey_in, in, translations_octword_actual_size, 4); + MLX5_SET(create_mkey_in, in, pg_access, 1); + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, a, 1); + MLX5_SET(mkc, mkc, rw, 1); + MLX5_SET(mkc, mkc, rr, 1); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, free, 0); + MLX5_SET(mkc, mkc, umr_en, 0); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); + + MLX5_SET(mkc, mkc, translations_octword_size, 4); + MLX5_SET(mkc, mkc, log_page_size, 61); + MLX5_SET(mkc, mkc, length64, 1); + MLX5_SET(mkc, mkc, pd, pd->pdn); + MLX5_SET64(mkc, mkc, start_addr, 0); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + + err = mlx5_core_create_mkey(dev->mdev, &imr->null_mmkey.key, in, inlen); + if (err) + goto free_in; + + imr->null_mmkey.type = MLX5_MKEY_NULL; + +free_in: + kfree(in); + return err; +} + struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, int access_flags) { @@ -510,6 +605,16 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, imr->is_odp_implicit = true; xa_init(&imr->implicit_children); + if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) { + err = alloc_implicit_mr_null_mkey(dev, imr, pd); + if (err) + goto out_mr; + + err = mlx5r_store_odp_mkey(dev, &imr->null_mmkey); + if (err) + goto out_mr; + } + err = mlx5r_umr_update_xlt(imr, 0, mlx5_imr_ksm_entries, MLX5_KSM_PAGE_SHIFT, @@ -544,6 +649,14 @@ void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr) xa_erase(&mr->implicit_children, idx); mlx5_ib_dereg_mr(&mtt->ibmr, NULL); } + + if (mr->null_mmkey.key) { + xa_erase(&mr_to_mdev(mr)->odp_mkeys, + mlx5_base_mkey(mr->null_mmkey.key)); + + mlx5_core_destroy_mkey(mr_to_mdev(mr)->mdev, + mr->null_mmkey.key); + } } #define MLX5_PF_FLAGS_DOWNGRADE BIT(1) @@ -693,7 +806,7 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt, struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem); u32 xlt_flags = 0; int err; - unsigned int page_size; + unsigned long page_size; if (flags & MLX5_PF_FLAGS_ENABLE) xlt_flags |= MLX5_IB_UPD_XLT_ENABLE; @@ -710,7 +823,10 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt, ib_umem_dmabuf_unmap_pages(umem_dmabuf); err = -EINVAL; } else { - err = mlx5r_umr_update_mr_pas(mr, xlt_flags); + if (mr->data_direct) + err = mlx5r_umr_update_data_direct_ksm_pas(mr, xlt_flags); + else + err = mlx5r_umr_update_mr_pas(mr, xlt_flags); } dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); @@ -733,24 +849,31 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt, * >0: Number of pages mapped */ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, - u32 *bytes_mapped, u32 flags) + u32 *bytes_mapped, u32 flags, bool permissive_fault) { struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); - if (unlikely(io_virt < mr->ibmr.iova)) + if (unlikely(io_virt < mr->ibmr.iova) && !permissive_fault) return -EFAULT; if (mr->umem->is_dmabuf) return pagefault_dmabuf_mr(mr, bcnt, bytes_mapped, flags); if (!odp->is_implicit_odp) { + u64 offset = io_virt < mr->ibmr.iova ? 0 : io_virt - mr->ibmr.iova; u64 user_va; - if (check_add_overflow(io_virt - mr->ibmr.iova, - (u64)odp->umem.address, &user_va)) + if (check_add_overflow(offset, (u64)odp->umem.address, + &user_va)) return -EFAULT; - if (unlikely(user_va >= ib_umem_end(odp) || - ib_umem_end(odp) - user_va < bcnt)) + + if (permissive_fault) { + if (user_va < ib_umem_start(odp)) + user_va = ib_umem_start(odp); + if ((user_va + bcnt) > ib_umem_end(odp)) + bcnt = ib_umem_end(odp) - user_va; + } else if (unlikely(user_va >= ib_umem_end(odp) || + ib_umem_end(odp) - user_va < bcnt)) return -EFAULT; return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped, flags); @@ -797,6 +920,27 @@ static bool mkey_is_eq(struct mlx5_ib_mkey *mmkey, u32 key) return mmkey->key == key; } +static struct mlx5_ib_mkey *find_odp_mkey(struct mlx5_ib_dev *dev, u32 key) +{ + struct mlx5_ib_mkey *mmkey; + + xa_lock(&dev->odp_mkeys); + mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key)); + if (!mmkey) { + mmkey = ERR_PTR(-ENOENT); + goto out; + } + if (!mkey_is_eq(mmkey, key)) { + mmkey = ERR_PTR(-EFAULT); + goto out; + } + refcount_inc(&mmkey->usecount); +out: + xa_unlock(&dev->odp_mkeys); + + return mmkey; +} + /* * Handle a single data segment in a page-fault WQE or RDMA region. * @@ -824,32 +968,24 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, io_virt += *bytes_committed; bcnt -= *bytes_committed; - next_mr: - xa_lock(&dev->odp_mkeys); - mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key)); - if (!mmkey) { - xa_unlock(&dev->odp_mkeys); - mlx5_ib_dbg( - dev, - "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", - key); - if (bytes_mapped) - *bytes_mapped += bcnt; - /* - * The user could specify a SGL with multiple lkeys and only - * some of them are ODP. Treat the non-ODP ones as fully - * faulted. - */ - ret = 0; - goto end; - } - refcount_inc(&mmkey->usecount); - xa_unlock(&dev->odp_mkeys); - - if (!mkey_is_eq(mmkey, key)) { - mlx5_ib_dbg(dev, "failed to find mkey %x\n", key); - ret = -EFAULT; + mmkey = find_odp_mkey(dev, key); + if (IS_ERR(mmkey)) { + ret = PTR_ERR(mmkey); + if (ret == -ENOENT) { + mlx5_ib_dbg( + dev, + "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", + key); + if (bytes_mapped) + *bytes_mapped += bcnt; + /* + * The user could specify a SGL with multiple lkeys and + * only some of them are ODP. Treat the non-ODP ones as + * fully faulted. + */ + ret = 0; + } goto end; } @@ -857,7 +993,7 @@ next_mr: case MLX5_MKEY_MR: mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); - ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0); + ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0, false); if (ret < 0) goto end; @@ -944,7 +1080,7 @@ next_mr: } end: - if (mmkey) + if (!IS_ERR(mmkey)) mlx5r_deref_odp_mkey(mmkey); while (head) { frame = head; @@ -1266,7 +1402,7 @@ read_user: if (ret) mlx5_ib_err( dev, - "Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %x\n", + "Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %llx\n", ret, wqe_index, pfault->token); resolve_page_fault: @@ -1325,13 +1461,13 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, } else if (ret < 0 || pages_in_range(address, length) > ret) { mlx5_ib_page_fault_resume(dev, pfault, 1); if (ret != -ENOENT) - mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n", + mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%llx, type: 0x%x\n", ret, pfault->token, pfault->type); return; } mlx5_ib_page_fault_resume(dev, pfault, 0); - mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n", + mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%llx, type: 0x%x, prefetch_activated: %d\n", pfault->token, pfault->type, prefetch_activated); @@ -1347,12 +1483,80 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, prefetch_len, &bytes_committed, NULL); if (ret < 0 && ret != -EAGAIN) { - mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n", + mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%llx, address: 0x%.16llx, length = 0x%.16x\n", ret, pfault->token, address, prefetch_len); } } } +#define MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST BIT(7) +static void mlx5_ib_mr_memory_pfault_handler(struct mlx5_ib_dev *dev, + struct mlx5_pagefault *pfault) +{ + u64 prefetch_va = + pfault->memory.va - pfault->memory.prefetch_before_byte_count; + size_t prefetch_size = pfault->memory.prefetch_before_byte_count + + pfault->memory.fault_byte_count + + pfault->memory.prefetch_after_byte_count; + struct mlx5_ib_mkey *mmkey; + struct mlx5_ib_mr *mr, *child_mr; + int ret = 0; + + mmkey = find_odp_mkey(dev, pfault->memory.mkey); + if (IS_ERR(mmkey)) + goto err; + + switch (mmkey->type) { + case MLX5_MKEY_IMPLICIT_CHILD: + child_mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); + mr = child_mr->parent; + break; + case MLX5_MKEY_NULL: + mr = container_of(mmkey, struct mlx5_ib_mr, null_mmkey); + break; + default: + mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); + break; + } + + /* If prefetch fails, handle only demanded page fault */ + ret = pagefault_mr(mr, prefetch_va, prefetch_size, NULL, 0, true); + if (ret < 0) { + ret = pagefault_mr(mr, pfault->memory.va, + pfault->memory.fault_byte_count, NULL, 0, + true); + if (ret < 0) + goto err; + } + + mlx5_update_odp_stats(mr, faults, ret); + mlx5r_deref_odp_mkey(mmkey); + + if (pfault->memory.flags & MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST) + mlx5_ib_page_fault_resume(dev, pfault, 0); + + mlx5_ib_dbg( + dev, + "PAGE FAULT completed %s. token 0x%llx, mkey: 0x%x, va: 0x%llx, byte_count: 0x%x\n", + pfault->memory.flags & MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST ? + "" : + "without resume cmd", + pfault->token, pfault->memory.mkey, pfault->memory.va, + pfault->memory.fault_byte_count); + + return; + +err: + if (!IS_ERR(mmkey)) + mlx5r_deref_odp_mkey(mmkey); + mlx5_ib_page_fault_resume(dev, pfault, 1); + mlx5_ib_dbg( + dev, + "PAGE FAULT error. token 0x%llx, mkey: 0x%x, va: 0x%llx, byte_count: 0x%x, err: %d\n", + pfault->token, pfault->memory.mkey, pfault->memory.va, + pfault->memory.fault_byte_count, ret); +} + static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault) { u8 event_subtype = pfault->event_subtype; @@ -1364,6 +1568,9 @@ static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfaul case MLX5_PFAULT_SUBTYPE_RDMA: mlx5_ib_mr_rdma_pfault_handler(dev, pfault); break; + case MLX5_PFAULT_SUBTYPE_MEMORY: + mlx5_ib_mr_memory_pfault_handler(dev, pfault); + break; default: mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n", event_subtype); @@ -1382,6 +1589,7 @@ static void mlx5_ib_eqe_pf_action(struct work_struct *work) mempool_free(pfault, eq->pool); } +#define MEMORY_SCHEME_PAGE_FAULT_GRANULARITY 4096 static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) { struct mlx5_eqe_page_fault *pf_eqe; @@ -1398,15 +1606,12 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) pf_eqe = &eqe->data.page_fault; pfault->event_subtype = eqe->sub_type; - pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed); - - mlx5_ib_dbg(eq->dev, - "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n", - eqe->sub_type, pfault->bytes_committed); switch (eqe->sub_type) { case MLX5_PFAULT_SUBTYPE_RDMA: /* RDMA based event */ + pfault->bytes_committed = + be32_to_cpu(pf_eqe->rdma.bytes_committed); pfault->type = be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24; pfault->token = @@ -1420,10 +1625,12 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) be32_to_cpu(pf_eqe->rdma.rdma_op_len); pfault->rdma.rdma_va = be64_to_cpu(pf_eqe->rdma.rdma_va); - mlx5_ib_dbg(eq->dev, - "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n", - pfault->type, pfault->token, - pfault->rdma.r_key); + mlx5_ib_dbg( + eq->dev, + "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x, type:0x%x, token: 0x%06llx, r_key: 0x%08x\n", + eqe->sub_type, pfault->bytes_committed, + pfault->type, pfault->token, + pfault->rdma.r_key); mlx5_ib_dbg(eq->dev, "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n", pfault->rdma.rdma_op_len, @@ -1432,6 +1639,8 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) case MLX5_PFAULT_SUBTYPE_WQE: /* WQE based event */ + pfault->bytes_committed = + be32_to_cpu(pf_eqe->wqe.bytes_committed); pfault->type = (be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7; pfault->token = @@ -1443,11 +1652,47 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) be16_to_cpu(pf_eqe->wqe.wqe_index); pfault->wqe.packet_size = be16_to_cpu(pf_eqe->wqe.packet_length); - mlx5_ib_dbg(eq->dev, - "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n", - pfault->type, pfault->token, - pfault->wqe.wq_num, - pfault->wqe.wqe_index); + mlx5_ib_dbg( + eq->dev, + "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x, type:0x%x, token: 0x%06llx, wq_num: 0x%06x, wqe_index: 0x%04x\n", + eqe->sub_type, pfault->bytes_committed, + pfault->type, pfault->token, pfault->wqe.wq_num, + pfault->wqe.wqe_index); + break; + + case MLX5_PFAULT_SUBTYPE_MEMORY: + /* Memory based event */ + pfault->bytes_committed = 0; + pfault->token = + be32_to_cpu(pf_eqe->memory.token31_0) | + ((u64)be16_to_cpu(pf_eqe->memory.token47_32) + << 32); + pfault->memory.va = be64_to_cpu(pf_eqe->memory.va); + pfault->memory.mkey = be32_to_cpu(pf_eqe->memory.mkey); + pfault->memory.fault_byte_count = (be32_to_cpu( + pf_eqe->memory.demand_fault_pages) >> 12) * + MEMORY_SCHEME_PAGE_FAULT_GRANULARITY; + pfault->memory.prefetch_before_byte_count = + be16_to_cpu( + pf_eqe->memory.pre_demand_fault_pages) * + MEMORY_SCHEME_PAGE_FAULT_GRANULARITY; + pfault->memory.prefetch_after_byte_count = + be16_to_cpu( + pf_eqe->memory.post_demand_fault_pages) * + MEMORY_SCHEME_PAGE_FAULT_GRANULARITY; + pfault->memory.flags = pf_eqe->memory.flags; + mlx5_ib_dbg( + eq->dev, + "PAGE_FAULT: subtype: 0x%02x, token: 0x%06llx, mkey: 0x%06x, fault_byte_count: 0x%06x, va: 0x%016llx, flags: 0x%02x\n", + eqe->sub_type, pfault->token, + pfault->memory.mkey, + pfault->memory.fault_byte_count, + pfault->memory.va, pfault->memory.flags); + mlx5_ib_dbg( + eq->dev, + "PAGE_FAULT: prefetch size: before: 0x%06x, after 0x%06x\n", + pfault->memory.prefetch_before_byte_count, + pfault->memory.prefetch_after_byte_count); break; default: @@ -1710,7 +1955,7 @@ static void mlx5_ib_prefetch_mr_work(struct work_struct *w) for (i = 0; i < work->num_sge; ++i) { ret = pagefault_mr(work->frags[i].mr, work->frags[i].io_virt, work->frags[i].length, &bytes_mapped, - work->pf_flags); + work->pf_flags, false); if (ret <= 0) continue; mlx5_update_odp_stats(work->frags[i].mr, prefetch, ret); @@ -1761,7 +2006,7 @@ static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd, if (IS_ERR(mr)) return PTR_ERR(mr); ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length, - &bytes_mapped, pf_flags); + &bytes_mapped, pf_flags, false); if (ret < 0) { mlx5r_deref_odp_mkey(&mr->mmkey); return ret; diff --git a/drivers/infiniband/hw/mlx5/std_types.c b/drivers/infiniband/hw/mlx5/std_types.c index bbfcce3bdc84..bdb568411091 100644 --- a/drivers/infiniband/hw/mlx5/std_types.c +++ b/drivers/infiniband/hw/mlx5/std_types.c @@ -10,6 +10,7 @@ #include <linux/mlx5/eswitch.h> #include <linux/mlx5/vport.h> #include "mlx5_ib.h" +#include "data_direct.h" #define UVERBS_MODULE_NAME mlx5_ib #include <rdma/uverbs_named_ioctl.h> @@ -111,6 +112,23 @@ out: return err; } +static int fill_multiport_info(struct mlx5_ib_dev *dev, u32 port_num, + struct mlx5_ib_uapi_query_port *info) +{ + struct mlx5_core_dev *mdev; + + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, NULL); + if (!mdev) + return -EINVAL; + + info->vport_vhca_id = MLX5_CAP_GEN(mdev, vhca_id); + info->flags |= MLX5_IB_UAPI_QUERY_PORT_VPORT_VHCA_ID; + + mlx5_ib_put_native_port_mdev(dev, port_num); + + return 0; +} + static int fill_switchdev_info(struct mlx5_ib_dev *dev, u32 port_num, struct mlx5_ib_uapi_query_port *info) { @@ -177,12 +195,60 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_QUERY_PORT)( ret = fill_switchdev_info(dev, port_num, &info); if (ret) return ret; + } else if (mlx5_core_mp_enabled(dev->mdev)) { + ret = fill_multiport_info(dev, port_num, &info); + if (ret) + return ret; } return uverbs_copy_to_struct_or_zero(attrs, MLX5_IB_ATTR_QUERY_PORT, &info, sizeof(info)); } +static int UVERBS_HANDLER(MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH)( + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_data_direct_dev *data_direct_dev; + struct mlx5_ib_ucontext *c; + struct mlx5_ib_dev *dev; + int out_len = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH); + u32 dev_path_len; + char *dev_path; + int ret; + + c = to_mucontext(ib_uverbs_get_ucontext(attrs)); + if (IS_ERR(c)) + return PTR_ERR(c); + dev = to_mdev(c->ibucontext.device); + mutex_lock(&dev->data_direct_lock); + data_direct_dev = dev->data_direct_dev; + if (!data_direct_dev) { + ret = -ENODEV; + goto end; + } + + dev_path = kobject_get_path(&data_direct_dev->device->kobj, GFP_KERNEL); + if (!dev_path) { + ret = -ENOMEM; + goto end; + } + + dev_path_len = strlen(dev_path) + 1; + if (dev_path_len > out_len) { + ret = -ENOSPC; + goto end; + } + + ret = uverbs_copy_to(attrs, MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH, dev_path, + dev_path_len); + kfree(dev_path); + +end: + mutex_unlock(&dev->data_direct_lock); + return ret; +} + DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_QUERY_PORT, UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_QUERY_PORT_PORT_NUM, @@ -193,9 +259,17 @@ DECLARE_UVERBS_NAMED_METHOD( reg_c0), UA_MANDATORY)); +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH, + UVERBS_ATTR_PTR_OUT( + MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH, + UVERBS_ATTR_MIN_SIZE(0), + UA_MANDATORY)); + ADD_UVERBS_METHODS(mlx5_ib_device, UVERBS_OBJECT_DEVICE, - &UVERBS_METHOD(MLX5_IB_METHOD_QUERY_PORT)); + &UVERBS_METHOD(MLX5_IB_METHOD_QUERY_PORT), + &UVERBS_METHOD(MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH)); DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_PD_QUERY, diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c index ffc31b01f690..887fd6fa3ba9 100644 --- a/drivers/infiniband/hw/mlx5/umr.c +++ b/drivers/infiniband/hw/mlx5/umr.c @@ -224,6 +224,9 @@ int mlx5r_umr_init(struct mlx5_ib_dev *dev) void mlx5r_umr_cleanup(struct mlx5_ib_dev *dev) { + if (!dev->umrc.pd) + return; + mutex_destroy(&dev->umrc.init_lock); ib_dealloc_pd(dev->umrc.pd); } @@ -632,44 +635,47 @@ static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev, wqe->data_seg.byte_count = cpu_to_be32(sg->length); } -/* - * Send the DMA list to the HW for a normal MR using UMR. - * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP - * flag may be used. - */ -int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) +static int +_mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd) { + size_t ent_size = dd ? sizeof(struct mlx5_ksm) : sizeof(struct mlx5_mtt); struct mlx5_ib_dev *dev = mr_to_mdev(mr); struct device *ddev = &dev->mdev->pdev->dev; struct mlx5r_umr_wqe wqe = {}; struct ib_block_iter biter; + struct mlx5_ksm *cur_ksm; struct mlx5_mtt *cur_mtt; size_t orig_sg_length; - struct mlx5_mtt *mtt; size_t final_size; + void *curr_entry; struct ib_sge sg; + void *entry; u64 offset = 0; int err = 0; - if (WARN_ON(mr->umem->is_odp)) - return -EINVAL; - - mtt = mlx5r_umr_create_xlt( - dev, &sg, ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift), - sizeof(*mtt), flags); - if (!mtt) + entry = mlx5r_umr_create_xlt(dev, &sg, + ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift), + ent_size, flags); + if (!entry) return -ENOMEM; orig_sg_length = sg.length; - mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg); mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr, mr->page_shift); + if (dd) { + /* Use the data direct internal kernel PD */ + MLX5_SET(mkc, &wqe.mkey_seg, pd, dev->ddr.pdn); + cur_ksm = entry; + } else { + cur_mtt = entry; + } + mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg); - cur_mtt = mtt; + curr_entry = entry; rdma_umem_for_each_dma_block(mr->umem, &biter, BIT(mr->page_shift)) { - if (cur_mtt == (void *)mtt + sg.length) { + if (curr_entry == entry + sg.length) { dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE); @@ -681,23 +687,31 @@ int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) DMA_TO_DEVICE); offset += sg.length; mlx5r_umr_update_offset(&wqe.ctrl_seg, offset); - - cur_mtt = mtt; + if (dd) + cur_ksm = entry; + else + cur_mtt = entry; } - cur_mtt->ptag = - cpu_to_be64(rdma_block_iter_dma_address(&biter) | - MLX5_IB_MTT_PRESENT); - - if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP)) - cur_mtt->ptag = 0; - - cur_mtt++; + if (dd) { + cur_ksm->va = cpu_to_be64(rdma_block_iter_dma_address(&biter)); + cur_ksm->key = cpu_to_be32(dev->ddr.mkey); + cur_ksm++; + curr_entry = cur_ksm; + } else { + cur_mtt->ptag = + cpu_to_be64(rdma_block_iter_dma_address(&biter) | + MLX5_IB_MTT_PRESENT); + if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP)) + cur_mtt->ptag = 0; + cur_mtt++; + curr_entry = cur_mtt; + } } - final_size = (void *)cur_mtt - (void *)mtt; + final_size = curr_entry - entry; sg.length = ALIGN(final_size, MLX5_UMR_FLEX_ALIGNMENT); - memset(cur_mtt, 0, sg.length - final_size); + memset(curr_entry, 0, sg.length - final_size); mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags); dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE); @@ -705,10 +719,32 @@ int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) err: sg.length = orig_sg_length; - mlx5r_umr_unmap_free_xlt(dev, mtt, &sg); + mlx5r_umr_unmap_free_xlt(dev, entry, &sg); return err; } +int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags) +{ + /* No invalidation flow is expected */ + if (WARN_ON(!mr->umem->is_dmabuf) || (flags & MLX5_IB_UPD_XLT_ZAP)) + return -EINVAL; + + return _mlx5r_umr_update_mr_pas(mr, flags, true); +} + +/* + * Send the DMA list to the HW for a normal MR using UMR. + * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP + * flag may be used. + */ +int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) +{ + if (WARN_ON(mr->umem->is_odp)) + return -EINVAL; + + return _mlx5r_umr_update_mr_pas(mr, flags, false); +} + static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev) { return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled); diff --git a/drivers/infiniband/hw/mlx5/umr.h b/drivers/infiniband/hw/mlx5/umr.h index 5f734dc72bef..4a02c9b5aad8 100644 --- a/drivers/infiniband/hw/mlx5/umr.h +++ b/drivers/infiniband/hw/mlx5/umr.h @@ -95,6 +95,7 @@ int mlx5r_umr_revoke_mr(struct mlx5_ib_mr *mr); int mlx5r_umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd, int access_flags); int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags); +int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags); int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, int page_shift, int flags); |
