aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/infiniband/hw/mlx5
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/infiniband/hw/mlx5')
-rw-r--r--drivers/infiniband/hw/mlx5/Makefile1
-rw-r--r--drivers/infiniband/hw/mlx5/cmd.c21
-rw-r--r--drivers/infiniband/hw/mlx5/cmd.h2
-rw-r--r--drivers/infiniband/hw/mlx5/data_direct.c227
-rw-r--r--drivers/infiniband/hw/mlx5/data_direct.h23
-rw-r--r--drivers/infiniband/hw/mlx5/devx.c2
-rw-r--r--drivers/infiniband/hw/mlx5/ib_rep.c22
-rw-r--r--drivers/infiniband/hw/mlx5/main.c324
-rw-r--r--drivers/infiniband/hw/mlx5/mlx5_ib.h60
-rw-r--r--drivers/infiniband/hw/mlx5/mr.c418
-rw-r--r--drivers/infiniband/hw/mlx5/odp.c405
-rw-r--r--drivers/infiniband/hw/mlx5/std_types.c76
-rw-r--r--drivers/infiniband/hw/mlx5/umr.c96
-rw-r--r--drivers/infiniband/hw/mlx5/umr.h1
14 files changed, 1378 insertions, 300 deletions
diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile
index 72a526236c2e..b38961f5058e 100644
--- a/drivers/infiniband/hw/mlx5/Makefile
+++ b/drivers/infiniband/hw/mlx5/Makefile
@@ -6,6 +6,7 @@ mlx5_ib-y := ah.o \
cong.o \
counters.o \
cq.o \
+ data_direct.o \
dm.o \
doorbell.o \
gsi.o \
diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c
index 895b62cc528d..7c08e3008927 100644
--- a/drivers/infiniband/hw/mlx5/cmd.c
+++ b/drivers/infiniband/hw/mlx5/cmd.c
@@ -245,3 +245,24 @@ int mlx5_cmd_uar_dealloc(struct mlx5_core_dev *dev, u32 uarn, u16 uid)
MLX5_SET(dealloc_uar_in, in, uid, uid);
return mlx5_cmd_exec_in(dev, dealloc_uar, in);
}
+
+int mlx5_cmd_query_vuid(struct mlx5_core_dev *dev, bool data_direct,
+ char *out_vuid)
+{
+ u8 out[MLX5_ST_SZ_BYTES(query_vuid_out) +
+ MLX5_ST_SZ_BYTES(array1024_auto)] = {};
+ u8 in[MLX5_ST_SZ_BYTES(query_vuid_in)] = {};
+ char *vuid;
+ int err;
+
+ MLX5_SET(query_vuid_in, in, opcode, MLX5_CMD_OPCODE_QUERY_VUID);
+ MLX5_SET(query_vuid_in, in, vhca_id, MLX5_CAP_GEN(dev, vhca_id));
+ MLX5_SET(query_vuid_in, in, data_direct, data_direct);
+ err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+ if (err)
+ return err;
+
+ vuid = MLX5_ADDR_OF(query_vuid_out, out, vuid);
+ memcpy(out_vuid, vuid, MLX5_ST_SZ_BYTES(array1024_auto));
+ return 0;
+}
diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h
index e5cd31270443..e6c88b6ebd0d 100644
--- a/drivers/infiniband/hw/mlx5/cmd.h
+++ b/drivers/infiniband/hw/mlx5/cmd.h
@@ -58,4 +58,6 @@ int mlx5_cmd_mad_ifc(struct mlx5_ib_dev *dev, const void *inb, void *outb,
u16 opmod, u8 port);
int mlx5_cmd_uar_alloc(struct mlx5_core_dev *dev, u32 *uarn, u16 uid);
int mlx5_cmd_uar_dealloc(struct mlx5_core_dev *dev, u32 uarn, u16 uid);
+int mlx5_cmd_query_vuid(struct mlx5_core_dev *dev, bool data_direct,
+ char *out_vuid);
#endif /* MLX5_IB_CMD_H */
diff --git a/drivers/infiniband/hw/mlx5/data_direct.c b/drivers/infiniband/hw/mlx5/data_direct.c
new file mode 100644
index 000000000000..b9ba84afaae2
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/data_direct.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include "mlx5_ib.h"
+#include "data_direct.h"
+
+static LIST_HEAD(mlx5_data_direct_dev_list);
+static LIST_HEAD(mlx5_data_direct_reg_list);
+
+/*
+ * This mutex should be held when accessing either of the above lists
+ */
+static DEFINE_MUTEX(mlx5_data_direct_mutex);
+
+struct mlx5_data_direct_registration {
+ struct mlx5_ib_dev *ibdev;
+ char vuid[MLX5_ST_SZ_BYTES(array1024_auto) + 1];
+ struct list_head list;
+};
+
+static const struct pci_device_id mlx5_data_direct_pci_table[] = {
+ { PCI_VDEVICE(MELLANOX, 0x2100) }, /* ConnectX-8 Data Direct */
+ { 0, }
+};
+
+static int mlx5_data_direct_vpd_get_vuid(struct mlx5_data_direct_dev *dev)
+{
+ struct pci_dev *pdev = dev->pdev;
+ unsigned int vpd_size, kw_len;
+ u8 *vpd_data;
+ int start;
+ int ret;
+
+ vpd_data = pci_vpd_alloc(pdev, &vpd_size);
+ if (IS_ERR(vpd_data)) {
+ pci_err(pdev, "Unable to read VPD, err=%ld\n", PTR_ERR(vpd_data));
+ return PTR_ERR(vpd_data);
+ }
+
+ start = pci_vpd_find_ro_info_keyword(vpd_data, vpd_size, "VU", &kw_len);
+ if (start < 0) {
+ ret = start;
+ pci_err(pdev, "VU keyword not found, err=%d\n", ret);
+ goto end;
+ }
+
+ dev->vuid = kmemdup_nul(vpd_data + start, kw_len, GFP_KERNEL);
+ ret = dev->vuid ? 0 : -ENOMEM;
+
+end:
+ kfree(vpd_data);
+ return ret;
+}
+
+static void mlx5_data_direct_shutdown(struct pci_dev *pdev)
+{
+ pci_disable_device(pdev);
+}
+
+static int mlx5_data_direct_set_dma_caps(struct pci_dev *pdev)
+{
+ int err;
+
+ err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
+ if (err) {
+ dev_warn(&pdev->dev,
+ "Warning: couldn't set 64-bit PCI DMA mask, err=%d\n", err);
+ err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+ if (err) {
+ dev_err(&pdev->dev, "Can't set PCI DMA mask, err=%d\n", err);
+ return err;
+ }
+ }
+
+ dma_set_max_seg_size(&pdev->dev, SZ_2G);
+ return 0;
+}
+
+int mlx5_data_direct_ib_reg(struct mlx5_ib_dev *ibdev, char *vuid)
+{
+ struct mlx5_data_direct_registration *reg;
+ struct mlx5_data_direct_dev *dev;
+
+ reg = kzalloc(sizeof(*reg), GFP_KERNEL);
+ if (!reg)
+ return -ENOMEM;
+
+ reg->ibdev = ibdev;
+ strcpy(reg->vuid, vuid);
+
+ mutex_lock(&mlx5_data_direct_mutex);
+ list_for_each_entry(dev, &mlx5_data_direct_dev_list, list) {
+ if (strcmp(dev->vuid, vuid) == 0) {
+ mlx5_ib_data_direct_bind(ibdev, dev);
+ break;
+ }
+ }
+
+ /* Add the registration to its global list, to be used upon bind/unbind
+ * of its affiliated data direct device
+ */
+ list_add_tail(&reg->list, &mlx5_data_direct_reg_list);
+ mutex_unlock(&mlx5_data_direct_mutex);
+ return 0;
+}
+
+void mlx5_data_direct_ib_unreg(struct mlx5_ib_dev *ibdev)
+{
+ struct mlx5_data_direct_registration *reg;
+
+ mutex_lock(&mlx5_data_direct_mutex);
+ list_for_each_entry(reg, &mlx5_data_direct_reg_list, list) {
+ if (reg->ibdev == ibdev) {
+ list_del(&reg->list);
+ kfree(reg);
+ goto end;
+ }
+ }
+
+ WARN_ON(true);
+end:
+ mutex_unlock(&mlx5_data_direct_mutex);
+}
+
+static void mlx5_data_direct_dev_reg(struct mlx5_data_direct_dev *dev)
+{
+ struct mlx5_data_direct_registration *reg;
+
+ mutex_lock(&mlx5_data_direct_mutex);
+ list_for_each_entry(reg, &mlx5_data_direct_reg_list, list) {
+ if (strcmp(dev->vuid, reg->vuid) == 0)
+ mlx5_ib_data_direct_bind(reg->ibdev, dev);
+ }
+
+ /* Add the data direct device to the global list, further IB devices may
+ * use it later as well
+ */
+ list_add_tail(&dev->list, &mlx5_data_direct_dev_list);
+ mutex_unlock(&mlx5_data_direct_mutex);
+}
+
+static void mlx5_data_direct_dev_unreg(struct mlx5_data_direct_dev *dev)
+{
+ struct mlx5_data_direct_registration *reg;
+
+ mutex_lock(&mlx5_data_direct_mutex);
+ /* Prevent any further affiliations */
+ list_del(&dev->list);
+ list_for_each_entry(reg, &mlx5_data_direct_reg_list, list) {
+ if (strcmp(dev->vuid, reg->vuid) == 0)
+ mlx5_ib_data_direct_unbind(reg->ibdev);
+ }
+ mutex_unlock(&mlx5_data_direct_mutex);
+}
+
+static int mlx5_data_direct_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+ struct mlx5_data_direct_dev *dev;
+ int err;
+
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
+
+ dev->device = &pdev->dev;
+ dev->pdev = pdev;
+
+ pci_set_drvdata(dev->pdev, dev);
+ err = pci_enable_device(pdev);
+ if (err) {
+ dev_err(dev->device, "Cannot enable PCI device, err=%d\n", err);
+ goto err;
+ }
+
+ pci_set_master(pdev);
+ err = mlx5_data_direct_set_dma_caps(pdev);
+ if (err)
+ goto err_disable;
+
+ if (pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP32) &&
+ pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP64) &&
+ pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP128))
+ dev_dbg(dev->device, "Enabling pci atomics failed\n");
+
+ err = mlx5_data_direct_vpd_get_vuid(dev);
+ if (err)
+ goto err_disable;
+
+ mlx5_data_direct_dev_reg(dev);
+ return 0;
+
+err_disable:
+ pci_disable_device(pdev);
+err:
+ kfree(dev);
+ return err;
+}
+
+static void mlx5_data_direct_remove(struct pci_dev *pdev)
+{
+ struct mlx5_data_direct_dev *dev = pci_get_drvdata(pdev);
+
+ mlx5_data_direct_dev_unreg(dev);
+ pci_disable_device(pdev);
+ kfree(dev->vuid);
+ kfree(dev);
+}
+
+static struct pci_driver mlx5_data_direct_driver = {
+ .name = KBUILD_MODNAME,
+ .id_table = mlx5_data_direct_pci_table,
+ .probe = mlx5_data_direct_probe,
+ .remove = mlx5_data_direct_remove,
+ .shutdown = mlx5_data_direct_shutdown,
+};
+
+int mlx5_data_direct_driver_register(void)
+{
+ return pci_register_driver(&mlx5_data_direct_driver);
+}
+
+void mlx5_data_direct_driver_unregister(void)
+{
+ pci_unregister_driver(&mlx5_data_direct_driver);
+}
diff --git a/drivers/infiniband/hw/mlx5/data_direct.h b/drivers/infiniband/hw/mlx5/data_direct.h
new file mode 100644
index 000000000000..2fd2bdbe8f69
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/data_direct.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#ifndef _MLX5_IB_DATA_DIRECT_H
+#define _MLX5_IB_DATA_DIRECT_H
+
+struct mlx5_ib_dev;
+
+struct mlx5_data_direct_dev {
+ struct device *device;
+ struct pci_dev *pdev;
+ char *vuid;
+ struct list_head list;
+};
+
+int mlx5_data_direct_ib_reg(struct mlx5_ib_dev *ibdev, char *vuid);
+void mlx5_data_direct_ib_unreg(struct mlx5_ib_dev *ibdev);
+int mlx5_data_direct_driver_register(void);
+void mlx5_data_direct_driver_unregister(void);
+
+#endif
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 253fea374a72..69999d8d24f3 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -2673,7 +2673,6 @@ static const struct file_operations devx_async_cmd_event_fops = {
.read = devx_async_cmd_event_read,
.poll = devx_async_cmd_event_poll,
.release = uverbs_uobject_fd_release,
- .llseek = no_llseek,
};
static ssize_t devx_async_event_read(struct file *filp, char __user *buf,
@@ -2788,7 +2787,6 @@ static const struct file_operations devx_async_event_fops = {
.read = devx_async_event_read,
.poll = devx_async_event_poll,
.release = uverbs_uobject_fd_release,
- .llseek = no_llseek,
};
static void devx_async_cmd_event_destroy_uobj(struct ib_uobject *uobj,
diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c
index c7a4ee896121..49af1cfbe6d1 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.c
+++ b/drivers/infiniband/hw/mlx5/ib_rep.c
@@ -13,6 +13,7 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev,
int vport_index)
{
struct mlx5_ib_dev *ibdev;
+ struct net_device *ndev;
ibdev = mlx5_eswitch_uplink_get_proto_dev(dev->priv.eswitch, REP_IB);
if (!ibdev)
@@ -20,12 +21,9 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev,
ibdev->port[vport_index].rep = rep;
rep->rep_data[REP_IB].priv = ibdev;
- write_lock(&ibdev->port[vport_index].roce.netdev_lock);
- ibdev->port[vport_index].roce.netdev =
- mlx5_ib_get_rep_netdev(rep->esw, rep->vport);
- write_unlock(&ibdev->port[vport_index].roce.netdev_lock);
+ ndev = mlx5_ib_get_rep_netdev(rep->esw, rep->vport);
- return 0;
+ return ib_device_set_netdev(&ibdev->ib_dev, ndev, vport_index + 1);
}
static void mlx5_ib_register_peer_vport_reps(struct mlx5_core_dev *mdev);
@@ -104,10 +102,15 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
ibdev->is_rep = true;
vport_index = rep->vport_index;
ibdev->port[vport_index].rep = rep;
- ibdev->port[vport_index].roce.netdev =
- mlx5_ib_get_rep_netdev(lag_master->priv.eswitch, rep->vport);
ibdev->mdev = lag_master;
ibdev->num_ports = num_ports;
+ ibdev->ib_dev.phys_port_cnt = num_ports;
+ ret = ib_device_set_netdev(&ibdev->ib_dev,
+ mlx5_ib_get_rep_netdev(lag_master->priv.eswitch,
+ rep->vport),
+ vport_index + 1);
+ if (ret)
+ goto fail_add;
ret = __mlx5_ib_add(ibdev, profile);
if (ret)
@@ -160,9 +163,8 @@ mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep)
}
port = &dev->port[vport_index];
- write_lock(&port->roce.netdev_lock);
- port->roce.netdev = NULL;
- write_unlock(&port->roce.netdev_lock);
+
+ ib_device_set_netdev(&dev->ib_dev, NULL, vport_index + 1);
rep->rep_data[REP_IB].priv = NULL;
port->rep = NULL;
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 6048b9ad13bb..4999239c8f41 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -48,6 +48,7 @@
#include <rdma/mlx5_user_ioctl_verbs.h>
#include <rdma/mlx5_user_ioctl_cmds.h>
#include "macsec.h"
+#include "data_direct.h"
#define UVERBS_MODULE_NAME mlx5_ib
#include <rdma/uverbs_named_ioctl.h>
@@ -146,16 +147,52 @@ static struct mlx5_roce *mlx5_get_rep_roce(struct mlx5_ib_dev *dev,
if (upper && port->rep->vport == MLX5_VPORT_UPLINK)
continue;
-
- read_lock(&port->roce.netdev_lock);
- rep_ndev = mlx5_ib_get_rep_netdev(port->rep->esw,
- port->rep->vport);
- if (rep_ndev == ndev) {
- read_unlock(&port->roce.netdev_lock);
+ rep_ndev = ib_device_get_netdev(&dev->ib_dev, i + 1);
+ if (rep_ndev && rep_ndev == ndev) {
+ dev_put(rep_ndev);
*port_num = i + 1;
return &port->roce;
}
- read_unlock(&port->roce.netdev_lock);
+
+ dev_put(rep_ndev);
+ }
+
+ return NULL;
+}
+
+static bool mlx5_netdev_send_event(struct mlx5_ib_dev *dev,
+ struct net_device *ndev,
+ struct net_device *upper,
+ struct net_device *ib_ndev)
+{
+ if (!dev->ib_active)
+ return false;
+
+ /* Event is about our upper device */
+ if (upper == ndev)
+ return true;
+
+ /* RDMA device is not in lag and not in switchdev */
+ if (!dev->is_rep && !upper && ndev == ib_ndev)
+ return true;
+
+ /* RDMA devie is in switchdev */
+ if (dev->is_rep && ndev == ib_ndev)
+ return true;
+
+ return false;
+}
+
+static struct net_device *mlx5_ib_get_rep_uplink_netdev(struct mlx5_ib_dev *ibdev)
+{
+ struct mlx5_ib_port *port;
+ int i;
+
+ for (i = 0; i < ibdev->num_ports; i++) {
+ port = &ibdev->port[i];
+ if (port->rep && port->rep->vport == MLX5_VPORT_UPLINK) {
+ return ib_device_get_netdev(&ibdev->ib_dev, i + 1);
+ }
}
return NULL;
@@ -167,6 +204,7 @@ static int mlx5_netdev_event(struct notifier_block *this,
struct mlx5_roce *roce = container_of(this, struct mlx5_roce, nb);
struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
u32 port_num = roce->native_port_num;
+ struct net_device *ib_ndev = NULL;
struct mlx5_core_dev *mdev;
struct mlx5_ib_dev *ibdev;
@@ -180,47 +218,63 @@ static int mlx5_netdev_event(struct notifier_block *this,
/* Should already be registered during the load */
if (ibdev->is_rep)
break;
- write_lock(&roce->netdev_lock);
+
+ ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num);
+ /* Exit if already registered */
+ if (ib_ndev)
+ goto put_ndev;
+
if (ndev->dev.parent == mdev->device)
- roce->netdev = ndev;
- write_unlock(&roce->netdev_lock);
+ ib_device_set_netdev(&ibdev->ib_dev, ndev, port_num);
break;
case NETDEV_UNREGISTER:
/* In case of reps, ib device goes away before the netdevs */
- write_lock(&roce->netdev_lock);
- if (roce->netdev == ndev)
- roce->netdev = NULL;
- write_unlock(&roce->netdev_lock);
- break;
+ if (ibdev->is_rep)
+ break;
+ ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num);
+ if (ib_ndev == ndev)
+ ib_device_set_netdev(&ibdev->ib_dev, NULL, port_num);
+ goto put_ndev;
case NETDEV_CHANGE:
case NETDEV_UP:
case NETDEV_DOWN: {
- struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(mdev);
struct net_device *upper = NULL;
- if (lag_ndev) {
- upper = netdev_master_upper_dev_get(lag_ndev);
- dev_put(lag_ndev);
+ if (mlx5_lag_is_roce(mdev) || mlx5_lag_is_sriov(mdev)) {
+ struct net_device *lag_ndev;
+
+ if(mlx5_lag_is_roce(mdev))
+ lag_ndev = ib_device_get_netdev(&ibdev->ib_dev, 1);
+ else /* sriov lag */
+ lag_ndev = mlx5_ib_get_rep_uplink_netdev(ibdev);
+
+ if (lag_ndev) {
+ upper = netdev_master_upper_dev_get(lag_ndev);
+ dev_put(lag_ndev);
+ } else {
+ goto done;
+ }
}
if (ibdev->is_rep)
roce = mlx5_get_rep_roce(ibdev, ndev, upper, &port_num);
if (!roce)
return NOTIFY_DONE;
- if ((upper == ndev ||
- ((!upper || ibdev->is_rep) && ndev == roce->netdev)) &&
- ibdev->ib_active) {
+
+ ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num);
+
+ if (mlx5_netdev_send_event(ibdev, ndev, upper, ib_ndev)) {
struct ib_event ibev = { };
enum ib_port_state port_state;
if (get_port_state(&ibdev->ib_dev, port_num,
&port_state))
- goto done;
+ goto put_ndev;
if (roce->last_port_state == port_state)
- goto done;
+ goto put_ndev;
roce->last_port_state = port_state;
ibev.device = &ibdev->ib_dev;
@@ -229,7 +283,7 @@ static int mlx5_netdev_event(struct notifier_block *this,
else if (port_state == IB_PORT_ACTIVE)
ibev.event = IB_EVENT_PORT_ACTIVE;
else
- goto done;
+ goto put_ndev;
ibev.element.port_num = port_num;
ib_dispatch_event(&ibev);
@@ -240,38 +294,13 @@ static int mlx5_netdev_event(struct notifier_block *this,
default:
break;
}
+put_ndev:
+ dev_put(ib_ndev);
done:
mlx5_ib_put_native_port_mdev(ibdev, port_num);
return NOTIFY_DONE;
}
-static struct net_device *mlx5_ib_get_netdev(struct ib_device *device,
- u32 port_num)
-{
- struct mlx5_ib_dev *ibdev = to_mdev(device);
- struct net_device *ndev;
- struct mlx5_core_dev *mdev;
-
- mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL);
- if (!mdev)
- return NULL;
-
- ndev = mlx5_lag_get_roce_netdev(mdev);
- if (ndev)
- goto out;
-
- /* Ensure ndev does not disappear before we invoke dev_hold()
- */
- read_lock(&ibdev->port[port_num - 1].roce.netdev_lock);
- ndev = ibdev->port[port_num - 1].roce.netdev;
- dev_hold(ndev);
- read_unlock(&ibdev->port[port_num - 1].roce.netdev_lock);
-
-out:
- mlx5_ib_put_native_port_mdev(ibdev, port_num);
- return ndev;
-}
-
struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev,
u32 ib_port_num,
u32 *native_port_num)
@@ -546,11 +575,11 @@ static int mlx5_query_port_roce(struct ib_device *device, u32 port_num,
if (!put_mdev)
goto out;
- ndev = mlx5_ib_get_netdev(device, port_num);
+ ndev = ib_device_get_netdev(device, port_num);
if (!ndev)
goto out;
- if (dev->lag_active) {
+ if (mlx5_lag_is_roce(mdev) || mlx5_lag_is_sriov(mdev)) {
rcu_read_lock();
upper = netdev_master_upper_dev_get_rcu(ndev);
if (upper) {
@@ -3024,6 +3053,59 @@ static void mlx5_ib_dev_res_cleanup(struct mlx5_ib_dev *dev)
mutex_destroy(&devr->srq_lock);
}
+static int
+mlx5_ib_create_data_direct_resources(struct mlx5_ib_dev *dev)
+{
+ int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
+ struct mlx5_core_dev *mdev = dev->mdev;
+ void *mkc;
+ u32 mkey;
+ u32 pdn;
+ u32 *in;
+ int err;
+
+ err = mlx5_core_alloc_pd(mdev, &pdn);
+ if (err)
+ return err;
+
+ in = kvzalloc(inlen, GFP_KERNEL);
+ if (!in) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ MLX5_SET(create_mkey_in, in, data_direct, 1);
+ mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+ MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
+ MLX5_SET(mkc, mkc, lw, 1);
+ MLX5_SET(mkc, mkc, lr, 1);
+ MLX5_SET(mkc, mkc, rw, 1);
+ MLX5_SET(mkc, mkc, rr, 1);
+ MLX5_SET(mkc, mkc, a, 1);
+ MLX5_SET(mkc, mkc, pd, pdn);
+ MLX5_SET(mkc, mkc, length64, 1);
+ MLX5_SET(mkc, mkc, qpn, 0xffffff);
+ err = mlx5_core_create_mkey(mdev, &mkey, in, inlen);
+ kvfree(in);
+ if (err)
+ goto err;
+
+ dev->ddr.mkey = mkey;
+ dev->ddr.pdn = pdn;
+ return 0;
+
+err:
+ mlx5_core_dealloc_pd(mdev, pdn);
+ return err;
+}
+
+static void
+mlx5_ib_free_data_direct_resources(struct mlx5_ib_dev *dev)
+{
+ mlx5_core_destroy_mkey(dev->mdev, dev->ddr.mkey);
+ mlx5_core_dealloc_pd(dev->mdev, dev->ddr.pdn);
+}
+
static u32 get_core_cap_flags(struct ib_device *ibdev,
struct mlx5_hca_vport_context *rep)
{
@@ -3124,6 +3206,60 @@ static void get_dev_fw_str(struct ib_device *ibdev, char *str)
fw_rev_sub(dev->mdev));
}
+static int lag_event(struct notifier_block *nb, unsigned long event, void *data)
+{
+ struct mlx5_ib_dev *dev = container_of(nb, struct mlx5_ib_dev,
+ lag_events);
+ struct mlx5_core_dev *mdev = dev->mdev;
+ struct mlx5_ib_port *port;
+ struct net_device *ndev;
+ int i, err;
+ int portnum;
+
+ portnum = 0;
+ switch (event) {
+ case MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE:
+ ndev = data;
+ if (ndev) {
+ if (!mlx5_lag_is_roce(mdev)) {
+ // sriov lag
+ for (i = 0; i < dev->num_ports; i++) {
+ port = &dev->port[i];
+ if (port->rep && port->rep->vport ==
+ MLX5_VPORT_UPLINK) {
+ portnum = i;
+ break;
+ }
+ }
+ }
+ err = ib_device_set_netdev(&dev->ib_dev, ndev,
+ portnum + 1);
+ dev_put(ndev);
+ if (err)
+ return err;
+ /* Rescan gids after new netdev assignment */
+ rdma_roce_rescan_device(&dev->ib_dev);
+ }
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+ return NOTIFY_OK;
+}
+
+static void mlx5e_lag_event_register(struct mlx5_ib_dev *dev)
+{
+ dev->lag_events.notifier_call = lag_event;
+ blocking_notifier_chain_register(&dev->mdev->priv.lag_nh,
+ &dev->lag_events);
+}
+
+static void mlx5e_lag_event_unregister(struct mlx5_ib_dev *dev)
+{
+ blocking_notifier_chain_unregister(&dev->mdev->priv.lag_nh,
+ &dev->lag_events);
+}
+
static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev)
{
struct mlx5_core_dev *mdev = dev->mdev;
@@ -3145,6 +3281,7 @@ static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev)
goto err_destroy_vport_lag;
}
+ mlx5e_lag_event_register(dev);
dev->flow_db->lag_demux_ft = ft;
dev->lag_ports = mlx5_lag_get_num_ports(mdev);
dev->lag_active = true;
@@ -3162,6 +3299,7 @@ static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev)
if (dev->lag_active) {
dev->lag_active = false;
+ mlx5e_lag_event_unregister(dev);
mlx5_destroy_flow_table(dev->flow_db->lag_demux_ft);
dev->flow_db->lag_demux_ft = NULL;
@@ -3420,6 +3558,41 @@ unbind:
return false;
}
+static int mlx5_ib_data_direct_init(struct mlx5_ib_dev *dev)
+{
+ char vuid[MLX5_ST_SZ_BYTES(array1024_auto) + 1] = {};
+ int ret;
+
+ if (!MLX5_CAP_GEN(dev->mdev, data_direct) ||
+ !MLX5_CAP_GEN_2(dev->mdev, query_vuid))
+ return 0;
+
+ ret = mlx5_cmd_query_vuid(dev->mdev, true, vuid);
+ if (ret)
+ return ret;
+
+ ret = mlx5_ib_create_data_direct_resources(dev);
+ if (ret)
+ return ret;
+
+ INIT_LIST_HEAD(&dev->data_direct_mr_list);
+ ret = mlx5_data_direct_ib_reg(dev, vuid);
+ if (ret)
+ mlx5_ib_free_data_direct_resources(dev);
+
+ return ret;
+}
+
+static void mlx5_ib_data_direct_cleanup(struct mlx5_ib_dev *dev)
+{
+ if (!MLX5_CAP_GEN(dev->mdev, data_direct) ||
+ !MLX5_CAP_GEN_2(dev->mdev, query_vuid))
+ return;
+
+ mlx5_data_direct_ib_unreg(dev);
+ mlx5_ib_free_data_direct_resources(dev);
+}
+
static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev)
{
u32 port_num = mlx5_core_native_port_num(dev->mdev) - 1;
@@ -3796,6 +3969,14 @@ ADD_UVERBS_ATTRIBUTES_SIMPLE(
dump_fill_mkey),
UA_MANDATORY));
+ADD_UVERBS_ATTRIBUTES_SIMPLE(
+ mlx5_ib_reg_dmabuf_mr,
+ UVERBS_OBJECT_MR,
+ UVERBS_METHOD_REG_DMABUF_MR,
+ UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS,
+ enum mlx5_ib_uapi_reg_dmabuf_flags,
+ UA_OPTIONAL));
+
static const struct uapi_definition mlx5_ib_defs[] = {
UAPI_DEF_CHAIN(mlx5_ib_devx_defs),
UAPI_DEF_CHAIN(mlx5_ib_flow_defs),
@@ -3805,6 +3986,7 @@ static const struct uapi_definition mlx5_ib_defs[] = {
UAPI_DEF_CHAIN(mlx5_ib_create_cq_defs),
UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DEVICE, &mlx5_ib_query_context),
+ UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_MR, &mlx5_ib_reg_dmabuf_mr),
UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_VAR,
UAPI_DEF_IS_OBJ_SUPPORTED(var_is_supported)),
UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_UAR),
@@ -3813,6 +3995,7 @@ static const struct uapi_definition mlx5_ib_defs[] = {
static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
{
+ mlx5_ib_data_direct_cleanup(dev);
mlx5_ib_cleanup_multiport_master(dev);
WARN_ON(!xa_empty(&dev->odp_mkeys));
mutex_destroy(&dev->cap_mask_mutex);
@@ -3828,13 +4011,11 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
dev->ib_dev.node_type = RDMA_NODE_IB_CA;
dev->ib_dev.local_dma_lkey = 0 /* not supported for now */;
- dev->ib_dev.phys_port_cnt = dev->num_ports;
dev->ib_dev.dev.parent = mdev->device;
dev->ib_dev.lag_flags = RDMA_LAG_FLAGS_HASH_ALL_SLAVES;
for (i = 0; i < dev->num_ports; i++) {
spin_lock_init(&dev->port[i].mp.mpi_lock);
- rwlock_init(&dev->port[i].roce.netdev_lock);
dev->port[i].roce.dev = dev;
dev->port[i].roce.native_port_num = i + 1;
dev->port[i].roce.last_port_state = IB_PORT_DOWN;
@@ -3866,6 +4047,7 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
dev->ib_dev.num_comp_vectors = mlx5_comp_vectors_max(mdev);
mutex_init(&dev->cap_mask_mutex);
+ mutex_init(&dev->data_direct_lock);
INIT_LIST_HEAD(&dev->qp_list);
spin_lock_init(&dev->reset_flow_resource_lock);
xa_init(&dev->odp_mkeys);
@@ -3874,6 +4056,10 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
spin_lock_init(&dev->dm.lock);
dev->dm.dev = mdev;
+ err = mlx5_ib_data_direct_init(dev);
+ if (err)
+ goto err_mp;
+
return 0;
err_mp:
mlx5_ib_cleanup_multiport_master(dev);
@@ -4094,7 +4280,6 @@ static const struct ib_device_ops mlx5_ib_dev_common_roce_ops = {
.create_wq = mlx5_ib_create_wq,
.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table,
.destroy_wq = mlx5_ib_destroy_wq,
- .get_netdev = mlx5_ib_get_netdev,
.modify_wq = mlx5_ib_modify_wq,
INIT_RDMA_OBJ_SIZE(ib_rwq_ind_table, mlx5_ib_rwq_ind_table,
@@ -4293,6 +4478,22 @@ static void mlx5_ib_stage_dev_notifier_cleanup(struct mlx5_ib_dev *dev)
mlx5_notifier_unregister(dev->mdev, &dev->mdev_events);
}
+void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev,
+ struct mlx5_data_direct_dev *dev)
+{
+ mutex_lock(&ibdev->data_direct_lock);
+ ibdev->data_direct_dev = dev;
+ mutex_unlock(&ibdev->data_direct_lock);
+}
+
+void mlx5_ib_data_direct_unbind(struct mlx5_ib_dev *ibdev)
+{
+ mutex_lock(&ibdev->data_direct_lock);
+ mlx5_ib_revoke_data_direct_mrs(ibdev);
+ ibdev->data_direct_dev = NULL;
+ mutex_unlock(&ibdev->data_direct_lock);
+}
+
void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
const struct mlx5_ib_profile *profile,
int stage)
@@ -4522,6 +4723,7 @@ static struct ib_device *mlx5_ib_add_sub_dev(struct ib_device *parent,
mplane->mdev = mparent->mdev;
mplane->num_ports = mparent->num_plane;
mplane->sub_dev_name = name;
+ mplane->ib_dev.phys_port_cnt = mplane->num_ports;
ret = __mlx5_ib_add(mplane, &plane_profile);
if (ret)
@@ -4638,6 +4840,7 @@ static int mlx5r_probe(struct auxiliary_device *adev,
dev->mdev = mdev;
dev->num_ports = num_ports;
+ dev->ib_dev.phys_port_cnt = num_ports;
if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_get_roce_state(mdev))
profile = &raw_eth_profile;
@@ -4715,17 +4918,23 @@ static int __init mlx5_ib_init(void)
ret = mlx5r_rep_init();
if (ret)
goto rep_err;
+ ret = mlx5_data_direct_driver_register();
+ if (ret)
+ goto dd_err;
ret = auxiliary_driver_register(&mlx5r_mp_driver);
if (ret)
goto mp_err;
ret = auxiliary_driver_register(&mlx5r_driver);
if (ret)
goto drv_err;
+
return 0;
drv_err:
auxiliary_driver_unregister(&mlx5r_mp_driver);
mp_err:
+ mlx5_data_direct_driver_unregister();
+dd_err:
mlx5r_rep_cleanup();
rep_err:
mlx5_ib_qp_event_cleanup();
@@ -4737,6 +4946,7 @@ qp_event_err:
static void __exit mlx5_ib_cleanup(void)
{
+ mlx5_data_direct_driver_unregister();
auxiliary_driver_unregister(&mlx5r_driver);
auxiliary_driver_unregister(&mlx5r_mp_driver);
mlx5r_rep_cleanup();
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index d5eb1b726675..23fd72f7f63d 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -63,17 +63,6 @@ __mlx5_log_page_size_to_bitmap(unsigned int log_pgsz_bits,
return GENMASK(largest_pg_shift, pgsz_shift);
}
-/*
- * For mkc users, instead of a page_offset the command has a start_iova which
- * specifies both the page_offset and the on-the-wire IOVA
- */
-#define mlx5_umem_find_best_pgsz(umem, typ, log_pgsz_fld, pgsz_shift, iova) \
- ib_umem_find_best_pgsz(umem, \
- __mlx5_log_page_size_to_bitmap( \
- __mlx5_bit_sz(typ, log_pgsz_fld), \
- pgsz_shift), \
- iova)
-
static __always_inline unsigned long
__mlx5_page_offset_to_bitmask(unsigned int page_offset_bits,
unsigned int offset_shift)
@@ -640,6 +629,8 @@ enum mlx5_mkey_type {
MLX5_MKEY_MR = 1,
MLX5_MKEY_MW,
MLX5_MKEY_INDIRECT_DEVX,
+ MLX5_MKEY_NULL,
+ MLX5_MKEY_IMPLICIT_CHILD,
};
struct mlx5r_cache_rb_key {
@@ -682,6 +673,8 @@ struct mlx5_ib_mr {
struct mlx5_ib_mkey mmkey;
struct ib_umem *umem;
+ /* The mr is data direct related */
+ u8 data_direct :1;
union {
/* Used only by kernel MRs (umem == NULL) */
@@ -719,6 +712,11 @@ struct mlx5_ib_mr {
} odp_destroy;
struct ib_odp_counters odp_stats;
bool is_odp_implicit;
+ /* The affilated data direct crossed mr */
+ struct mlx5_ib_mr *dd_crossed_mr;
+ struct list_head dd_node;
+ u8 revoked :1;
+ struct mlx5_ib_mkey null_mmkey;
};
};
};
@@ -796,6 +794,7 @@ struct mlx5_cache_ent {
u8 is_tmp:1;
u8 disabled:1;
u8 fill_to_high_water:1;
+ u8 tmp_cleanup_scheduled:1;
/*
* - limit is the low water mark for stored mkeys, 2* limit is the
@@ -827,7 +826,6 @@ struct mlx5_mkey_cache {
struct mutex rb_lock;
struct dentry *fs_root;
unsigned long last_add;
- struct delayed_work remove_ent_dwork;
};
struct mlx5_ib_port_resources {
@@ -835,6 +833,11 @@ struct mlx5_ib_port_resources {
struct work_struct pkey_change_work;
};
+struct mlx5_data_direct_resources {
+ u32 pdn;
+ u32 mkey;
+};
+
struct mlx5_ib_resources {
struct ib_cq *c0;
struct mutex cq_lock;
@@ -885,8 +888,6 @@ struct mlx5_roce {
/* Protect mlx5_ib_get_netdev from invoking dev_hold() with a NULL
* netdev pointer
*/
- rwlock_t netdev_lock;
- struct net_device *netdev;
struct notifier_block nb;
struct netdev_net_notifier nn;
struct notifier_block mdev_nb;
@@ -1131,7 +1132,11 @@ struct mlx5_macsec {
struct mlx5_ib_dev {
struct ib_device ib_dev;
struct mlx5_core_dev *mdev;
+ struct mlx5_data_direct_dev *data_direct_dev;
+ /* protect accessing data_direct_dev */
+ struct mutex data_direct_lock;
struct notifier_block mdev_events;
+ struct notifier_block lag_events;
int num_ports;
/* serialize update of capability mask
*/
@@ -1161,6 +1166,7 @@ struct mlx5_ib_dev {
/* protect resources needed as part of reset flow */
spinlock_t reset_flow_resource_lock;
struct list_head qp_list;
+ struct list_head data_direct_mr_list;
/* Array with num_ports elements */
struct mlx5_ib_port *port;
struct mlx5_sq_bfreg bfreg;
@@ -1185,6 +1191,7 @@ struct mlx5_ib_dev {
u16 pkey_table_len;
u8 lag_ports;
struct mlx5_special_mkeys mkeys;
+ struct mlx5_data_direct_resources ddr;
#ifdef CONFIG_MLX5_MACSEC
struct mlx5_macsec macsec;
@@ -1345,7 +1352,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start,
u64 length, u64 virt_addr,
int fd, int access_flags,
- struct ib_udata *udata);
+ struct uverbs_attr_bundle *attrs);
int mlx5_ib_advise_mr(struct ib_pd *pd,
enum ib_uverbs_advise_mr_advice advice,
u32 flags,
@@ -1356,7 +1363,6 @@ int mlx5_ib_alloc_mw(struct ib_mw *mw, struct ib_udata *udata);
int mlx5_ib_dealloc_mw(struct ib_mw *mw);
struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
int access_flags);
-void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr);
void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr);
struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
u64 length, u64 virt_addr, int access_flags,
@@ -1425,6 +1431,10 @@ int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table);
struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
struct ib_dm_mr_attr *attr,
struct uverbs_attr_bundle *attrs);
+void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev,
+ struct mlx5_data_direct_dev *dev);
+void mlx5_ib_data_direct_unbind(struct mlx5_ib_dev *ibdev);
+void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev);
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
@@ -1633,8 +1643,6 @@ static inline void mlx5r_deref_wait_odp_mkey(struct mlx5_ib_mkey *mmkey)
wait_event(mmkey->wait, refcount_read(&mmkey->usecount) == 0);
}
-int mlx5_ib_test_wc(struct mlx5_ib_dev *dev);
-
static inline bool mlx5_ib_lag_should_assign_affinity(struct mlx5_ib_dev *dev)
{
/*
@@ -1707,4 +1715,20 @@ static inline u32 smi_to_native_portnum(struct mlx5_ib_dev *dev, u32 port)
return (port - 1) / dev->num_ports + 1;
}
+/*
+ * For mkc users, instead of a page_offset the command has a start_iova which
+ * specifies both the page_offset and the on-the-wire IOVA
+ */
+static __always_inline unsigned long
+mlx5_umem_mkc_find_best_pgsz(struct mlx5_ib_dev *dev, struct ib_umem *umem,
+ u64 iova)
+{
+ int page_size_bits =
+ MLX5_CAP_GEN_2(dev->mdev, umr_log_entity_size_5) ? 6 : 5;
+ unsigned long bitmap =
+ __mlx5_log_page_size_to_bitmap(page_size_bits, 0);
+
+ return ib_umem_find_best_pgsz(umem, bitmap, iova);
+}
+
#endif /* MLX5_IB_H */
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 98bd8eaa393e..45d9dc9c6c8f 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -43,18 +43,22 @@
#include "dm.h"
#include "mlx5_ib.h"
#include "umr.h"
+#include "data_direct.h"
enum {
MAX_PENDING_REG_MR = 8,
};
+#define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4
#define MLX5_UMR_ALIGN 2048
static void
create_mkey_callback(int status, struct mlx5_async_work *context);
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
u64 iova, int access_flags,
- unsigned int page_size, bool populate);
+ unsigned int page_size, bool populate,
+ int access_mode);
+static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr);
static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
struct ib_pd *pd)
@@ -211,9 +215,9 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
push_mkey_locked(ent, mkey_out->mkey);
+ ent->pending--;
/* If we are doing fill_to_high_water then keep going. */
queue_adjust_cache_locked(ent);
- ent->pending--;
spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
kfree(mkey_out);
}
@@ -527,6 +531,21 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
}
}
+static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
+{
+ u32 mkey;
+
+ spin_lock_irq(&ent->mkeys_queue.lock);
+ while (ent->mkeys_queue.ci) {
+ mkey = pop_mkey_locked(ent);
+ spin_unlock_irq(&ent->mkeys_queue.lock);
+ mlx5_core_destroy_mkey(dev->mdev, mkey);
+ spin_lock_irq(&ent->mkeys_queue.lock);
+ }
+ ent->tmp_cleanup_scheduled = false;
+ spin_unlock_irq(&ent->mkeys_queue.lock);
+}
+
static void __cache_work_func(struct mlx5_cache_ent *ent)
{
struct mlx5_ib_dev *dev = ent->dev;
@@ -598,7 +617,11 @@ static void delayed_cache_work_func(struct work_struct *work)
struct mlx5_cache_ent *ent;
ent = container_of(work, struct mlx5_cache_ent, dwork.work);
- __cache_work_func(ent);
+ /* temp entries are never filled, only cleaned */
+ if (ent->is_tmp)
+ clean_keys(ent->dev, ent);
+ else
+ __cache_work_func(ent);
}
static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1,
@@ -659,6 +682,7 @@ mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev,
{
struct rb_node *node = dev->cache.rb_root.rb_node;
struct mlx5_cache_ent *cur, *smallest = NULL;
+ u64 ndescs_limit;
int cmp;
/*
@@ -677,10 +701,18 @@ mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev,
return cur;
}
+ /*
+ * Limit the usage of mkeys larger than twice the required size while
+ * also allowing the usage of smallest cache entry for small MRs.
+ */
+ ndescs_limit = max_t(u64, rb_key.ndescs * 2,
+ MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS);
+
return (smallest &&
smallest->rb_key.access_mode == rb_key.access_mode &&
smallest->rb_key.access_flags == rb_key.access_flags &&
- smallest->rb_key.ats == rb_key.ats) ?
+ smallest->rb_key.ats == rb_key.ats &&
+ smallest->rb_key.ndescs <= ndescs_limit) ?
smallest :
NULL;
}
@@ -765,21 +797,6 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
return _mlx5_mr_cache_alloc(dev, ent, access_flags);
}
-static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
-{
- u32 mkey;
-
- cancel_delayed_work(&ent->dwork);
- spin_lock_irq(&ent->mkeys_queue.lock);
- while (ent->mkeys_queue.ci) {
- mkey = pop_mkey_locked(ent);
- spin_unlock_irq(&ent->mkeys_queue.lock);
- mlx5_core_destroy_mkey(dev->mdev, mkey);
- spin_lock_irq(&ent->mkeys_queue.lock);
- }
- spin_unlock_irq(&ent->mkeys_queue.lock);
-}
-
static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
{
if (!mlx5_debugfs_root || dev->is_rep)
@@ -892,10 +909,6 @@ mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
ent->limit = 0;
mlx5_mkey_cache_debugfs_add_ent(dev, ent);
- } else {
- mod_delayed_work(ent->dev->cache.wq,
- &ent->dev->cache.remove_ent_dwork,
- msecs_to_jiffies(30 * 1000));
}
return ent;
@@ -906,35 +919,6 @@ mkeys_err:
return ERR_PTR(ret);
}
-static void remove_ent_work_func(struct work_struct *work)
-{
- struct mlx5_mkey_cache *cache;
- struct mlx5_cache_ent *ent;
- struct rb_node *cur;
-
- cache = container_of(work, struct mlx5_mkey_cache,
- remove_ent_dwork.work);
- mutex_lock(&cache->rb_lock);
- cur = rb_last(&cache->rb_root);
- while (cur) {
- ent = rb_entry(cur, struct mlx5_cache_ent, node);
- cur = rb_prev(cur);
- mutex_unlock(&cache->rb_lock);
-
- spin_lock_irq(&ent->mkeys_queue.lock);
- if (!ent->is_tmp) {
- spin_unlock_irq(&ent->mkeys_queue.lock);
- mutex_lock(&cache->rb_lock);
- continue;
- }
- spin_unlock_irq(&ent->mkeys_queue.lock);
-
- clean_keys(ent->dev, ent);
- mutex_lock(&cache->rb_lock);
- }
- mutex_unlock(&cache->rb_lock);
-}
-
int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
{
struct mlx5_mkey_cache *cache = &dev->cache;
@@ -950,7 +934,6 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
mutex_init(&dev->slow_path_mutex);
mutex_init(&dev->cache.rb_lock);
dev->cache.rb_root = RB_ROOT;
- INIT_DELAYED_WORK(&dev->cache.remove_ent_dwork, remove_ent_work_func);
cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
if (!cache->wq) {
mlx5_ib_warn(dev, "failed to create work queue\n");
@@ -962,7 +945,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
mlx5_mkey_cache_debugfs_init(dev);
mutex_lock(&cache->rb_lock);
for (i = 0; i <= mkey_cache_max_order(dev); i++) {
- rb_key.ndescs = 1 << (i + 2);
+ rb_key.ndescs = MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS << i;
ent = mlx5r_cache_create_ent_locked(dev, rb_key, true);
if (IS_ERR(ent)) {
ret = PTR_ERR(ent);
@@ -1001,7 +984,6 @@ void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
return;
mutex_lock(&dev->cache.rb_lock);
- cancel_delayed_work(&dev->cache.remove_ent_dwork);
for (node = rb_first(root); node; node = rb_next(node)) {
ent = rb_entry(node, struct mlx5_cache_ent, node);
spin_lock_irq(&ent->mkeys_queue.lock);
@@ -1062,6 +1044,7 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
MLX5_SET(mkc, mkc, length64, 1);
set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0,
pd);
+ MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));
err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
if (err)
@@ -1126,12 +1109,10 @@ static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
struct ib_umem *umem, u64 iova,
- int access_flags)
+ int access_flags, int access_mode)
{
- struct mlx5r_cache_rb_key rb_key = {
- .access_mode = MLX5_MKC_ACCESS_MODE_MTT,
- };
struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ struct mlx5r_cache_rb_key rb_key = {};
struct mlx5_cache_ent *ent;
struct mlx5_ib_mr *mr;
unsigned int page_size;
@@ -1139,11 +1120,11 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
if (umem->is_dmabuf)
page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
else
- page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size,
- 0, iova);
+ page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova);
if (WARN_ON(!page_size))
return ERR_PTR(-EINVAL);
+ rb_key.access_mode = access_mode;
rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size);
rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags);
rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags);
@@ -1154,7 +1135,7 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
*/
if (!ent) {
mutex_lock(&dev->slow_path_mutex);
- mr = reg_create(pd, umem, iova, access_flags, page_size, false);
+ mr = reg_create(pd, umem, iova, access_flags, page_size, false, access_mode);
mutex_unlock(&dev->slow_path_mutex);
if (IS_ERR(mr))
return mr;
@@ -1175,13 +1156,71 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
return mr;
}
+static struct ib_mr *
+reg_create_crossing_vhca_mr(struct ib_pd *pd, u64 iova, u64 length, int access_flags,
+ u32 crossed_lkey)
+{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ int access_mode = MLX5_MKC_ACCESS_MODE_CROSSING;
+ struct mlx5_ib_mr *mr;
+ void *mkc;
+ int inlen;
+ u32 *in;
+ int err;
+
+ if (!MLX5_CAP_GEN(dev->mdev, crossing_vhca_mkey))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
+ in = kvzalloc(inlen, GFP_KERNEL);
+ if (!in) {
+ err = -ENOMEM;
+ goto err_1;
+ }
+
+ mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+ MLX5_SET(mkc, mkc, crossing_target_vhca_id,
+ MLX5_CAP_GEN(dev->mdev, vhca_id));
+ MLX5_SET(mkc, mkc, translations_octword_size, crossed_lkey);
+ MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
+ MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
+
+ /* for this crossing mkey IOVA should be 0 and len should be IOVA + len */
+ set_mkc_access_pd_addr_fields(mkc, access_flags, 0, pd);
+ MLX5_SET64(mkc, mkc, len, iova + length);
+
+ MLX5_SET(mkc, mkc, free, 0);
+ MLX5_SET(mkc, mkc, umr_en, 0);
+ err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
+ if (err)
+ goto err_2;
+
+ mr->mmkey.type = MLX5_MKEY_MR;
+ set_mr_fields(dev, mr, length, access_flags, iova);
+ mr->ibmr.pd = pd;
+ kvfree(in);
+ mlx5_ib_dbg(dev, "crossing mkey = 0x%x\n", mr->mmkey.key);
+
+ return &mr->ibmr;
+err_2:
+ kvfree(in);
+err_1:
+ kfree(mr);
+ return ERR_PTR(err);
+}
+
/*
* If ibmr is NULL it will be allocated by reg_create.
* Else, the given ibmr will be used.
*/
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
u64 iova, int access_flags,
- unsigned int page_size, bool populate)
+ unsigned int page_size, bool populate,
+ int access_mode)
{
struct mlx5_ib_dev *dev = to_mdev(pd->device);
struct mlx5_ib_mr *mr;
@@ -1190,7 +1229,9 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
int inlen;
u32 *in;
int err;
- bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
+ bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)) &&
+ (access_mode == MLX5_MKC_ACCESS_MODE_MTT);
+ bool ksm_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);
if (!page_size)
return ERR_PTR(-EINVAL);
@@ -1213,7 +1254,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
}
pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
if (populate) {
- if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) {
+ if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND || ksm_mode)) {
err = -EINVAL;
goto err_2;
}
@@ -1229,14 +1270,22 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
populate ? pd : dev->umrc.pd);
+ /* In case a data direct flow, overwrite the pdn field by its internal kernel PD */
+ if (umem->is_dmabuf && ksm_mode)
+ MLX5_SET(mkc, mkc, pd, dev->ddr.pdn);
+
MLX5_SET(mkc, mkc, free, !populate);
- MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
+ MLX5_SET(mkc, mkc, access_mode_1_0, access_mode);
MLX5_SET(mkc, mkc, umr_en, 1);
MLX5_SET64(mkc, mkc, len, umem->length);
MLX5_SET(mkc, mkc, bsf_octword_size, 0);
- MLX5_SET(mkc, mkc, translations_octword_size,
- get_octo_len(iova, umem->length, mr->page_shift));
+ if (ksm_mode)
+ MLX5_SET(mkc, mkc, translations_octword_size,
+ get_octo_len(iova, umem->length, mr->page_shift) * 2);
+ else
+ MLX5_SET(mkc, mkc, translations_octword_size,
+ get_octo_len(iova, umem->length, mr->page_shift));
MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
if (mlx5_umem_needs_ats(dev, umem, access_flags))
MLX5_SET(mkc, mkc, ma_translation_mode, 1);
@@ -1373,13 +1422,15 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length);
if (xlt_with_umr) {
- mr = alloc_cacheable_mr(pd, umem, iova, access_flags);
+ mr = alloc_cacheable_mr(pd, umem, iova, access_flags,
+ MLX5_MKC_ACCESS_MODE_MTT);
} else {
- unsigned int page_size = mlx5_umem_find_best_pgsz(
- umem, mkc, log_page_size, 0, iova);
+ unsigned int page_size =
+ mlx5_umem_mkc_find_best_pgsz(dev, umem, iova);
mutex_lock(&dev->slow_path_mutex);
- mr = reg_create(pd, umem, iova, access_flags, page_size, true);
+ mr = reg_create(pd, umem, iova, access_flags, page_size,
+ true, MLX5_MKC_ACCESS_MODE_MTT);
mutex_unlock(&dev->slow_path_mutex);
}
if (IS_ERR(mr)) {
@@ -1442,7 +1493,8 @@ static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
if (IS_ERR(odp))
return ERR_CAST(odp);
- mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags);
+ mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags,
+ MLX5_MKC_ACCESS_MODE_MTT);
if (IS_ERR(mr)) {
ib_umem_release(&odp->umem);
return ERR_CAST(mr);
@@ -1510,35 +1562,31 @@ static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
.move_notify = mlx5_ib_dmabuf_invalidate_cb,
};
-struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
- u64 length, u64 virt_addr,
- int fd, int access_flags,
- struct ib_udata *udata)
+static struct ib_mr *
+reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
+ u64 offset, u64 length, u64 virt_addr,
+ int fd, int access_flags, int access_mode)
{
+ bool pinned_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);
struct mlx5_ib_dev *dev = to_mdev(pd->device);
struct mlx5_ib_mr *mr = NULL;
struct ib_umem_dmabuf *umem_dmabuf;
int err;
- if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
- !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
- return ERR_PTR(-EOPNOTSUPP);
-
- mlx5_ib_dbg(dev,
- "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n",
- offset, virt_addr, length, fd, access_flags);
-
err = mlx5r_umr_resource_init(dev);
if (err)
return ERR_PTR(err);
- /* dmabuf requires xlt update via umr to work. */
- if (!mlx5r_umr_can_load_pas(dev, length))
- return ERR_PTR(-EINVAL);
+ if (!pinned_mode)
+ umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev,
+ offset, length, fd,
+ access_flags,
+ &mlx5_ib_dmabuf_attach_ops);
+ else
+ umem_dmabuf = ib_umem_dmabuf_get_pinned_with_dma_device(&dev->ib_dev,
+ dma_device, offset, length,
+ fd, access_flags);
- umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd,
- access_flags,
- &mlx5_ib_dmabuf_attach_ops);
if (IS_ERR(umem_dmabuf)) {
mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n",
PTR_ERR(umem_dmabuf));
@@ -1546,7 +1594,7 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
}
mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
- access_flags);
+ access_flags, access_mode);
if (IS_ERR(mr)) {
ib_umem_release(&umem_dmabuf->umem);
return ERR_CAST(mr);
@@ -1556,9 +1604,13 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
umem_dmabuf->private = mr;
- err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
- if (err)
- goto err_dereg_mr;
+ if (!pinned_mode) {
+ err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
+ if (err)
+ goto err_dereg_mr;
+ } else {
+ mr->data_direct = true;
+ }
err = mlx5_ib_init_dmabuf_mr(mr);
if (err)
@@ -1566,10 +1618,101 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
return &mr->ibmr;
err_dereg_mr:
- mlx5_ib_dereg_mr(&mr->ibmr, NULL);
+ __mlx5_ib_dereg_mr(&mr->ibmr);
return ERR_PTR(err);
}
+static struct ib_mr *
+reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset,
+ u64 length, u64 virt_addr,
+ int fd, int access_flags)
+{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ struct mlx5_data_direct_dev *data_direct_dev;
+ struct ib_mr *crossing_mr;
+ struct ib_mr *crossed_mr;
+ int ret = 0;
+
+ /* As of HW behaviour the IOVA must be page aligned in KSM mode */
+ if (!PAGE_ALIGNED(virt_addr) || (access_flags & IB_ACCESS_ON_DEMAND))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ mutex_lock(&dev->data_direct_lock);
+ data_direct_dev = dev->data_direct_dev;
+ if (!data_direct_dev) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ /* The device's 'data direct mkey' was created without RO flags to
+ * simplify things and allow for a single mkey per device.
+ * Since RO is not a must, mask it out accordingly.
+ */
+ access_flags &= ~IB_ACCESS_RELAXED_ORDERING;
+ crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev,
+ offset, length, virt_addr, fd,
+ access_flags, MLX5_MKC_ACCESS_MODE_KSM);
+ if (IS_ERR(crossed_mr)) {
+ ret = PTR_ERR(crossed_mr);
+ goto end;
+ }
+
+ mutex_lock(&dev->slow_path_mutex);
+ crossing_mr = reg_create_crossing_vhca_mr(pd, virt_addr, length, access_flags,
+ crossed_mr->lkey);
+ mutex_unlock(&dev->slow_path_mutex);
+ if (IS_ERR(crossing_mr)) {
+ __mlx5_ib_dereg_mr(crossed_mr);
+ ret = PTR_ERR(crossing_mr);
+ goto end;
+ }
+
+ list_add_tail(&to_mmr(crossed_mr)->dd_node, &dev->data_direct_mr_list);
+ to_mmr(crossing_mr)->dd_crossed_mr = to_mmr(crossed_mr);
+ to_mmr(crossing_mr)->data_direct = true;
+end:
+ mutex_unlock(&dev->data_direct_lock);
+ return ret ? ERR_PTR(ret) : crossing_mr;
+}
+
+struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
+ u64 length, u64 virt_addr,
+ int fd, int access_flags,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ int mlx5_access_flags = 0;
+ int err;
+
+ if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
+ !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS)) {
+ err = uverbs_get_flags32(&mlx5_access_flags, attrs,
+ MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS,
+ MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT);
+ if (err)
+ return ERR_PTR(err);
+ }
+
+ mlx5_ib_dbg(dev,
+ "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x, mlx5_access_flags 0x%x\n",
+ offset, virt_addr, length, fd, access_flags, mlx5_access_flags);
+
+ /* dmabuf requires xlt update via umr to work. */
+ if (!mlx5r_umr_can_load_pas(dev, length))
+ return ERR_PTR(-EINVAL);
+
+ if (mlx5_access_flags & MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT)
+ return reg_user_mr_dmabuf_by_data_direct(pd, offset, length, virt_addr,
+ fd, access_flags);
+
+ return reg_user_mr_dmabuf(pd, pd->device->dma_device,
+ offset, length, virt_addr,
+ fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT);
+}
+
/*
* True if the change in access flags can be done via UMR, only some access
* flags can be updated.
@@ -1601,8 +1744,7 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
if (!mlx5r_umr_can_load_pas(dev, new_umem->length))
return false;
- *page_size =
- mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova);
+ *page_size = mlx5_umem_mkc_find_best_pgsz(dev, new_umem, iova);
if (WARN_ON(!*page_size))
return false;
return (mr->mmkey.cache_ent->rb_key.ndescs) >=
@@ -1665,7 +1807,7 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
struct mlx5_ib_mr *mr = to_mmr(ib_mr);
int err;
- if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
+ if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct)
return ERR_PTR(-EOPNOTSUPP);
mlx5_ib_dbg(
@@ -1793,7 +1935,7 @@ err:
static void
mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
{
- if (!mr->umem && mr->descs) {
+ if (!mr->umem && !mr->data_direct && mr->descs) {
struct ib_device *device = mr->ibmr.device;
int size = mr->max_descs * mr->desc_size;
struct mlx5_ib_dev *dev = to_mdev(device);
@@ -1847,13 +1989,51 @@ end:
return ret;
}
+static int mlx5_ib_revoke_data_direct_mr(struct mlx5_ib_mr *mr)
+{
+ struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
+ struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
+ int err;
+
+ lockdep_assert_held(&dev->data_direct_lock);
+ mr->revoked = true;
+ err = mlx5r_umr_revoke_mr(mr);
+ if (WARN_ON(err))
+ return err;
+
+ ib_umem_dmabuf_revoke(umem_dmabuf);
+ return 0;
+}
+
+void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev)
+{
+ struct mlx5_ib_mr *mr, *next;
+
+ lockdep_assert_held(&dev->data_direct_lock);
+
+ list_for_each_entry_safe(mr, next, &dev->data_direct_mr_list, dd_node) {
+ list_del(&mr->dd_node);
+ mlx5_ib_revoke_data_direct_mr(mr);
+ }
+}
+
static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
{
struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
struct mlx5_cache_ent *ent = mr->mmkey.cache_ent;
- if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr))
+ if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) {
+ ent = mr->mmkey.cache_ent;
+ /* upon storing to a clean temp entry - schedule its cleanup */
+ spin_lock_irq(&ent->mkeys_queue.lock);
+ if (ent->is_tmp && !ent->tmp_cleanup_scheduled) {
+ mod_delayed_work(ent->dev->cache.wq, &ent->dwork,
+ msecs_to_jiffies(30 * 1000));
+ ent->tmp_cleanup_scheduled = true;
+ }
+ spin_unlock_irq(&ent->mkeys_queue.lock);
return 0;
+ }
if (ent) {
spin_lock_irq(&ent->mkeys_queue.lock);
@@ -1864,7 +2044,7 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
return destroy_mkey(dev, mr);
}
-int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
+static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr)
{
struct mlx5_ib_mr *mr = to_mmr(ibmr);
struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
@@ -1931,9 +2111,40 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
return 0;
}
+static int dereg_crossing_data_direct_mr(struct mlx5_ib_dev *dev,
+ struct mlx5_ib_mr *mr)
+{
+ struct mlx5_ib_mr *dd_crossed_mr = mr->dd_crossed_mr;
+ int ret;
+
+ ret = __mlx5_ib_dereg_mr(&mr->ibmr);
+ if (ret)
+ return ret;
+
+ mutex_lock(&dev->data_direct_lock);
+ if (!dd_crossed_mr->revoked)
+ list_del(&dd_crossed_mr->dd_node);
+
+ ret = __mlx5_ib_dereg_mr(&dd_crossed_mr->ibmr);
+ mutex_unlock(&dev->data_direct_lock);
+ return ret;
+}
+
+int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
+{
+ struct mlx5_ib_mr *mr = to_mmr(ibmr);
+ struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
+
+ if (mr->data_direct)
+ return dereg_crossing_data_direct_mr(dev, mr);
+
+ return __mlx5_ib_dereg_mr(ibmr);
+}
+
static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
int access_mode, int page_shift)
{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
void *mkc;
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
@@ -1946,6 +2157,9 @@ static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
MLX5_SET(mkc, mkc, umr_en, 1);
MLX5_SET(mkc, mkc, log_page_size, page_shift);
+ if (access_mode == MLX5_MKC_ACCESS_MODE_PA ||
+ access_mode == MLX5_MKC_ACCESS_MODE_MTT)
+ MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));
}
static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index a524181f34df..4b37446758fd 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -45,7 +45,7 @@
/* Contains the details of a pagefault. */
struct mlx5_pagefault {
u32 bytes_committed;
- u32 token;
+ u64 token;
u8 event_subtype;
u8 type;
union {
@@ -74,6 +74,14 @@ struct mlx5_pagefault {
u32 rdma_op_len;
u64 rdma_va;
} rdma;
+ struct {
+ u64 va;
+ u32 mkey;
+ u32 fault_byte_count;
+ u32 prefetch_before_byte_count;
+ u32 prefetch_after_byte_count;
+ u8 flags;
+ } memory;
};
struct mlx5_ib_pf_eq *eq;
@@ -99,13 +107,20 @@ static u64 mlx5_imr_ksm_entries;
static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
struct mlx5_ib_mr *imr, int flags)
{
+ struct mlx5_core_dev *dev = mr_to_mdev(imr)->mdev;
struct mlx5_klm *end = pklm + nentries;
+ int step = MLX5_CAP_ODP(dev, mem_page_fault) ? MLX5_IMR_MTT_SIZE : 0;
+ __be32 key = MLX5_CAP_ODP(dev, mem_page_fault) ?
+ cpu_to_be32(imr->null_mmkey.key) :
+ mr_to_mdev(imr)->mkeys.null_mkey;
+ u64 va =
+ MLX5_CAP_ODP(dev, mem_page_fault) ? idx * MLX5_IMR_MTT_SIZE : 0;
if (flags & MLX5_IB_UPD_XLT_ZAP) {
- for (; pklm != end; pklm++, idx++) {
+ for (; pklm != end; pklm++, idx++, va += step) {
pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
- pklm->key = mr_to_mdev(imr)->mkeys.null_mkey;
- pklm->va = 0;
+ pklm->key = key;
+ pklm->va = cpu_to_be64(va);
}
return;
}
@@ -129,7 +144,7 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
*/
lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex);
- for (; pklm != end; pklm++, idx++) {
+ for (; pklm != end; pklm++, idx++, va += step) {
struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx);
pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
@@ -137,8 +152,8 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
pklm->key = cpu_to_be32(mtt->ibmr.lkey);
pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE);
} else {
- pklm->key = mr_to_mdev(imr)->mkeys.null_mkey;
- pklm->va = 0;
+ pklm->key = key;
+ pklm->va = cpu_to_be64(va);
}
}
}
@@ -217,6 +232,9 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
return;
xa_erase(&imr->implicit_children, idx);
+ if (MLX5_CAP_ODP(mr_to_mdev(mr)->mdev, mem_page_fault))
+ xa_erase(&mr_to_mdev(mr)->odp_mkeys,
+ mlx5_base_mkey(mr->mmkey.key));
/* Freeing a MR is a sleeping operation, so bounce to a work queue */
INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
@@ -332,46 +350,46 @@ static void internal_fill_odp_caps(struct mlx5_ib_dev *dev)
else
dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT);
- if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, ud_odp_caps.send))
caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;
- if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.srq_receive))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, ud_odp_caps.srq_receive))
caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
- if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.send))
caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;
- if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.receive))
caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;
- if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.write))
caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;
- if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.read))
caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;
- if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.atomic))
caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
- if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.srq_receive))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.srq_receive))
caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
- if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.send))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.send))
caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SEND;
- if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.receive))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.receive))
caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_RECV;
- if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.write))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.write))
caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_WRITE;
- if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.read))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.read))
caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_READ;
- if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.atomic))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.atomic))
caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
- if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.srq_receive))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.srq_receive))
caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
@@ -388,13 +406,29 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
pfault->wqe.wq_num : pfault->token;
u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = {};
+ void *info;
int err;
MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME);
- MLX5_SET(page_fault_resume_in, in, page_fault_type, pfault->type);
- MLX5_SET(page_fault_resume_in, in, token, pfault->token);
- MLX5_SET(page_fault_resume_in, in, wq_number, wq_num);
- MLX5_SET(page_fault_resume_in, in, error, !!error);
+
+ if (pfault->event_subtype == MLX5_PFAULT_SUBTYPE_MEMORY) {
+ info = MLX5_ADDR_OF(page_fault_resume_in, in,
+ page_fault_info.mem_page_fault_info);
+ MLX5_SET(mem_page_fault_info, info, fault_token_31_0,
+ pfault->token & 0xffffffff);
+ MLX5_SET(mem_page_fault_info, info, fault_token_47_32,
+ (pfault->token >> 32) & 0xffff);
+ MLX5_SET(mem_page_fault_info, info, error, !!error);
+ } else {
+ info = MLX5_ADDR_OF(page_fault_resume_in, in,
+ page_fault_info.trans_page_fault_info);
+ MLX5_SET(trans_page_fault_info, info, page_fault_type,
+ pfault->type);
+ MLX5_SET(trans_page_fault_info, info, fault_token,
+ pfault->token);
+ MLX5_SET(trans_page_fault_info, info, wq_number, wq_num);
+ MLX5_SET(trans_page_fault_info, info, error, !!error);
+ }
err = mlx5_cmd_exec_in(dev->mdev, page_fault_resume, in);
if (err)
@@ -468,6 +502,16 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
}
xa_unlock(&imr->implicit_children);
+ if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) {
+ ret = xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
+ &mr->mmkey, GFP_KERNEL);
+ if (xa_is_err(ret)) {
+ ret = ERR_PTR(xa_err(ret));
+ xa_erase(&imr->implicit_children, idx);
+ goto out_mr;
+ }
+ mr->mmkey.type = MLX5_MKEY_IMPLICIT_CHILD;
+ }
mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr);
return mr;
@@ -478,6 +522,57 @@ out_mr:
return ret;
}
+/*
+ * When using memory scheme ODP, implicit MRs can't use the reserved null mkey
+ * and each implicit MR needs to assign a private null mkey to get the page
+ * faults on.
+ * The null mkey is created with the properties to enable getting the page
+ * fault for every time it is accessed and having all relevant access flags.
+ */
+static int alloc_implicit_mr_null_mkey(struct mlx5_ib_dev *dev,
+ struct mlx5_ib_mr *imr,
+ struct mlx5_ib_pd *pd)
+{
+ size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + 64;
+ void *mkc;
+ u32 *in;
+ int err;
+
+ in = kzalloc(inlen, GFP_KERNEL);
+ if (!in)
+ return -ENOMEM;
+
+ MLX5_SET(create_mkey_in, in, translations_octword_actual_size, 4);
+ MLX5_SET(create_mkey_in, in, pg_access, 1);
+
+ mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+ MLX5_SET(mkc, mkc, a, 1);
+ MLX5_SET(mkc, mkc, rw, 1);
+ MLX5_SET(mkc, mkc, rr, 1);
+ MLX5_SET(mkc, mkc, lw, 1);
+ MLX5_SET(mkc, mkc, lr, 1);
+ MLX5_SET(mkc, mkc, free, 0);
+ MLX5_SET(mkc, mkc, umr_en, 0);
+ MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
+
+ MLX5_SET(mkc, mkc, translations_octword_size, 4);
+ MLX5_SET(mkc, mkc, log_page_size, 61);
+ MLX5_SET(mkc, mkc, length64, 1);
+ MLX5_SET(mkc, mkc, pd, pd->pdn);
+ MLX5_SET64(mkc, mkc, start_addr, 0);
+ MLX5_SET(mkc, mkc, qpn, 0xffffff);
+
+ err = mlx5_core_create_mkey(dev->mdev, &imr->null_mmkey.key, in, inlen);
+ if (err)
+ goto free_in;
+
+ imr->null_mmkey.type = MLX5_MKEY_NULL;
+
+free_in:
+ kfree(in);
+ return err;
+}
+
struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
int access_flags)
{
@@ -510,6 +605,16 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
imr->is_odp_implicit = true;
xa_init(&imr->implicit_children);
+ if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) {
+ err = alloc_implicit_mr_null_mkey(dev, imr, pd);
+ if (err)
+ goto out_mr;
+
+ err = mlx5r_store_odp_mkey(dev, &imr->null_mmkey);
+ if (err)
+ goto out_mr;
+ }
+
err = mlx5r_umr_update_xlt(imr, 0,
mlx5_imr_ksm_entries,
MLX5_KSM_PAGE_SHIFT,
@@ -544,6 +649,14 @@ void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr)
xa_erase(&mr->implicit_children, idx);
mlx5_ib_dereg_mr(&mtt->ibmr, NULL);
}
+
+ if (mr->null_mmkey.key) {
+ xa_erase(&mr_to_mdev(mr)->odp_mkeys,
+ mlx5_base_mkey(mr->null_mmkey.key));
+
+ mlx5_core_destroy_mkey(mr_to_mdev(mr)->mdev,
+ mr->null_mmkey.key);
+ }
}
#define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
@@ -693,7 +806,7 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt,
struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
u32 xlt_flags = 0;
int err;
- unsigned int page_size;
+ unsigned long page_size;
if (flags & MLX5_PF_FLAGS_ENABLE)
xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;
@@ -710,7 +823,10 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt,
ib_umem_dmabuf_unmap_pages(umem_dmabuf);
err = -EINVAL;
} else {
- err = mlx5r_umr_update_mr_pas(mr, xlt_flags);
+ if (mr->data_direct)
+ err = mlx5r_umr_update_data_direct_ksm_pas(mr, xlt_flags);
+ else
+ err = mlx5r_umr_update_mr_pas(mr, xlt_flags);
}
dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
@@ -733,24 +849,31 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt,
* >0: Number of pages mapped
*/
static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
- u32 *bytes_mapped, u32 flags)
+ u32 *bytes_mapped, u32 flags, bool permissive_fault)
{
struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
- if (unlikely(io_virt < mr->ibmr.iova))
+ if (unlikely(io_virt < mr->ibmr.iova) && !permissive_fault)
return -EFAULT;
if (mr->umem->is_dmabuf)
return pagefault_dmabuf_mr(mr, bcnt, bytes_mapped, flags);
if (!odp->is_implicit_odp) {
+ u64 offset = io_virt < mr->ibmr.iova ? 0 : io_virt - mr->ibmr.iova;
u64 user_va;
- if (check_add_overflow(io_virt - mr->ibmr.iova,
- (u64)odp->umem.address, &user_va))
+ if (check_add_overflow(offset, (u64)odp->umem.address,
+ &user_va))
return -EFAULT;
- if (unlikely(user_va >= ib_umem_end(odp) ||
- ib_umem_end(odp) - user_va < bcnt))
+
+ if (permissive_fault) {
+ if (user_va < ib_umem_start(odp))
+ user_va = ib_umem_start(odp);
+ if ((user_va + bcnt) > ib_umem_end(odp))
+ bcnt = ib_umem_end(odp) - user_va;
+ } else if (unlikely(user_va >= ib_umem_end(odp) ||
+ ib_umem_end(odp) - user_va < bcnt))
return -EFAULT;
return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped,
flags);
@@ -797,6 +920,27 @@ static bool mkey_is_eq(struct mlx5_ib_mkey *mmkey, u32 key)
return mmkey->key == key;
}
+static struct mlx5_ib_mkey *find_odp_mkey(struct mlx5_ib_dev *dev, u32 key)
+{
+ struct mlx5_ib_mkey *mmkey;
+
+ xa_lock(&dev->odp_mkeys);
+ mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key));
+ if (!mmkey) {
+ mmkey = ERR_PTR(-ENOENT);
+ goto out;
+ }
+ if (!mkey_is_eq(mmkey, key)) {
+ mmkey = ERR_PTR(-EFAULT);
+ goto out;
+ }
+ refcount_inc(&mmkey->usecount);
+out:
+ xa_unlock(&dev->odp_mkeys);
+
+ return mmkey;
+}
+
/*
* Handle a single data segment in a page-fault WQE or RDMA region.
*
@@ -824,32 +968,24 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
io_virt += *bytes_committed;
bcnt -= *bytes_committed;
-
next_mr:
- xa_lock(&dev->odp_mkeys);
- mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key));
- if (!mmkey) {
- xa_unlock(&dev->odp_mkeys);
- mlx5_ib_dbg(
- dev,
- "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
- key);
- if (bytes_mapped)
- *bytes_mapped += bcnt;
- /*
- * The user could specify a SGL with multiple lkeys and only
- * some of them are ODP. Treat the non-ODP ones as fully
- * faulted.
- */
- ret = 0;
- goto end;
- }
- refcount_inc(&mmkey->usecount);
- xa_unlock(&dev->odp_mkeys);
-
- if (!mkey_is_eq(mmkey, key)) {
- mlx5_ib_dbg(dev, "failed to find mkey %x\n", key);
- ret = -EFAULT;
+ mmkey = find_odp_mkey(dev, key);
+ if (IS_ERR(mmkey)) {
+ ret = PTR_ERR(mmkey);
+ if (ret == -ENOENT) {
+ mlx5_ib_dbg(
+ dev,
+ "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
+ key);
+ if (bytes_mapped)
+ *bytes_mapped += bcnt;
+ /*
+ * The user could specify a SGL with multiple lkeys and
+ * only some of them are ODP. Treat the non-ODP ones as
+ * fully faulted.
+ */
+ ret = 0;
+ }
goto end;
}
@@ -857,7 +993,7 @@ next_mr:
case MLX5_MKEY_MR:
mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
- ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0);
+ ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0, false);
if (ret < 0)
goto end;
@@ -944,7 +1080,7 @@ next_mr:
}
end:
- if (mmkey)
+ if (!IS_ERR(mmkey))
mlx5r_deref_odp_mkey(mmkey);
while (head) {
frame = head;
@@ -1266,7 +1402,7 @@ read_user:
if (ret)
mlx5_ib_err(
dev,
- "Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %x\n",
+ "Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %llx\n",
ret, wqe_index, pfault->token);
resolve_page_fault:
@@ -1325,13 +1461,13 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
} else if (ret < 0 || pages_in_range(address, length) > ret) {
mlx5_ib_page_fault_resume(dev, pfault, 1);
if (ret != -ENOENT)
- mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n",
+ mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%llx, type: 0x%x\n",
ret, pfault->token, pfault->type);
return;
}
mlx5_ib_page_fault_resume(dev, pfault, 0);
- mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n",
+ mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%llx, type: 0x%x, prefetch_activated: %d\n",
pfault->token, pfault->type,
prefetch_activated);
@@ -1347,12 +1483,80 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
prefetch_len,
&bytes_committed, NULL);
if (ret < 0 && ret != -EAGAIN) {
- mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
+ mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%llx, address: 0x%.16llx, length = 0x%.16x\n",
ret, pfault->token, address, prefetch_len);
}
}
}
+#define MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST BIT(7)
+static void mlx5_ib_mr_memory_pfault_handler(struct mlx5_ib_dev *dev,
+ struct mlx5_pagefault *pfault)
+{
+ u64 prefetch_va =
+ pfault->memory.va - pfault->memory.prefetch_before_byte_count;
+ size_t prefetch_size = pfault->memory.prefetch_before_byte_count +
+ pfault->memory.fault_byte_count +
+ pfault->memory.prefetch_after_byte_count;
+ struct mlx5_ib_mkey *mmkey;
+ struct mlx5_ib_mr *mr, *child_mr;
+ int ret = 0;
+
+ mmkey = find_odp_mkey(dev, pfault->memory.mkey);
+ if (IS_ERR(mmkey))
+ goto err;
+
+ switch (mmkey->type) {
+ case MLX5_MKEY_IMPLICIT_CHILD:
+ child_mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
+ mr = child_mr->parent;
+ break;
+ case MLX5_MKEY_NULL:
+ mr = container_of(mmkey, struct mlx5_ib_mr, null_mmkey);
+ break;
+ default:
+ mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
+ break;
+ }
+
+ /* If prefetch fails, handle only demanded page fault */
+ ret = pagefault_mr(mr, prefetch_va, prefetch_size, NULL, 0, true);
+ if (ret < 0) {
+ ret = pagefault_mr(mr, pfault->memory.va,
+ pfault->memory.fault_byte_count, NULL, 0,
+ true);
+ if (ret < 0)
+ goto err;
+ }
+
+ mlx5_update_odp_stats(mr, faults, ret);
+ mlx5r_deref_odp_mkey(mmkey);
+
+ if (pfault->memory.flags & MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST)
+ mlx5_ib_page_fault_resume(dev, pfault, 0);
+
+ mlx5_ib_dbg(
+ dev,
+ "PAGE FAULT completed %s. token 0x%llx, mkey: 0x%x, va: 0x%llx, byte_count: 0x%x\n",
+ pfault->memory.flags & MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST ?
+ "" :
+ "without resume cmd",
+ pfault->token, pfault->memory.mkey, pfault->memory.va,
+ pfault->memory.fault_byte_count);
+
+ return;
+
+err:
+ if (!IS_ERR(mmkey))
+ mlx5r_deref_odp_mkey(mmkey);
+ mlx5_ib_page_fault_resume(dev, pfault, 1);
+ mlx5_ib_dbg(
+ dev,
+ "PAGE FAULT error. token 0x%llx, mkey: 0x%x, va: 0x%llx, byte_count: 0x%x, err: %d\n",
+ pfault->token, pfault->memory.mkey, pfault->memory.va,
+ pfault->memory.fault_byte_count, ret);
+}
+
static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault)
{
u8 event_subtype = pfault->event_subtype;
@@ -1364,6 +1568,9 @@ static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfaul
case MLX5_PFAULT_SUBTYPE_RDMA:
mlx5_ib_mr_rdma_pfault_handler(dev, pfault);
break;
+ case MLX5_PFAULT_SUBTYPE_MEMORY:
+ mlx5_ib_mr_memory_pfault_handler(dev, pfault);
+ break;
default:
mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n",
event_subtype);
@@ -1382,6 +1589,7 @@ static void mlx5_ib_eqe_pf_action(struct work_struct *work)
mempool_free(pfault, eq->pool);
}
+#define MEMORY_SCHEME_PAGE_FAULT_GRANULARITY 4096
static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
{
struct mlx5_eqe_page_fault *pf_eqe;
@@ -1398,15 +1606,12 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
pf_eqe = &eqe->data.page_fault;
pfault->event_subtype = eqe->sub_type;
- pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed);
-
- mlx5_ib_dbg(eq->dev,
- "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n",
- eqe->sub_type, pfault->bytes_committed);
switch (eqe->sub_type) {
case MLX5_PFAULT_SUBTYPE_RDMA:
/* RDMA based event */
+ pfault->bytes_committed =
+ be32_to_cpu(pf_eqe->rdma.bytes_committed);
pfault->type =
be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24;
pfault->token =
@@ -1420,10 +1625,12 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
be32_to_cpu(pf_eqe->rdma.rdma_op_len);
pfault->rdma.rdma_va =
be64_to_cpu(pf_eqe->rdma.rdma_va);
- mlx5_ib_dbg(eq->dev,
- "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n",
- pfault->type, pfault->token,
- pfault->rdma.r_key);
+ mlx5_ib_dbg(
+ eq->dev,
+ "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x, type:0x%x, token: 0x%06llx, r_key: 0x%08x\n",
+ eqe->sub_type, pfault->bytes_committed,
+ pfault->type, pfault->token,
+ pfault->rdma.r_key);
mlx5_ib_dbg(eq->dev,
"PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n",
pfault->rdma.rdma_op_len,
@@ -1432,6 +1639,8 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
case MLX5_PFAULT_SUBTYPE_WQE:
/* WQE based event */
+ pfault->bytes_committed =
+ be32_to_cpu(pf_eqe->wqe.bytes_committed);
pfault->type =
(be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7;
pfault->token =
@@ -1443,11 +1652,47 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
be16_to_cpu(pf_eqe->wqe.wqe_index);
pfault->wqe.packet_size =
be16_to_cpu(pf_eqe->wqe.packet_length);
- mlx5_ib_dbg(eq->dev,
- "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n",
- pfault->type, pfault->token,
- pfault->wqe.wq_num,
- pfault->wqe.wqe_index);
+ mlx5_ib_dbg(
+ eq->dev,
+ "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x, type:0x%x, token: 0x%06llx, wq_num: 0x%06x, wqe_index: 0x%04x\n",
+ eqe->sub_type, pfault->bytes_committed,
+ pfault->type, pfault->token, pfault->wqe.wq_num,
+ pfault->wqe.wqe_index);
+ break;
+
+ case MLX5_PFAULT_SUBTYPE_MEMORY:
+ /* Memory based event */
+ pfault->bytes_committed = 0;
+ pfault->token =
+ be32_to_cpu(pf_eqe->memory.token31_0) |
+ ((u64)be16_to_cpu(pf_eqe->memory.token47_32)
+ << 32);
+ pfault->memory.va = be64_to_cpu(pf_eqe->memory.va);
+ pfault->memory.mkey = be32_to_cpu(pf_eqe->memory.mkey);
+ pfault->memory.fault_byte_count = (be32_to_cpu(
+ pf_eqe->memory.demand_fault_pages) >> 12) *
+ MEMORY_SCHEME_PAGE_FAULT_GRANULARITY;
+ pfault->memory.prefetch_before_byte_count =
+ be16_to_cpu(
+ pf_eqe->memory.pre_demand_fault_pages) *
+ MEMORY_SCHEME_PAGE_FAULT_GRANULARITY;
+ pfault->memory.prefetch_after_byte_count =
+ be16_to_cpu(
+ pf_eqe->memory.post_demand_fault_pages) *
+ MEMORY_SCHEME_PAGE_FAULT_GRANULARITY;
+ pfault->memory.flags = pf_eqe->memory.flags;
+ mlx5_ib_dbg(
+ eq->dev,
+ "PAGE_FAULT: subtype: 0x%02x, token: 0x%06llx, mkey: 0x%06x, fault_byte_count: 0x%06x, va: 0x%016llx, flags: 0x%02x\n",
+ eqe->sub_type, pfault->token,
+ pfault->memory.mkey,
+ pfault->memory.fault_byte_count,
+ pfault->memory.va, pfault->memory.flags);
+ mlx5_ib_dbg(
+ eq->dev,
+ "PAGE_FAULT: prefetch size: before: 0x%06x, after 0x%06x\n",
+ pfault->memory.prefetch_before_byte_count,
+ pfault->memory.prefetch_after_byte_count);
break;
default:
@@ -1710,7 +1955,7 @@ static void mlx5_ib_prefetch_mr_work(struct work_struct *w)
for (i = 0; i < work->num_sge; ++i) {
ret = pagefault_mr(work->frags[i].mr, work->frags[i].io_virt,
work->frags[i].length, &bytes_mapped,
- work->pf_flags);
+ work->pf_flags, false);
if (ret <= 0)
continue;
mlx5_update_odp_stats(work->frags[i].mr, prefetch, ret);
@@ -1761,7 +2006,7 @@ static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd,
if (IS_ERR(mr))
return PTR_ERR(mr);
ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length,
- &bytes_mapped, pf_flags);
+ &bytes_mapped, pf_flags, false);
if (ret < 0) {
mlx5r_deref_odp_mkey(&mr->mmkey);
return ret;
diff --git a/drivers/infiniband/hw/mlx5/std_types.c b/drivers/infiniband/hw/mlx5/std_types.c
index bbfcce3bdc84..bdb568411091 100644
--- a/drivers/infiniband/hw/mlx5/std_types.c
+++ b/drivers/infiniband/hw/mlx5/std_types.c
@@ -10,6 +10,7 @@
#include <linux/mlx5/eswitch.h>
#include <linux/mlx5/vport.h>
#include "mlx5_ib.h"
+#include "data_direct.h"
#define UVERBS_MODULE_NAME mlx5_ib
#include <rdma/uverbs_named_ioctl.h>
@@ -111,6 +112,23 @@ out:
return err;
}
+static int fill_multiport_info(struct mlx5_ib_dev *dev, u32 port_num,
+ struct mlx5_ib_uapi_query_port *info)
+{
+ struct mlx5_core_dev *mdev;
+
+ mdev = mlx5_ib_get_native_port_mdev(dev, port_num, NULL);
+ if (!mdev)
+ return -EINVAL;
+
+ info->vport_vhca_id = MLX5_CAP_GEN(mdev, vhca_id);
+ info->flags |= MLX5_IB_UAPI_QUERY_PORT_VPORT_VHCA_ID;
+
+ mlx5_ib_put_native_port_mdev(dev, port_num);
+
+ return 0;
+}
+
static int fill_switchdev_info(struct mlx5_ib_dev *dev, u32 port_num,
struct mlx5_ib_uapi_query_port *info)
{
@@ -177,12 +195,60 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_QUERY_PORT)(
ret = fill_switchdev_info(dev, port_num, &info);
if (ret)
return ret;
+ } else if (mlx5_core_mp_enabled(dev->mdev)) {
+ ret = fill_multiport_info(dev, port_num, &info);
+ if (ret)
+ return ret;
}
return uverbs_copy_to_struct_or_zero(attrs, MLX5_IB_ATTR_QUERY_PORT, &info,
sizeof(info));
}
+static int UVERBS_HANDLER(MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct mlx5_data_direct_dev *data_direct_dev;
+ struct mlx5_ib_ucontext *c;
+ struct mlx5_ib_dev *dev;
+ int out_len = uverbs_attr_get_len(attrs,
+ MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH);
+ u32 dev_path_len;
+ char *dev_path;
+ int ret;
+
+ c = to_mucontext(ib_uverbs_get_ucontext(attrs));
+ if (IS_ERR(c))
+ return PTR_ERR(c);
+ dev = to_mdev(c->ibucontext.device);
+ mutex_lock(&dev->data_direct_lock);
+ data_direct_dev = dev->data_direct_dev;
+ if (!data_direct_dev) {
+ ret = -ENODEV;
+ goto end;
+ }
+
+ dev_path = kobject_get_path(&data_direct_dev->device->kobj, GFP_KERNEL);
+ if (!dev_path) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ dev_path_len = strlen(dev_path) + 1;
+ if (dev_path_len > out_len) {
+ ret = -ENOSPC;
+ goto end;
+ }
+
+ ret = uverbs_copy_to(attrs, MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH, dev_path,
+ dev_path_len);
+ kfree(dev_path);
+
+end:
+ mutex_unlock(&dev->data_direct_lock);
+ return ret;
+}
+
DECLARE_UVERBS_NAMED_METHOD(
MLX5_IB_METHOD_QUERY_PORT,
UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_QUERY_PORT_PORT_NUM,
@@ -193,9 +259,17 @@ DECLARE_UVERBS_NAMED_METHOD(
reg_c0),
UA_MANDATORY));
+DECLARE_UVERBS_NAMED_METHOD(
+ MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH,
+ UVERBS_ATTR_PTR_OUT(
+ MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH,
+ UVERBS_ATTR_MIN_SIZE(0),
+ UA_MANDATORY));
+
ADD_UVERBS_METHODS(mlx5_ib_device,
UVERBS_OBJECT_DEVICE,
- &UVERBS_METHOD(MLX5_IB_METHOD_QUERY_PORT));
+ &UVERBS_METHOD(MLX5_IB_METHOD_QUERY_PORT),
+ &UVERBS_METHOD(MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH));
DECLARE_UVERBS_NAMED_METHOD(
MLX5_IB_METHOD_PD_QUERY,
diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c
index ffc31b01f690..887fd6fa3ba9 100644
--- a/drivers/infiniband/hw/mlx5/umr.c
+++ b/drivers/infiniband/hw/mlx5/umr.c
@@ -224,6 +224,9 @@ int mlx5r_umr_init(struct mlx5_ib_dev *dev)
void mlx5r_umr_cleanup(struct mlx5_ib_dev *dev)
{
+ if (!dev->umrc.pd)
+ return;
+
mutex_destroy(&dev->umrc.init_lock);
ib_dealloc_pd(dev->umrc.pd);
}
@@ -632,44 +635,47 @@ static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev,
wqe->data_seg.byte_count = cpu_to_be32(sg->length);
}
-/*
- * Send the DMA list to the HW for a normal MR using UMR.
- * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP
- * flag may be used.
- */
-int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
+static int
+_mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd)
{
+ size_t ent_size = dd ? sizeof(struct mlx5_ksm) : sizeof(struct mlx5_mtt);
struct mlx5_ib_dev *dev = mr_to_mdev(mr);
struct device *ddev = &dev->mdev->pdev->dev;
struct mlx5r_umr_wqe wqe = {};
struct ib_block_iter biter;
+ struct mlx5_ksm *cur_ksm;
struct mlx5_mtt *cur_mtt;
size_t orig_sg_length;
- struct mlx5_mtt *mtt;
size_t final_size;
+ void *curr_entry;
struct ib_sge sg;
+ void *entry;
u64 offset = 0;
int err = 0;
- if (WARN_ON(mr->umem->is_odp))
- return -EINVAL;
-
- mtt = mlx5r_umr_create_xlt(
- dev, &sg, ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift),
- sizeof(*mtt), flags);
- if (!mtt)
+ entry = mlx5r_umr_create_xlt(dev, &sg,
+ ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift),
+ ent_size, flags);
+ if (!entry)
return -ENOMEM;
orig_sg_length = sg.length;
-
mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg);
mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr,
mr->page_shift);
+ if (dd) {
+ /* Use the data direct internal kernel PD */
+ MLX5_SET(mkc, &wqe.mkey_seg, pd, dev->ddr.pdn);
+ cur_ksm = entry;
+ } else {
+ cur_mtt = entry;
+ }
+
mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg);
- cur_mtt = mtt;
+ curr_entry = entry;
rdma_umem_for_each_dma_block(mr->umem, &biter, BIT(mr->page_shift)) {
- if (cur_mtt == (void *)mtt + sg.length) {
+ if (curr_entry == entry + sg.length) {
dma_sync_single_for_device(ddev, sg.addr, sg.length,
DMA_TO_DEVICE);
@@ -681,23 +687,31 @@ int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
DMA_TO_DEVICE);
offset += sg.length;
mlx5r_umr_update_offset(&wqe.ctrl_seg, offset);
-
- cur_mtt = mtt;
+ if (dd)
+ cur_ksm = entry;
+ else
+ cur_mtt = entry;
}
- cur_mtt->ptag =
- cpu_to_be64(rdma_block_iter_dma_address(&biter) |
- MLX5_IB_MTT_PRESENT);
-
- if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP))
- cur_mtt->ptag = 0;
-
- cur_mtt++;
+ if (dd) {
+ cur_ksm->va = cpu_to_be64(rdma_block_iter_dma_address(&biter));
+ cur_ksm->key = cpu_to_be32(dev->ddr.mkey);
+ cur_ksm++;
+ curr_entry = cur_ksm;
+ } else {
+ cur_mtt->ptag =
+ cpu_to_be64(rdma_block_iter_dma_address(&biter) |
+ MLX5_IB_MTT_PRESENT);
+ if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP))
+ cur_mtt->ptag = 0;
+ cur_mtt++;
+ curr_entry = cur_mtt;
+ }
}
- final_size = (void *)cur_mtt - (void *)mtt;
+ final_size = curr_entry - entry;
sg.length = ALIGN(final_size, MLX5_UMR_FLEX_ALIGNMENT);
- memset(cur_mtt, 0, sg.length - final_size);
+ memset(curr_entry, 0, sg.length - final_size);
mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags);
dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE);
@@ -705,10 +719,32 @@ int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
err:
sg.length = orig_sg_length;
- mlx5r_umr_unmap_free_xlt(dev, mtt, &sg);
+ mlx5r_umr_unmap_free_xlt(dev, entry, &sg);
return err;
}
+int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags)
+{
+ /* No invalidation flow is expected */
+ if (WARN_ON(!mr->umem->is_dmabuf) || (flags & MLX5_IB_UPD_XLT_ZAP))
+ return -EINVAL;
+
+ return _mlx5r_umr_update_mr_pas(mr, flags, true);
+}
+
+/*
+ * Send the DMA list to the HW for a normal MR using UMR.
+ * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP
+ * flag may be used.
+ */
+int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
+{
+ if (WARN_ON(mr->umem->is_odp))
+ return -EINVAL;
+
+ return _mlx5r_umr_update_mr_pas(mr, flags, false);
+}
+
static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
{
return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled);
diff --git a/drivers/infiniband/hw/mlx5/umr.h b/drivers/infiniband/hw/mlx5/umr.h
index 5f734dc72bef..4a02c9b5aad8 100644
--- a/drivers/infiniband/hw/mlx5/umr.h
+++ b/drivers/infiniband/hw/mlx5/umr.h
@@ -95,6 +95,7 @@ int mlx5r_umr_revoke_mr(struct mlx5_ib_mr *mr);
int mlx5r_umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd,
int access_flags);
int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags);
+int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags);
int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
int page_shift, int flags);