Blob Blame History Raw
From c751e5890724beeb0b41017f54ac5509f50697e8 Mon Sep 17 00:00:00 2001
From: Alaa Hleihel <ahleihel@redhat.com>
Date: Mon, 1 Jun 2020 15:40:31 -0400
Subject: [PATCH 278/312] [netdrv] net/mlx5: Add command entry handling
 completion

Message-id: <20200601154102.25980-9-ahleihel@redhat.com>
Patchwork-id: 315713
Patchwork-instance: patchwork
O-Subject: [RHEL8.3 BZ 1842258 08/39] net/mlx5: Add command entry handling completion
Bugzilla: 1842258
RH-Acked-by: Honggang Li <honli@redhat.com>
RH-Acked-by: Kamal Heib <kheib@redhat.com>
RH-Acked-by: Marcelo Leitner <mleitner@redhat.com>
RH-Acked-by: Jarod Wilson <jarod@redhat.com>

Bugzilla: http://bugzilla.redhat.com/1842258
Upstream: v5.7-rc7

commit 17d00e839d3b592da9659c1977d45f85b77f986a
Author: Moshe Shemesh <moshe@mellanox.com>
Date:   Fri Dec 27 07:01:53 2019 +0200

    net/mlx5: Add command entry handling completion

    When FW response to commands is very slow and all command entries in
    use are waiting for completion we can have a race where commands can get
    timeout before they get out of the queue and handled. Timeout
    completion on uninitialized command will cause releasing command's
    buffers before accessing it for initialization and then we will get NULL
    pointer exception while trying access it. It may also cause releasing
    buffers of another command since we may have timeout completion before
    even allocating entry index for this command.
    Add entry handling completion to avoid this race.

    Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters")
    Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
    Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
    Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>

Signed-off-by: Alaa Hleihel <ahleihel@redhat.com>
Signed-off-by: Frantisek Hrbata <fhrbata@redhat.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 14 ++++++++++++++
 include/linux/mlx5/driver.h                   |  3 +++
 2 files changed, 17 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 23acec5a31d4..50783828d2e8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -861,6 +861,7 @@ static void cmd_work_handler(struct work_struct *work)
 	int alloc_ret;
 	int cmd_mode;
 
+	complete(&ent->handling);
 	sem = ent->page_queue ? &cmd->pages_sem : &cmd->sem;
 	down(sem);
 	if (!ent->page_queue) {
@@ -978,6 +979,11 @@ static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent)
 	struct mlx5_cmd *cmd = &dev->cmd;
 	int err;
 
+	if (!wait_for_completion_timeout(&ent->handling, timeout) &&
+	    cancel_work_sync(&ent->work)) {
+		ent->ret = -ECANCELED;
+		goto out_err;
+	}
 	if (cmd->mode == CMD_MODE_POLLING || ent->polling) {
 		wait_for_completion(&ent->done);
 	} else if (!wait_for_completion_timeout(&ent->done, timeout)) {
@@ -985,12 +991,17 @@ static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent)
 		mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true);
 	}
 
+out_err:
 	err = ent->ret;
 
 	if (err == -ETIMEDOUT) {
 		mlx5_core_warn(dev, "%s(0x%x) timeout. Will cause a leak of a command resource\n",
 			       mlx5_command_str(msg_to_opcode(ent->in)),
 			       msg_to_opcode(ent->in));
+	} else if (err == -ECANCELED) {
+		mlx5_core_warn(dev, "%s(0x%x) canceled on out of queue timeout.\n",
+			       mlx5_command_str(msg_to_opcode(ent->in)),
+			       msg_to_opcode(ent->in));
 	}
 	mlx5_core_dbg(dev, "err %d, delivery status %s(%d)\n",
 		      err, deliv_status_to_str(ent->status), ent->status);
@@ -1026,6 +1037,7 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in,
 	ent->token = token;
 	ent->polling = force_polling;
 
+	init_completion(&ent->handling);
 	if (!callback)
 		init_completion(&ent->done);
 
@@ -1045,6 +1057,8 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in,
 	err = wait_func(dev, ent);
 	if (err == -ETIMEDOUT)
 		goto out;
+	if (err == -ECANCELED)
+		goto out_free;
 
 	ds = ent->ts2 - ent->ts1;
 	op = MLX5_GET(mbox_in, in->first.data, opcode);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 0d728007078c..df47476d6fca 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -761,6 +761,9 @@ struct mlx5_cmd_work_ent {
 	struct delayed_work	cb_timeout_work;
 	void		       *context;
 	int			idx;
+#ifndef __GENKSYMS__
+	struct completion	handling;
+#endif
 	struct completion	done;
 	struct mlx5_cmd        *cmd;
 	struct work_struct	work;
-- 
2.13.6