From 92d94d4d1779723fb5605f1d1fa235f86047da00 Mon Sep 17 00:00:00 2001 From: Himanshu Madhani Date: Thu, 1 Aug 2019 15:55:57 -0400 Subject: [PATCH 097/124] [scsi] scsi: qla2xxx: Fix race conditions in the code for aborting SCSI commands Message-id: <20190801155618.12650-98-hmadhani@redhat.com> Patchwork-id: 267896 O-Subject: [RHEL 7.8 e-stor PATCH 097/118] scsi: qla2xxx: Fix race conditions in the code for aborting SCSI commands Bugzilla: 1729270 RH-Acked-by: Jarod Wilson RH-Acked-by: Tony Camuso From: Bart Van Assche Bugzilla 1729270 In the *_done() functions, instead of returning early if sp->ref_count >= 2, only decrement sp->ref_count. In qla2xxx_eh_abort(), instead of deciding what to do based on the value of sp->ref_count, decide which action to take depending on the completion status of the firmware abort. Remove srb.cwaitq and use srb.comp instead. In qla2x00_abort_srb(), call isp_ops->abort_command() directly instead of calling qla2xxx_eh_abort(). Cc: Himanshu Madhani Cc: Giridhar Malavali Signed-off-by: Bart Van Assche Acked-by: Himanshu Madhani Signed-off-by: Martin K. Petersen (cherry picked from commit 219d27d7147e07fe899a781bd72f9180b78c3852) Signed-off-by: Himanshu Madhani Conflicts: drivers/scsi/qla2xxx/qla_os.c [HM: RHEL78 kernel source does not have commit 25ab0bc334b4 ] [ ("scsi: sched/wait: Add wait_event_lock_irq_timeout for ] [ TASK_UNINTERRUPTIBLE usage"). Since this macro was missing ] [ commit 711a08d79f71 ("scsi: qla2xxx: Change abort wait_loop ] [ from msleep to wait_event_timeout") was not backported. ] [ This patch now removes code that was added by commit 711a08d79f71 ] [ Due to skipped commit code shows deviation from upstream ] [ in qla_os.c, qla2xxx_qpair_sp_compl() and qla2x00_sp_free_dma() ] [ brings in source from commit 711a08d79f71, to set the cmd->result ] [ and CMD_SP(cmd) also removes double qla2x00_rel_sp() ] Signed-off-by: Himanshu Madhani Signed-off-by: Jan Stancek --- drivers/scsi/qla2xxx/qla_nvme.c | 34 +-------- drivers/scsi/qla2xxx/qla_nvme.h | 1 - drivers/scsi/qla2xxx/qla_os.c | 150 ++++++++++++++++------------------------ 3 files changed, 62 insertions(+), 123 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c index 73d6b7833830..8ddd44bb6c7f 100644 --- a/drivers/scsi/qla2xxx/qla_nvme.c +++ b/drivers/scsi/qla2xxx/qla_nvme.c @@ -137,8 +137,7 @@ static void qla_nvme_sp_ls_done(void *ptr, int res) return; } - if (!atomic_dec_and_test(&sp->ref_count)) - return; + atomic_dec(&sp->ref_count); if (res) res = -EINVAL; @@ -161,8 +160,7 @@ static void qla_nvme_sp_done(void *ptr, int res) nvme = &sp->u.iocb_cmd; fd = nvme->u.nvme.desc; - if (!atomic_dec_and_test(&sp->ref_count)) - return; + atomic_dec(&sp->ref_count); if (res == QLA_SUCCESS) { fd->rcv_rsplen = nvme->u.nvme.rsp_pyld_len; @@ -611,34 +609,6 @@ static struct nvme_fc_port_template qla_nvme_fc_transport = { .fcprqst_priv_sz = sizeof(struct nvme_private), }; -#define NVME_ABORT_POLLING_PERIOD 2 -static int qla_nvme_wait_on_command(srb_t *sp) -{ - int ret = QLA_SUCCESS; - - wait_event_timeout(sp->nvme_ls_waitq, (atomic_read(&sp->ref_count) > 1), - NVME_ABORT_POLLING_PERIOD*HZ); - - if (atomic_read(&sp->ref_count) > 1) - ret = QLA_FUNCTION_FAILED; - - return ret; -} - -void qla_nvme_abort(struct qla_hw_data *ha, struct srb *sp, int res) -{ - int rval; - - if (ha->flags.fw_started) { - rval = ha->isp_ops->abort_command(sp); - if (!rval && !qla_nvme_wait_on_command(sp)) - ql_log(ql_log_warn, NULL, 0x2112, - "timed out waiting on sp=%p\n", sp); - } else { - sp->done(sp, res); - } -} - static void qla_nvme_unregister_remote_port(struct work_struct *work) { struct fc_port *fcport = container_of(work, struct fc_port, diff --git a/drivers/scsi/qla2xxx/qla_nvme.h b/drivers/scsi/qla2xxx/qla_nvme.h index da8dad5ad693..0db04f0a4d5d 100644 --- a/drivers/scsi/qla2xxx/qla_nvme.h +++ b/drivers/scsi/qla2xxx/qla_nvme.h @@ -145,7 +145,6 @@ struct pt_ls4_rx_unsol { int qla_nvme_register_hba(struct scsi_qla_host *); int qla_nvme_register_remote(struct scsi_qla_host *, struct fc_port *); void qla_nvme_delete(struct scsi_qla_host *); -void qla_nvme_abort(struct qla_hw_data *, struct srb *sp, int res); void qla24xx_nvme_ls4_iocb(struct scsi_qla_host *, struct pt_ls4_request *, struct req_que *); void qla24xx_async_gffid_sp_done(void *, int); diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index a13798c4d178..a32074dd4727 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -728,7 +728,7 @@ qla2x00_sp_free_dma(void *ptr) } if (!ctx) - goto end; + return; if (sp->flags & SRB_CRC_CTX_DSD_VALID) { /* List assured to be having elements */ @@ -753,12 +753,6 @@ qla2x00_sp_free_dma(void *ptr) ha->gbl_dsd_avail += ctx1->dsd_use_cnt; mempool_free(ctx1, ha->ctx_mempool); } - -end: - if (sp->type != SRB_NVME_CMD && sp->type != SRB_NVME_LS) { - CMD_SP(cmd) = NULL; - qla2x00_rel_sp(sp); - } } void @@ -766,6 +760,7 @@ qla2x00_sp_compl(void *ptr, int res) { srb_t *sp = ptr; struct scsi_cmnd *cmd = GET_CMD_SP(sp); + struct completion *comp = sp->comp; if (atomic_read(&sp->ref_count) == 0) { ql_dbg(ql_dbg_io, sp->vha, 0x3015, @@ -775,12 +770,15 @@ qla2x00_sp_compl(void *ptr, int res) WARN_ON(atomic_read(&sp->ref_count) == 0); return; } - if (!atomic_dec_and_test(&sp->ref_count)) - return; + + atomic_dec(&sp->ref_count); sp->free(sp); cmd->result = res; cmd->scsi_done(cmd); + if (comp) + complete(comp); + qla2x00_rel_sp(sp); } void @@ -803,7 +801,7 @@ qla2xxx_qpair_sp_free_dma(void *ptr) } if (!ctx) - goto end; + return; if (sp->flags & SRB_CRC_CTX_DSD_VALID) { /* List assured to be having elements */ @@ -865,10 +863,6 @@ qla2xxx_qpair_sp_free_dma(void *ptr) dma_pool_free(ha->dl_dma_pool, ctx, ctx0->crc_ctx_dma); sp->flags &= ~SRB_CRC_CTX_DMA_VALID; } - -end: - CMD_SP(cmd) = NULL; - qla2xxx_rel_qpair_sp(sp->qpair, sp); } void @@ -876,8 +870,7 @@ qla2xxx_qpair_sp_compl(void *ptr, int res) { srb_t *sp = ptr; struct scsi_cmnd *cmd = GET_CMD_SP(sp); - - cmd->result = res; + struct completion *comp = sp->comp; if (atomic_read(&sp->ref_count) == 0) { ql_dbg(ql_dbg_io, sp->fcport->vha, 0x3079, @@ -887,11 +880,16 @@ qla2xxx_qpair_sp_compl(void *ptr, int res) WARN_ON(atomic_read(&sp->ref_count) == 0); return; } - if (!atomic_dec_and_test(&sp->ref_count)) - return; + + atomic_dec(&sp->ref_count); sp->free(sp); + cmd->result = res; + CMD_SP(cmd) = NULL; cmd->scsi_done(cmd); + if (comp) + complete(comp); + qla2xxx_rel_qpair_sp(sp->qpair, sp); } static int @@ -1336,7 +1334,7 @@ qla2xxx_eh_abort(struct scsi_cmnd *cmd) int ret; unsigned int id, lun; unsigned long flags; - int rval, wait = 0; + int rval; struct qla_hw_data *ha = vha->hw; struct qla_qpair *qpair; @@ -1349,7 +1347,6 @@ qla2xxx_eh_abort(struct scsi_cmnd *cmd) ret = fc_block_scsi_eh(cmd); if (ret != 0) return ret; - ret = SUCCESS; sp = (srb_t *) CMD_SP(cmd); if (!sp) @@ -1360,7 +1357,7 @@ qla2xxx_eh_abort(struct scsi_cmnd *cmd) return SUCCESS; spin_lock_irqsave(qpair->qp_lock_ptr, flags); - if (!CMD_SP(cmd)) { + if (sp->type != SRB_SCSI_CMD || GET_CMD_SP(sp) != cmd) { /* there's a chance an interrupt could clear the ptr as part of done & free */ spin_unlock_irqrestore(qpair->qp_lock_ptr, flags); @@ -1381,58 +1378,31 @@ qla2xxx_eh_abort(struct scsi_cmnd *cmd) "Aborting from RISC nexus=%ld:%d:%u sp=%p cmd=%p handle=%x\n", vha->host_no, id, lun, sp, cmd, sp->handle); - /* Get a reference to the sp and drop the lock.*/ - rval = ha->isp_ops->abort_command(sp); - if (rval) { - if (rval == QLA_FUNCTION_PARAMETER_ERROR) - ret = SUCCESS; - else - ret = FAILED; - - ql_dbg(ql_dbg_taskm, vha, 0x8003, - "Abort command mbx failed cmd=%p, rval=%x.\n", cmd, rval); - } else { - ql_dbg(ql_dbg_taskm, vha, 0x8004, - "Abort command mbx success cmd=%p.\n", cmd); - wait = 1; - } - - spin_lock_irqsave(qpair->qp_lock_ptr, flags); - /* - * Clear the slot in the oustanding_cmds array if we can't find the - * command to reclaim the resources. - */ - if (rval == QLA_FUNCTION_PARAMETER_ERROR) - vha->req->outstanding_cmds[sp->handle] = NULL; - - /* - * sp->done will do ref_count-- - * sp_get() took an extra count above - */ - sp->done(sp, DID_RESET << 16); + ql_dbg(ql_dbg_taskm, vha, 0x8003, + "Abort command mbx cmd=%p, rval=%x.\n", cmd, rval); - /* Did the command return during mailbox execution? */ - if (ret == FAILED && !CMD_SP(cmd)) + switch (rval) { + case QLA_SUCCESS: + /* + * The command has been aborted. That means that the firmware + * won't report a completion. + */ + sp->done(sp, DID_ABORT << 16); ret = SUCCESS; - - if (!CMD_SP(cmd)) - wait = 0; - - spin_unlock_irqrestore(qpair->qp_lock_ptr, flags); - - /* Wait for the command to be returned. */ - if (wait) { - if (qla2x00_eh_wait_on_command(cmd) != QLA_SUCCESS) { - ql_log(ql_log_warn, vha, 0x8006, - "Abort handler timed out cmd=%p.\n", cmd); - ret = FAILED; - } + break; + default: + /* + * Either abort failed or abort and completion raced. Let + * the SCSI core retry the abort in the former case. + */ + ret = FAILED; + break; } ql_log(ql_log_info, vha, 0x801c, - "Abort command issued nexus=%ld:%d:%d -- %d %x.\n", - vha->host_no, id, lun, wait, ret); + "Abort command issued nexus=%ld:%d:%d -- %x.\n", + vha->host_no, id, lun, ret); return ret; } @@ -1806,34 +1776,34 @@ static void qla2x00_abort_srb(struct qla_qpair *qp, srb_t *sp, const int res, __releases(qp->qp_lock_ptr) __acquires(qp->qp_lock_ptr) { + DECLARE_COMPLETION_ONSTACK(comp); scsi_qla_host_t *vha = qp->vha; struct qla_hw_data *ha = vha->hw; + int rval; - if (sp->type == SRB_NVME_CMD || sp->type == SRB_NVME_LS) { - if (!sp_get(sp)) { - /* got sp */ - spin_unlock_irqrestore(qp->qp_lock_ptr, *flags); - qla_nvme_abort(ha, sp, res); - spin_lock_irqsave(qp->qp_lock_ptr, *flags); - } - } else if (GET_CMD_SP(sp) && !ha->flags.eeh_busy && - !test_bit(ABORT_ISP_ACTIVE, &vha->dpc_flags) && - !qla2x00_isp_reg_stat(ha) && sp->type == SRB_SCSI_CMD) { - /* - * Don't abort commands in adapter during EEH recovery as it's - * not accessible/responding. - * - * Get a reference to the sp and drop the lock. The reference - * ensures this sp->done() call and not the call in - * qla2xxx_eh_abort() ends the SCSI cmd (with result 'res'). - */ - if (!sp_get(sp)) { - spin_unlock_irqrestore(qp->qp_lock_ptr, *flags); - qla2xxx_eh_abort(GET_CMD_SP(sp)); - spin_lock_irqsave(qp->qp_lock_ptr, *flags); + if (sp_get(sp)) + return; + + if (sp->type == SRB_NVME_CMD || sp->type == SRB_NVME_LS || + (sp->type == SRB_SCSI_CMD && !ha->flags.eeh_busy && + !test_bit(ABORT_ISP_ACTIVE, &vha->dpc_flags) && + !qla2x00_isp_reg_stat(ha))) { + sp->comp = ∁ + rval = ha->isp_ops->abort_command(sp); + spin_unlock_irqrestore(qp->qp_lock_ptr, *flags); + + switch (rval) { + case QLA_SUCCESS: + sp->done(sp, res); + break; + case QLA_FUNCTION_PARAMETER_ERROR: + wait_for_completion(&comp); + break; } + + spin_lock_irqsave(qp->qp_lock_ptr, *flags); + sp->comp = NULL; } - sp->done(sp, res); } static void -- 2.13.6