Skip to content

Commit 6e04b10

Browse files
Devesh Sharmajgunthorpe
Devesh Sharma
authored andcommitted
RDMA/bnxt_re: Fix broken RoCE driver due to recent L2 driver changes
The recent changes in Broadcom's ethernet driver (L2 driver) broke RoCE functionality in terms of MSIx vector allocation and de-allocation. The L2 driver may initiate MSIx vector reallocation depending upon requests coming from the administrator. In such cases the L2 driver needs to free all the MSIx vectors allocated previously and reallocate/initialize them. If the RoCE driver is loaded and reshuffling is attempted, the RoCE driver would still be holding the MSIx vectors while the L2 driver attempts to free those in-use vectors, leading to a kernel crash. Make changes in the RoCE driver to fix the crashes described above. As part of the solution, the L2 driver tells the RoCE driver to release the MSIx vectors whenever there is a need. When the RoCE driver gets this message, it syncs up with all the running tasklets and IRQ handlers and releases the vectors. The L2 driver then sends one more message to the RoCE driver to resume the MSIx vectors. The L2 driver guarantees that the RoCE vectors do not change during reshuffling. Fixes: ec86f14 ("bnxt_en: Add ULP calls to stop and restart IRQs.") Fixes: 08654eb ("bnxt_en: Change IRQ assignment for RDMA driver.") Signed-off-by: Devesh Sharma <[email protected]> Signed-off-by: Jason Gunthorpe <[email protected]>
1 parent 55ba49c commit 6e04b10

File tree

5 files changed

+163
-53
lines changed

5 files changed

+163
-53
lines changed

drivers/infiniband/hw/bnxt_re/main.c

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -185,12 +185,65 @@ static void bnxt_re_shutdown(void *p)
185185
bnxt_re_ib_unreg(rdev, false);
186186
}
187187

188+
static void bnxt_re_stop_irq(void *handle)
189+
{
190+
struct bnxt_re_dev *rdev = (struct bnxt_re_dev *)handle;
191+
struct bnxt_qplib_rcfw *rcfw = &rdev->rcfw;
192+
struct bnxt_qplib_nq *nq;
193+
int indx;
194+
195+
for (indx = BNXT_RE_NQ_IDX; indx < rdev->num_msix; indx++) {
196+
nq = &rdev->nq[indx - 1];
197+
bnxt_qplib_nq_stop_irq(nq, false);
198+
}
199+
200+
bnxt_qplib_rcfw_stop_irq(rcfw, false);
201+
}
202+
203+
static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent)
204+
{
205+
struct bnxt_re_dev *rdev = (struct bnxt_re_dev *)handle;
206+
struct bnxt_msix_entry *msix_ent = rdev->msix_entries;
207+
struct bnxt_qplib_rcfw *rcfw = &rdev->rcfw;
208+
struct bnxt_qplib_nq *nq;
209+
int indx, rc;
210+
211+
if (!ent) {
212+
/* Not setting the f/w timeout bit in rcfw.
213+
* During the driver unload the first command
214+
* to f/w will timeout and that will set the
215+
* timeout bit.
216+
*/
217+
dev_err(rdev_to_dev(rdev), "Failed to re-start IRQs\n");
218+
return;
219+
}
220+
221+
/* Vectors may change after restart, so update with new vectors
222+
* in device sctructure.
223+
*/
224+
for (indx = 0; indx < rdev->num_msix; indx++)
225+
rdev->msix_entries[indx].vector = ent[indx].vector;
226+
227+
bnxt_qplib_rcfw_start_irq(rcfw, msix_ent[BNXT_RE_AEQ_IDX].vector,
228+
false);
229+
for (indx = BNXT_RE_NQ_IDX ; indx < rdev->num_msix; indx++) {
230+
nq = &rdev->nq[indx - 1];
231+
rc = bnxt_qplib_nq_start_irq(nq, indx - 1,
232+
msix_ent[indx].vector, false);
233+
if (rc)
234+
dev_warn(rdev_to_dev(rdev),
235+
"Failed to reinit NQ index %d\n", indx - 1);
236+
}
237+
}
238+
188239
static struct bnxt_ulp_ops bnxt_re_ulp_ops = {
189240
.ulp_async_notifier = NULL,
190241
.ulp_stop = bnxt_re_stop,
191242
.ulp_start = bnxt_re_start,
192243
.ulp_sriov_config = bnxt_re_sriov_config,
193-
.ulp_shutdown = bnxt_re_shutdown
244+
.ulp_shutdown = bnxt_re_shutdown,
245+
.ulp_irq_stop = bnxt_re_stop_irq,
246+
.ulp_irq_restart = bnxt_re_start_irq
194247
};
195248

196249
/* RoCE -> Net driver */

drivers/infiniband/hw/bnxt_re/qplib_fp.c

Lines changed: 60 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -336,22 +336,32 @@ static irqreturn_t bnxt_qplib_nq_irq(int irq, void *dev_instance)
336336
return IRQ_HANDLED;
337337
}
338338

339+
/*
 * bnxt_qplib_nq_stop_irq - quiesce and release the interrupt of one NQ.
 * @nq:   notification queue to stop
 * @kill: when true, also tasklet_kill() the NQ worker
 *
 * The worker tasklet is disabled first, the doorbell is written to mask
 * the h/w interrupt, and any handler still running is waited for. The
 * IRQ and its affinity hint are released only if it is currently
 * requested.
 */
void bnxt_qplib_nq_stop_irq(struct bnxt_qplib_nq *nq, bool kill)
{
	tasklet_disable(&nq->worker);

	/* Mask h/w interrupt */
	NQ_DB(nq->bar_reg_iomem, nq->hwq.cons, nq->hwq.max_elements);

	/* Sync with last running IRQ handler */
	synchronize_irq(nq->vector);

	if (kill)
		tasklet_kill(&nq->worker);

	/* Nothing to release when the IRQ was never (or is no longer) held. */
	if (!nq->requested)
		return;

	irq_set_affinity_hint(nq->vector, NULL);
	free_irq(nq->vector, nq);
	nq->requested = false;
}
354+
339355
void bnxt_qplib_disable_nq(struct bnxt_qplib_nq *nq)
340356
{
341357
if (nq->cqn_wq) {
342358
destroy_workqueue(nq->cqn_wq);
343359
nq->cqn_wq = NULL;
344360
}
361+
345362
/* Make sure the HW is stopped! */
346-
synchronize_irq(nq->vector);
347-
tasklet_disable(&nq->worker);
348-
tasklet_kill(&nq->worker);
363+
bnxt_qplib_nq_stop_irq(nq, true);
349364

350-
if (nq->requested) {
351-
irq_set_affinity_hint(nq->vector, NULL);
352-
free_irq(nq->vector, nq);
353-
nq->requested = false;
354-
}
355365
if (nq->bar_reg_iomem)
356366
iounmap(nq->bar_reg_iomem);
357367
nq->bar_reg_iomem = NULL;
@@ -361,6 +371,40 @@ void bnxt_qplib_disable_nq(struct bnxt_qplib_nq *nq)
361371
nq->vector = 0;
362372
}
363373

374+
/*
 * bnxt_qplib_nq_start_irq - request the interrupt for one NQ.
 * @nq:          notification queue to start
 * @nq_indx:     NQ index; used in the IRQ name and as the CPU number
 *               placed in the affinity hint mask
 * @msix_vector: vector to request
 * @need_init:   true to tasklet_init() the worker, false to re-enable
 *               an already initialised worker
 *
 * Returns -EFAULT if the IRQ is already requested, the request_irq()
 * error if the request fails, otherwise the irq_set_affinity_hint()
 * result. Note that a failed affinity hint is returned to the caller
 * even though the IRQ remains requested and the doorbell has been
 * written via NQ_DB_REARM.
 */
int bnxt_qplib_nq_start_irq(struct bnxt_qplib_nq *nq, int nq_indx,
			    int msix_vector, bool need_init)
{
	int status;

	if (nq->requested)
		return -EFAULT;

	nq->vector = msix_vector;

	if (!need_init)
		tasklet_enable(&nq->worker);
	else
		tasklet_init(&nq->worker, bnxt_qplib_service_nq,
			     (unsigned long)nq);

	snprintf(nq->name, sizeof(nq->name), "bnxt_qplib_nq-%d", nq_indx);

	status = request_irq(nq->vector, bnxt_qplib_nq_irq, 0, nq->name, nq);
	if (status)
		return status;

	/* Hint that this vector should run on CPU number nq_indx. */
	cpumask_clear(&nq->mask);
	cpumask_set_cpu(nq_indx, &nq->mask);
	status = irq_set_affinity_hint(nq->vector, &nq->mask);
	if (status)
		dev_warn(&nq->pdev->dev,
			 "QPLIB: set affinity failed; vector: %d nq_idx: %d\n",
			 nq->vector, nq_indx);

	nq->requested = true;
	NQ_DB_REARM(nq->bar_reg_iomem, nq->hwq.cons, nq->hwq.max_elements);

	return status;
}
407+
364408
int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq,
365409
int nq_idx, int msix_vector, int bar_reg_offset,
366410
int (*cqn_handler)(struct bnxt_qplib_nq *nq,
@@ -372,41 +416,17 @@ int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq,
372416
resource_size_t nq_base;
373417
int rc = -1;
374418

375-
nq->pdev = pdev;
376-
nq->vector = msix_vector;
377419
if (cqn_handler)
378420
nq->cqn_handler = cqn_handler;
379421

380422
if (srqn_handler)
381423
nq->srqn_handler = srqn_handler;
382424

383-
tasklet_init(&nq->worker, bnxt_qplib_service_nq, (unsigned long)nq);
384-
385425
/* Have a task to schedule CQ notifiers in post send case */
386426
nq->cqn_wq = create_singlethread_workqueue("bnxt_qplib_nq");
387427
if (!nq->cqn_wq)
388-
goto fail;
389-
390-
nq->requested = false;
391-
memset(nq->name, 0, 32);
392-
sprintf(nq->name, "bnxt_qplib_nq-%d", nq_idx);
393-
rc = request_irq(nq->vector, bnxt_qplib_nq_irq, 0, nq->name, nq);
394-
if (rc) {
395-
dev_err(&nq->pdev->dev,
396-
"Failed to request IRQ for NQ: %#x", rc);
397-
goto fail;
398-
}
399-
400-
cpumask_clear(&nq->mask);
401-
cpumask_set_cpu(nq_idx, &nq->mask);
402-
rc = irq_set_affinity_hint(nq->vector, &nq->mask);
403-
if (rc) {
404-
dev_warn(&nq->pdev->dev,
405-
"QPLIB: set affinity failed; vector: %d nq_idx: %d\n",
406-
nq->vector, nq_idx);
407-
}
428+
return -ENOMEM;
408429

409-
nq->requested = true;
410430
nq->bar_reg = NQ_CONS_PCI_BAR_REGION;
411431
nq->bar_reg_off = bar_reg_offset;
412432
nq_base = pci_resource_start(pdev, nq->bar_reg);
@@ -419,7 +439,13 @@ int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq,
419439
rc = -ENOMEM;
420440
goto fail;
421441
}
422-
NQ_DB_REARM(nq->bar_reg_iomem, nq->hwq.cons, nq->hwq.max_elements);
442+
443+
rc = bnxt_qplib_nq_start_irq(nq, nq_idx, msix_vector, true);
444+
if (rc) {
445+
dev_err(&nq->pdev->dev,
446+
"QPLIB: Failed to request irq for nq-idx %d", nq_idx);
447+
goto fail;
448+
}
423449

424450
return 0;
425451
fail:

drivers/infiniband/hw/bnxt_re/qplib_fp.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -467,7 +467,10 @@ struct bnxt_qplib_nq_work {
467467
struct bnxt_qplib_cq *cq;
468468
};
469469

470+
void bnxt_qplib_nq_stop_irq(struct bnxt_qplib_nq *nq, bool kill);
470471
void bnxt_qplib_disable_nq(struct bnxt_qplib_nq *nq);
472+
int bnxt_qplib_nq_start_irq(struct bnxt_qplib_nq *nq, int nq_indx,
473+
int msix_vector, bool need_init);
471474
int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq,
472475
int nq_idx, int msix_vector, int bar_reg_offset,
473476
int (*cqn_handler)(struct bnxt_qplib_nq *nq,

drivers/infiniband/hw/bnxt_re/qplib_rcfw.c

Lines changed: 43 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -582,19 +582,29 @@ int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev,
582582
return -ENOMEM;
583583
}
584584

585-
void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw)
585+
void bnxt_qplib_rcfw_stop_irq(struct bnxt_qplib_rcfw *rcfw, bool kill)
586586
{
587-
unsigned long indx;
588-
589-
/* Make sure the HW channel is stopped! */
590-
synchronize_irq(rcfw->vector);
591587
tasklet_disable(&rcfw->worker);
592-
tasklet_kill(&rcfw->worker);
588+
/* Mask h/w interrupts */
589+
CREQ_DB(rcfw->creq_bar_reg_iomem, rcfw->creq.cons,
590+
rcfw->creq.max_elements);
591+
/* Sync with last running IRQ-handler */
592+
synchronize_irq(rcfw->vector);
593+
if (kill)
594+
tasklet_kill(&rcfw->worker);
593595

594596
if (rcfw->requested) {
595597
free_irq(rcfw->vector, rcfw);
596598
rcfw->requested = false;
597599
}
600+
}
601+
602+
void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw)
603+
{
604+
unsigned long indx;
605+
606+
bnxt_qplib_rcfw_stop_irq(rcfw, true);
607+
598608
if (rcfw->cmdq_bar_reg_iomem)
599609
iounmap(rcfw->cmdq_bar_reg_iomem);
600610
rcfw->cmdq_bar_reg_iomem = NULL;
@@ -614,6 +624,31 @@ void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw)
614624
rcfw->vector = 0;
615625
}
616626

627+
int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector,
628+
bool need_init)
629+
{
630+
int rc;
631+
632+
if (rcfw->requested)
633+
return -EFAULT;
634+
635+
rcfw->vector = msix_vector;
636+
if (need_init)
637+
tasklet_init(&rcfw->worker,
638+
bnxt_qplib_service_creq, (unsigned long)rcfw);
639+
else
640+
tasklet_enable(&rcfw->worker);
641+
rc = request_irq(rcfw->vector, bnxt_qplib_creq_irq, 0,
642+
"bnxt_qplib_creq", rcfw);
643+
if (rc)
644+
return rc;
645+
rcfw->requested = true;
646+
CREQ_DB_REARM(rcfw->creq_bar_reg_iomem, rcfw->creq.cons,
647+
rcfw->creq.max_elements);
648+
649+
return 0;
650+
}
651+
617652
int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev,
618653
struct bnxt_qplib_rcfw *rcfw,
619654
int msix_vector,
@@ -675,27 +710,17 @@ int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev,
675710
rcfw->creq_qp_event_processed = 0;
676711
rcfw->creq_func_event_processed = 0;
677712

678-
rcfw->vector = msix_vector;
679713
if (aeq_handler)
680714
rcfw->aeq_handler = aeq_handler;
715+
init_waitqueue_head(&rcfw->waitq);
681716

682-
tasklet_init(&rcfw->worker, bnxt_qplib_service_creq,
683-
(unsigned long)rcfw);
684-
685-
rcfw->requested = false;
686-
rc = request_irq(rcfw->vector, bnxt_qplib_creq_irq, 0,
687-
"bnxt_qplib_creq", rcfw);
717+
rc = bnxt_qplib_rcfw_start_irq(rcfw, msix_vector, true);
688718
if (rc) {
689719
dev_err(&rcfw->pdev->dev,
690720
"QPLIB: Failed to request IRQ for CREQ rc = 0x%x", rc);
691721
bnxt_qplib_disable_rcfw_channel(rcfw);
692722
return rc;
693723
}
694-
rcfw->requested = true;
695-
696-
init_waitqueue_head(&rcfw->waitq);
697-
698-
CREQ_DB_REARM(rcfw->creq_bar_reg_iomem, 0, rcfw->creq.max_elements);
699724

700725
init.cmdq_pbl = cpu_to_le64(rcfw->cmdq.pbl[PBL_LVL_0].pg_map_arr[0]);
701726
init.cmdq_size_cmdq_lvl = cpu_to_le16(

drivers/infiniband/hw/bnxt_re/qplib_rcfw.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,10 @@ struct bnxt_qplib_rcfw {
195195
void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw);
196196
int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev,
197197
struct bnxt_qplib_rcfw *rcfw, int qp_tbl_sz);
198+
void bnxt_qplib_rcfw_stop_irq(struct bnxt_qplib_rcfw *rcfw, bool kill);
198199
void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw);
200+
int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector,
201+
bool need_init);
199202
int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev,
200203
struct bnxt_qplib_rcfw *rcfw,
201204
int msix_vector,

0 commit comments

Comments
 (0)