
Commit b70d785

RDMA/mlx5: Avoid double lookups on the pagefault path

Now that the locking is simplified, combine pagefault_implicit_mr() with
implicit_mr_get_data() so that we sweep over the idx range only once and do
the single xlt update at the end, after the child umems are set up. This
avoids the double iteration/xa_loads plus the sketchy failure path if the
xa_load() fails.

Link: https://lore.kernel.org/r/[email protected]
Reviewed-by: Artemy Kovalyov <[email protected]>
Signed-off-by: Jason Gunthorpe <[email protected]>

1 parent 3389baa
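For readers who want the shape of the change without the driver context, below is a minimal, self-contained C sketch of the pattern the new pagefault_implicit_mr() follows: a single sweep over the index range that creates missing children and faults them as it goes, tracking the span of newly created entries, then one batched update at the end. The names child_present[], fault_child() and update_xlt_range() are hypothetical stand-ins (roughly: the implicit_children xarray, pagefault_real_mr() and mlx5_ib_update_xlt()); this illustrates the control flow only, it is not the driver's code.

#include <stdio.h>
#include <stdbool.h>

#define NCHILD 8UL

/* Stand-in for the per-index child state (hypothetical). */
static bool child_present[NCHILD];

/* Stand-in for faulting one child; returns pages mapped (hypothetical). */
static int fault_child(unsigned long idx)
{
        printf("fault child %lu\n", idx);
        return 1;
}

/* Stand-in for the single batched translation update (hypothetical). */
static void update_xlt_range(unsigned long start, unsigned long len)
{
        printf("one batched update: start %lu, len %lu\n", start, len);
}

static int fault_range(unsigned long start_idx, unsigned long end_idx)
{
        unsigned long upd_start = end_idx + 1;  /* "nothing new yet" */
        unsigned long upd_len = 0;
        int npages = 0;

        /* Single sweep: create missing children and fault them as we go. */
        for (unsigned long idx = start_idx; idx <= end_idx; idx++) {
                if (!child_present[idx]) {
                        child_present[idx] = true;
                        if (idx < upd_start)
                                upd_start = idx;
                        upd_len = idx - upd_start + 1;
                }
                npages += fault_child(idx);
        }

        /* One deferred update covering every newly created child. */
        if (upd_len)
                update_xlt_range(upd_start, upd_len);
        return npages;
}

int main(void)
{
        child_present[2] = true;        /* pretend one child already exists */
        return fault_range(1, 4) == 4 ? 0 : 1;
}

The old arrangement needed two passes: implicit_mr_get_data() populated the children and issued the xlt update, then pagefault_mr() walked the same index range again with xa_load() to fault each child. That double walk is what the diff below removes.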

File tree

  • drivers/infiniband/hw/mlx5/odp.c

1 file changed, +80 -106 lines

drivers/infiniband/hw/mlx5/odp.c

Lines changed: 80 additions & 106 deletions
@@ -419,68 +419,6 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
         return ret;
 }
 
-static struct mlx5_ib_mr *implicit_mr_get_data(struct mlx5_ib_mr *imr,
-                                               u64 io_virt, size_t bcnt)
-{
-        struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
-        unsigned long end_idx = (io_virt + bcnt - 1) >> MLX5_IMR_MTT_SHIFT;
-        unsigned long idx = io_virt >> MLX5_IMR_MTT_SHIFT;
-        unsigned long inv_start_idx = end_idx + 1;
-        unsigned long inv_len = 0;
-        struct mlx5_ib_mr *result = NULL;
-        int ret;
-
-        lockdep_assert_held(&imr->dev->odp_srcu);
-
-        for (idx = idx; idx <= end_idx; idx++) {
-                struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx);
-
-                if (unlikely(!mtt)) {
-                        mtt = implicit_get_child_mr(imr, idx);
-                        if (IS_ERR(mtt)) {
-                                result = mtt;
-                                goto out;
-                        }
-                        inv_start_idx = min(inv_start_idx, idx);
-                        inv_len = idx - inv_start_idx + 1;
-                }
-
-                /* Return first odp if region not covered by single one */
-                if (likely(!result))
-                        result = mtt;
-        }
-
-        /*
-         * Any time the implicit_children are changed we must perform an
-         * update of the xlt before exiting to ensure the HW and the
-         * implicit_children remains synchronized.
-         */
-out:
-        if (likely(!inv_len))
-                return result;
-
-        /*
-         * Notice this is not strictly ordered right, the KSM is updated after
-         * the implicit_leaves is updated, so a parallel page fault could see
-         * a MR that is not yet visible in the KSM. This is similar to a
-         * parallel page fault seeing a MR that is being concurrently removed
-         * from the KSM. Both of these improbable situations are resolved
-         * safely by resuming the HW and then taking another page fault. The
-         * next pagefault handler will see the new information.
-         */
-        mutex_lock(&odp_imr->umem_mutex);
-        ret = mlx5_ib_update_xlt(imr, inv_start_idx, inv_len, 0,
-                                 MLX5_IB_UPD_XLT_INDIRECT |
-                                         MLX5_IB_UPD_XLT_ATOMIC);
-        mutex_unlock(&odp_imr->umem_mutex);
-        if (ret) {
-                mlx5_ib_err(to_mdev(imr->ibmr.pd->device),
-                            "Failed to update PAS\n");
-                return ERR_PTR(ret);
-        }
-        return result;
-}
-
 struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
                                              struct ib_udata *udata,
                                              int access_flags)
@@ -647,6 +585,84 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
         return ret;
 }
 
+static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
+                                 struct ib_umem_odp *odp_imr, u64 user_va,
+                                 size_t bcnt, u32 *bytes_mapped, u32 flags)
+{
+        unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT;
+        unsigned long upd_start_idx = end_idx + 1;
+        unsigned long upd_len = 0;
+        unsigned long npages = 0;
+        int err;
+        int ret;
+
+        if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE ||
+                     mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt))
+                return -EFAULT;
+
+        /* Fault each child mr that intersects with our interval. */
+        while (bcnt) {
+                unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT;
+                struct ib_umem_odp *umem_odp;
+                struct mlx5_ib_mr *mtt;
+                u64 len;
+
+                mtt = xa_load(&imr->implicit_children, idx);
+                if (unlikely(!mtt)) {
+                        mtt = implicit_get_child_mr(imr, idx);
+                        if (IS_ERR(mtt)) {
+                                ret = PTR_ERR(mtt);
+                                goto out;
+                        }
+                        upd_start_idx = min(upd_start_idx, idx);
+                        upd_len = idx - upd_start_idx + 1;
+                }
+
+                umem_odp = to_ib_umem_odp(mtt->umem);
+                len = min_t(u64, user_va + bcnt, ib_umem_end(umem_odp)) -
+                      user_va;
+
+                ret = pagefault_real_mr(mtt, umem_odp, user_va, len,
+                                        bytes_mapped, flags);
+                if (ret < 0)
+                        goto out;
+                user_va += len;
+                bcnt -= len;
+                npages += ret;
+        }
+
+        ret = npages;
+
+        /*
+         * Any time the implicit_children are changed we must perform an
+         * update of the xlt before exiting to ensure the HW and the
+         * implicit_children remains synchronized.
+         */
+out:
+        if (likely(!upd_len))
+                return ret;
+
+        /*
+         * Notice this is not strictly ordered right, the KSM is updated after
+         * the implicit_children is updated, so a parallel page fault could
+         * see a MR that is not yet visible in the KSM. This is similar to a
+         * parallel page fault seeing a MR that is being concurrently removed
+         * from the KSM. Both of these improbable situations are resolved
+         * safely by resuming the HW and then taking another page fault. The
+         * next pagefault handler will see the new information.
+         */
+        mutex_lock(&odp_imr->umem_mutex);
+        err = mlx5_ib_update_xlt(imr, upd_start_idx, upd_len, 0,
+                                 MLX5_IB_UPD_XLT_INDIRECT |
+                                         MLX5_IB_UPD_XLT_ATOMIC);
+        mutex_unlock(&odp_imr->umem_mutex);
+        if (err) {
+                mlx5_ib_err(imr->dev, "Failed to update PAS\n");
+                return err;
+        }
+        return ret;
+}
+
 /*
  * Returns:
  *  -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are
@@ -660,8 +676,6 @@ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
                         u32 *bytes_mapped, u32 flags)
 {
         struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
-        struct mlx5_ib_mr *mtt;
-        int npages = 0;
 
         if (!odp->is_implicit_odp) {
                 if (unlikely(io_virt < ib_umem_start(odp) ||
@@ -670,48 +684,8 @@ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
                 return pagefault_real_mr(mr, odp, io_virt, bcnt, bytes_mapped,
                                          flags);
         }
-
-        if (unlikely(io_virt >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE ||
-                     mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - io_virt < bcnt))
-                return -EFAULT;
-
-        mtt = implicit_mr_get_data(mr, io_virt, bcnt);
-        if (IS_ERR(mtt))
-                return PTR_ERR(mtt);
-
-        /* Fault each child mr that intersects with our interval. */
-        while (bcnt) {
-                struct ib_umem_odp *umem_odp = to_ib_umem_odp(mtt->umem);
-                u64 end = min_t(u64, io_virt + bcnt, ib_umem_end(umem_odp));
-                u64 len = end - io_virt;
-                int ret;
-
-                ret = pagefault_real_mr(mtt, umem_odp, io_virt, len,
-                                        bytes_mapped, flags);
-                if (ret < 0)
-                        return ret;
-                io_virt += len;
-                bcnt -= len;
-                npages += ret;
-
-                if (unlikely(bcnt)) {
-                        mtt = xa_load(&mr->implicit_children,
-                                      io_virt >> MLX5_IMR_MTT_SHIFT);
-
-                        /*
-                         * implicit_mr_get_data sets up all the leaves, this
-                         * means they got invalidated before we got to them.
-                         */
-                        if (!mtt) {
-                                mlx5_ib_dbg(
-                                        mr->dev,
-                                        "next implicit leaf removed at 0x%llx.\n",
-                                        io_virt);
-                                return -EAGAIN;
-                        }
-                }
-        }
-        return npages;
+        return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped,
+                                     flags);
 }
 
 struct pf_frame {
