
Commit 5256edc

RDMA/mlx5: Rework implicit ODP destroy
Use SRCU in a sensible way by removing all MRs in the implicit tree from the two xarrays (the update operation), then a synchronize, followed by a normal single-threaded teardown. This is only a little unusual from the normal pattern as there can still be some work pending in the unbound wq that may also require a workqueue flush. This is tracked with a single atomic, consolidating the redundant existing atomics and wait queue.

For understandability, the entire ODP implicit create/destroy flow now largely exists in a single pair of functions within odp.c, with a few support functions for tearing down an unused child.

Link: https://lore.kernel.org/r/[email protected]
Reviewed-by: Artemy Kovalyov <[email protected]>
Signed-off-by: Jason Gunthorpe <[email protected]>
1 parent: b70d785
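The ordering described above is a general SRCU teardown sequence: unpublish the object from its lookup structure, wait out the SRCU readers, flush any deferred work those readers queued, then tear down single-threaded. The sketch below is illustrative only -- the struct, field, and function names are invented and are not part of this commit -- but it mirrors the shape that dereg_mr() and mlx5_ib_free_implicit_mr() take after this change.

/*
 * Illustrative sketch only -- invented names, not part of the commit.
 * Shows the unpublish -> synchronize_srcu -> flush deferred work -> free
 * ordering described in the commit message.
 */
#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/slab.h>
#include <linux/srcu.h>
#include <linux/workqueue.h>
#include <linux/xarray.h>

struct obj {
        unsigned long id;
        atomic_t num_deferred_work;     /* deferred wq items still using obj */
};

static void obj_destroy(struct xarray *lookup, struct srcu_struct *srcu,
                        struct obj *obj)
{
        /* 1) Unpublish: new readers can no longer find the object. */
        xa_erase(lookup, obj->id);

        /* 2) Wait for readers that might still hold a pointer to it. */
        synchronize_srcu(srcu);

        /*
         * 3) Any deferred work those readers queued is now visible in the
         *    counter; flush the unbound workqueue so it has all run.
         */
        if (atomic_read(&obj->num_deferred_work)) {
                flush_workqueue(system_unbound_wq);
                WARN_ON(atomic_read(&obj->num_deferred_work));
        }

        /* 4) Single-threaded teardown: nothing else can reach obj now. */
        kfree(obj);
}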

5 files changed, +120 -66 lines changed

drivers/infiniband/hw/mlx5/main.c

Lines changed: 0 additions & 2 deletions
@@ -6146,8 +6146,6 @@ static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
 {
         mlx5_ib_cleanup_multiport_master(dev);
         WARN_ON(!xa_empty(&dev->odp_mkeys));
-        if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
-                srcu_barrier(&dev->odp_srcu);
         cleanup_srcu_struct(&dev->odp_srcu);
 
         WARN_ON(!xa_empty(&dev->sig_mrs));

drivers/infiniband/hw/mlx5/mlx5_ib.h

Lines changed: 6 additions & 3 deletions
@@ -618,10 +618,13 @@ struct mlx5_ib_mr {
         u64 pi_iova;
 
         /* For ODP and implicit */
-        atomic_t num_leaf_free;
-        wait_queue_head_t q_leaf_free;
-        atomic_t num_pending_prefetch;
+        atomic_t num_deferred_work;
         struct xarray implicit_children;
+        union {
+                struct rcu_head rcu;
+                struct list_head elm;
+                struct work_struct work;
+        } odp_destroy;
 
         struct mlx5_async_work cb_work;
 };

drivers/infiniband/hw/mlx5/mr.c

Lines changed: 12 additions & 9 deletions
@@ -1317,7 +1317,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 
         if (is_odp_mr(mr)) {
                 to_ib_umem_odp(mr->umem)->private = mr;
-                atomic_set(&mr->num_pending_prefetch, 0);
+                atomic_set(&mr->num_deferred_work, 0);
                 err = xa_err(xa_store(&dev->odp_mkeys,
                                       mlx5_base_mkey(mr->mmkey.key), &mr->mmkey,
                                       GFP_KERNEL));
@@ -1573,17 +1573,15 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
                 synchronize_srcu(&dev->odp_srcu);
 
                 /* dequeue pending prefetch requests for the mr */
-                if (atomic_read(&mr->num_pending_prefetch))
+                if (atomic_read(&mr->num_deferred_work)) {
                         flush_workqueue(system_unbound_wq);
-                WARN_ON(atomic_read(&mr->num_pending_prefetch));
+                        WARN_ON(atomic_read(&mr->num_deferred_work));
+                }
 
                 /* Destroy all page mappings */
-                if (!umem_odp->is_implicit_odp)
-                        mlx5_ib_invalidate_range(umem_odp,
-                                                 ib_umem_start(umem_odp),
-                                                 ib_umem_end(umem_odp));
-                else
-                        mlx5_ib_free_implicit_mr(mr);
+                mlx5_ib_invalidate_range(umem_odp, ib_umem_start(umem_odp),
+                                         ib_umem_end(umem_odp));
+
                 /*
                  * We kill the umem before the MR for ODP,
                  * so that there will not be any invalidations in
@@ -1620,6 +1618,11 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
                 dereg_mr(to_mdev(mmr->klm_mr->ibmr.device), mmr->klm_mr);
         }
 
+        if (is_odp_mr(mmr) && to_ib_umem_odp(mmr->umem)->is_implicit_odp) {
+                mlx5_ib_free_implicit_mr(mmr);
+                return 0;
+        }
+
         dereg_mr(to_mdev(ibmr->device), mmr);
 
         return 0;

drivers/infiniband/hw/mlx5/odp.c

Lines changed: 102 additions & 50 deletions
@@ -144,31 +144,79 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
         }
 }
 
-static void mr_leaf_free_action(struct work_struct *work)
+/*
+ * This must be called after the mr has been removed from implicit_children
+ * and odp_mkeys and the SRCU synchronized. NOTE: The MR does not necessarily
+ * have to be empty here, parallel page faults could have raced with the free
+ * process and added pages to it.
+ */
+static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt)
 {
-        struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
-        int idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
-        struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;
+        struct mlx5_ib_mr *imr = mr->parent;
         struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
+        struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+        unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
         int srcu_key;
 
-        mr->parent = NULL;
-        synchronize_srcu(&mr->dev->odp_srcu);
+        /* implicit_child_mr's are not allowed to have deferred work */
+        WARN_ON(atomic_read(&mr->num_deferred_work));
 
-        if (xa_load(&mr->dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key))) {
+        if (need_imr_xlt) {
                 srcu_key = srcu_read_lock(&mr->dev->odp_srcu);
                 mutex_lock(&odp_imr->umem_mutex);
-                mlx5_ib_update_xlt(imr, idx, 1, 0,
+                mlx5_ib_update_xlt(mr->parent, idx, 1, 0,
                                    MLX5_IB_UPD_XLT_INDIRECT |
                                    MLX5_IB_UPD_XLT_ATOMIC);
                 mutex_unlock(&odp_imr->umem_mutex);
                 srcu_read_unlock(&mr->dev->odp_srcu, srcu_key);
         }
-        ib_umem_odp_release(odp);
+
+        mr->parent = NULL;
         mlx5_mr_cache_free(mr->dev, mr);
+        ib_umem_odp_release(odp);
+        atomic_dec(&imr->num_deferred_work);
+}
+
+static void free_implicit_child_mr_work(struct work_struct *work)
+{
+        struct mlx5_ib_mr *mr =
+                container_of(work, struct mlx5_ib_mr, odp_destroy.work);
+
+        free_implicit_child_mr(mr, true);
+}
+
+static void free_implicit_child_mr_rcu(struct rcu_head *head)
+{
+        struct mlx5_ib_mr *mr =
+                container_of(head, struct mlx5_ib_mr, odp_destroy.rcu);
+
+        /* Freeing a MR is a sleeping operation, so bounce to a work queue */
+        INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
+        queue_work(system_unbound_wq, &mr->odp_destroy.work);
+}
+
+static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
+{
+        struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+        unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
+        struct mlx5_ib_mr *imr = mr->parent;
 
-        if (atomic_dec_and_test(&imr->num_leaf_free))
-                wake_up(&imr->q_leaf_free);
+        xa_lock(&imr->implicit_children);
+        /*
+         * This can race with mlx5_ib_free_implicit_mr(), the first one to
+         * reach the xa lock wins the race and destroys the MR.
+         */
+        if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_ATOMIC) !=
+            mr)
+                goto out_unlock;
+
+        __xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
+        atomic_inc(&imr->num_deferred_work);
+        call_srcu(&mr->dev->odp_srcu, &mr->odp_destroy.rcu,
+                  free_implicit_child_mr_rcu);
+
+out_unlock:
+        xa_unlock(&imr->implicit_children);
 }
 
 void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
@@ -240,15 +288,8 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
 
         ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
 
-        if (unlikely(!umem_odp->npages && mr->parent &&
-                     !umem_odp->dying)) {
-                xa_erase(&mr->parent->implicit_children,
-                         ib_umem_start(umem_odp) >> MLX5_IMR_MTT_SHIFT);
-                xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
-                umem_odp->dying = 1;
-                atomic_inc(&mr->parent->num_leaf_free);
-                schedule_work(&umem_odp->work);
-        }
+        if (unlikely(!umem_odp->npages && mr->parent))
+                destroy_unused_implicit_child_mr(mr);
         mutex_unlock(&umem_odp->umem_mutex);
 }
 
@@ -375,7 +416,6 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
         mr->mmkey.iova = idx * MLX5_IMR_MTT_SIZE;
         mr->parent = imr;
         odp->private = mr;
-        INIT_WORK(&odp->work, mr_leaf_free_action);
 
         err = mlx5_ib_update_xlt(mr, 0,
                                  MLX5_IMR_MTT_ENTRIES,
@@ -391,7 +431,11 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
          * Once the store to either xarray completes any error unwind has to
          * use synchronize_srcu(). Avoid this with xa_reserve()
          */
-        ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr, GFP_KERNEL);
+        ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr,
+                         GFP_KERNEL);
+        if (likely(!ret))
+                xa_store(&imr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
+                         &mr->mmkey, GFP_ATOMIC);
         if (unlikely(ret)) {
                 if (xa_is_err(ret)) {
                         ret = ERR_PTR(xa_err(ret));
@@ -404,9 +448,6 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
                 goto out_release;
         }
 
-        xa_store(&imr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
-                 &mr->mmkey, GFP_ATOMIC);
-
         mlx5_ib_dbg(imr->dev, "key %x mr %p\n", mr->mmkey.key, mr);
         return mr;
 
@@ -445,9 +486,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
         imr->ibmr.lkey = imr->mmkey.key;
         imr->ibmr.rkey = imr->mmkey.key;
         imr->umem = &umem_odp->umem;
-        init_waitqueue_head(&imr->q_leaf_free);
-        atomic_set(&imr->num_leaf_free, 0);
-        atomic_set(&imr->num_pending_prefetch, 0);
+        atomic_set(&imr->num_deferred_work, 0);
         xa_init(&imr->implicit_children);
 
         err = mlx5_ib_update_xlt(imr, 0,
@@ -477,35 +516,48 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
 {
         struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
+        struct mlx5_ib_dev *dev = imr->dev;
+        struct list_head destroy_list;
         struct mlx5_ib_mr *mtt;
+        struct mlx5_ib_mr *tmp;
         unsigned long idx;
 
-        mutex_lock(&odp_imr->umem_mutex);
-        xa_for_each (&imr->implicit_children, idx, mtt) {
-                struct ib_umem_odp *umem_odp = to_ib_umem_odp(mtt->umem);
+        INIT_LIST_HEAD(&destroy_list);
 
-                xa_erase(&imr->implicit_children, idx);
+        xa_erase(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key));
+        /*
+         * This stops the SRCU protected page fault path from touching either
+         * the imr or any children. The page fault path can only reach the
+         * children xarray via the imr.
+         */
+        synchronize_srcu(&dev->odp_srcu);
 
-                mutex_lock(&umem_odp->umem_mutex);
-                ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
-                                            ib_umem_end(umem_odp));
+        xa_lock(&imr->implicit_children);
+        xa_for_each (&imr->implicit_children, idx, mtt) {
+                __xa_erase(&imr->implicit_children, idx);
+                __xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mtt->mmkey.key));
+                list_add(&mtt->odp_destroy.elm, &destroy_list);
+        }
+        xa_unlock(&imr->implicit_children);
 
-                if (umem_odp->dying) {
-                        mutex_unlock(&umem_odp->umem_mutex);
-                        continue;
-                }
+        /* Fence access to the child pointers via the pagefault thread */
+        synchronize_srcu(&dev->odp_srcu);
 
-                umem_odp->dying = 1;
-                atomic_inc(&imr->num_leaf_free);
-                schedule_work(&umem_odp->work);
-                mutex_unlock(&umem_odp->umem_mutex);
+        /*
+         * num_deferred_work can only be incremented inside the odp_srcu, or
+         * under xa_lock while the child is in the xarray. Thus at this point
+         * it is only decreasing, and all work holding it is now on the wq.
+         */
+        if (atomic_read(&imr->num_deferred_work)) {
+                flush_workqueue(system_unbound_wq);
+                WARN_ON(atomic_read(&imr->num_deferred_work));
         }
-        mutex_unlock(&odp_imr->umem_mutex);
 
-        wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
-        WARN_ON(!xa_empty(&imr->implicit_children));
-        /* Remove any left over reserved elements */
-        xa_destroy(&imr->implicit_children);
+        list_for_each_entry_safe (mtt, tmp, &destroy_list, odp_destroy.elm)
+                free_implicit_child_mr(mtt, false);
+
+        mlx5_mr_cache_free(dev, imr);
+        ib_umem_odp_release(odp_imr);
 }
 
 #define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
@@ -1579,7 +1631,7 @@ static void destroy_prefetch_work(struct prefetch_mr_work *work)
         u32 i;
 
         for (i = 0; i < work->num_sge; ++i)
-                atomic_dec(&work->frags[i].mr->num_pending_prefetch);
+                atomic_dec(&work->frags[i].mr->num_deferred_work);
         kvfree(work);
 }
 
@@ -1658,7 +1710,7 @@ static bool init_prefetch_work(struct ib_pd *pd,
                 }
 
                 /* Keep the MR pointer will valid outside the SRCU */
-                atomic_inc(&work->frags[i].mr->num_pending_prefetch);
+                atomic_inc(&work->frags[i].mr->num_deferred_work);
         }
         work->num_sge = num_sge;
         return true;
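One more pattern worth calling out from the odp.c hunk above: free_implicit_child_mr_rcu() cannot destroy the MR directly because SRCU callbacks run in a context that may not sleep, so it re-queues the destroy onto system_unbound_wq, reusing the odp_destroy union (the rcu_head is no longer needed once the callback runs, so overwriting it with the work_struct is safe). The reduction below is illustrative only, with invented names; it is not part of the commit.

/* Illustrative reduction only -- invented names, not part of the commit. */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/srcu.h>
#include <linux/workqueue.h>

struct child {
        union {                          /* stages are mutually exclusive */
                struct rcu_head rcu;     /* while waiting out SRCU readers */
                struct work_struct work; /* while waiting for process context */
        } destroy;
};

static void child_free_work(struct work_struct *work)
{
        struct child *c = container_of(work, struct child, destroy.work);

        /* Process context: sleeping teardown (e.g. HW commands) is allowed. */
        kfree(c);
}

static void child_free_rcu(struct rcu_head *head)
{
        struct child *c = container_of(head, struct child, destroy.rcu);

        /* Callback context may not sleep; bounce to a workqueue instead. */
        INIT_WORK(&c->destroy.work, child_free_work);
        queue_work(system_unbound_wq, &c->destroy.work);
}

/* Caller, after removing 'c' from the SRCU-protected lookup structures: */
static void child_defer_free(struct srcu_struct *srcu, struct child *c)
{
        call_srcu(srcu, &c->destroy.rcu, child_free_rcu);
}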

include/rdma/ib_umem_odp.h

Lines changed: 0 additions & 2 deletions
@@ -78,9 +78,7 @@ struct ib_umem_odp {
         bool is_implicit_odp;
 
         struct completion notifier_completion;
-        int dying;
         unsigned int page_shift;
-        struct work_struct work;
 };
 
 static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem)
