@@ -144,31 +144,79 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
 }
 }

-static void mr_leaf_free_action(struct work_struct *work)
+/*
+ * This must be called after the mr has been removed from implicit_children
+ * and odp_mkeys and the SRCU synchronized. NOTE: The MR does not necessarily
+ * have to be empty here, parallel page faults could have raced with the free
+ * process and added pages to it.
+ */
+static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt)
 {
-        struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
-        int idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
-        struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;
+        struct mlx5_ib_mr *imr = mr->parent;
         struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
+        struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+        unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
         int srcu_key;

-        mr->parent = NULL;
-        synchronize_srcu(&mr->dev->odp_srcu);
+        /* implicit_child_mr's are not allowed to have deferred work */
+        WARN_ON(atomic_read(&mr->num_deferred_work));

-        if (xa_load(&mr->dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key))) {
+        if (need_imr_xlt) {
                 srcu_key = srcu_read_lock(&mr->dev->odp_srcu);
                 mutex_lock(&odp_imr->umem_mutex);
-                mlx5_ib_update_xlt(imr, idx, 1, 0,
+                mlx5_ib_update_xlt(mr->parent, idx, 1, 0,
                                    MLX5_IB_UPD_XLT_INDIRECT |
                                    MLX5_IB_UPD_XLT_ATOMIC);
                 mutex_unlock(&odp_imr->umem_mutex);
                 srcu_read_unlock(&mr->dev->odp_srcu, srcu_key);
         }
-        ib_umem_odp_release(odp);
+
+        mr->parent = NULL;
         mlx5_mr_cache_free(mr->dev, mr);
+        ib_umem_odp_release(odp);
+        atomic_dec(&imr->num_deferred_work);
+}
+
+static void free_implicit_child_mr_work(struct work_struct *work)
+{
+        struct mlx5_ib_mr *mr =
+                container_of(work, struct mlx5_ib_mr, odp_destroy.work);
+
+        free_implicit_child_mr(mr, true);
+}
+
+static void free_implicit_child_mr_rcu(struct rcu_head *head)
+{
+        struct mlx5_ib_mr *mr =
+                container_of(head, struct mlx5_ib_mr, odp_destroy.rcu);
+
+        /* Freeing a MR is a sleeping operation, so bounce to a work queue */
+        INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
+        queue_work(system_unbound_wq, &mr->odp_destroy.work);
+}
+
+static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
+{
+        struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+        unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
+        struct mlx5_ib_mr *imr = mr->parent;

-        if (atomic_dec_and_test(&imr->num_leaf_free))
-                wake_up(&imr->q_leaf_free);
+        xa_lock(&imr->implicit_children);
+        /*
+         * This can race with mlx5_ib_free_implicit_mr(), the first one to
+         * reach the xa lock wins the race and destroys the MR.
+         */
+        if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_ATOMIC) !=
+            mr)
+                goto out_unlock;
+
+        __xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
+        atomic_inc(&imr->num_deferred_work);
+        call_srcu(&mr->dev->odp_srcu, &mr->odp_destroy.rcu,
+                  free_implicit_child_mr_rcu);
+
+out_unlock:
+        xa_unlock(&imr->implicit_children);
 }

 void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
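The destroy path added in the hunk above follows a standard kernel pattern: the SRCU grace-period callback must not perform the sleeping teardown itself, so free_implicit_child_mr_rcu() only bounces the work to system_unbound_wq. Below is a minimal, self-contained sketch of that pattern; the demo_* names and the trivial kfree() teardown are illustrative assumptions, not driver code.

#include <linux/slab.h>
#include <linux/srcu.h>
#include <linux/workqueue.h>

DEFINE_SRCU(demo_srcu);                 /* stands in for dev->odp_srcu */

struct demo_child {
        struct rcu_head rcu;            /* handed to call_srcu() */
        struct work_struct work;        /* used after the grace period */
};

/* Work item: process context, so a sleeping teardown is allowed here. */
static void demo_free_work(struct work_struct *work)
{
        struct demo_child *c = container_of(work, struct demo_child, work);

        kfree(c);                       /* the driver frees the child MR here */
}

/* Grace-period callback: do not sleep, only queue the real work. */
static void demo_free_rcu(struct rcu_head *head)
{
        struct demo_child *c = container_of(head, struct demo_child, rcu);

        INIT_WORK(&c->work, demo_free_work);
        queue_work(system_unbound_wq, &c->work);
}

/* Caller: unpublish @c from every lookup structure, then defer the free. */
static void demo_defer_free(struct demo_child *c)
{
        call_srcu(&demo_srcu, &c->rcu, demo_free_rcu);
}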
@@ -240,15 +288,8 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,

         ib_umem_odp_unmap_dma_pages(umem_odp, start, end);

-        if (unlikely(!umem_odp->npages && mr->parent &&
-                     !umem_odp->dying)) {
-                xa_erase(&mr->parent->implicit_children,
-                         ib_umem_start(umem_odp) >> MLX5_IMR_MTT_SHIFT);
-                xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
-                umem_odp->dying = 1;
-                atomic_inc(&mr->parent->num_leaf_free);
-                schedule_work(&umem_odp->work);
-        }
+        if (unlikely(!umem_odp->npages && mr->parent))
+                destroy_unused_implicit_child_mr(mr);

         mutex_unlock(&umem_odp->umem_mutex);
 }

@@ -375,7 +416,6 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
         mr->mmkey.iova = idx * MLX5_IMR_MTT_SIZE;
         mr->parent = imr;
         odp->private = mr;
-        INIT_WORK(&odp->work, mr_leaf_free_action);

         err = mlx5_ib_update_xlt(mr, 0,
                                  MLX5_IMR_MTT_ENTRIES,
@@ -391,7 +431,11 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
          * Once the store to either xarray completes any error unwind has to
          * use synchronize_srcu(). Avoid this with xa_reserve()
          */
-        ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr, GFP_KERNEL);
+        ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr,
+                         GFP_KERNEL);
+        if (likely(!ret))
+                xa_store(&imr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
+                         &mr->mmkey, GFP_ATOMIC);
         if (unlikely(ret)) {
                 if (xa_is_err(ret)) {
                         ret = ERR_PTR(xa_err(ret));
@@ -404,9 +448,6 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
                 goto out_release;
         }

-        xa_store(&imr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
-                 &mr->mmkey, GFP_ATOMIC);
-
         mlx5_ib_dbg(imr->dev, "key %x mr %p\n", mr->mmkey.key, mr);
         return mr;

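With the two hunks above, implicit_get_child_mr() publishes the child's mkey in odp_mkeys only when its xa_cmpxchg() into implicit_children actually stored the entry, rather than unconditionally after the error check. As a rough sketch of the three outcomes xa_cmpxchg() can report, assuming an illustrative demo_children xarray and demo_publish() wrapper that are not part of the driver:

#include <linux/xarray.h>

static DEFINE_XARRAY(demo_children);    /* stands in for imr->implicit_children */

/*
 * Try to install @entry at @idx if the slot is still empty.
 * Returns 0 on success, -EEXIST if another thread won the race
 * (use their entry instead), or a negative errno from the xarray.
 */
static int demo_publish(unsigned long idx, void *entry)
{
        void *old;

        old = xa_cmpxchg(&demo_children, idx, NULL, entry, GFP_KERNEL);
        if (xa_is_err(old))
                return xa_err(old);     /* e.g. -ENOMEM */
        if (old)
                return -EEXIST;         /* lost the race to a parallel fault */
        return 0;                       /* stored: safe to publish the mkey */
}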
@@ -445,9 +486,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
         imr->ibmr.lkey = imr->mmkey.key;
         imr->ibmr.rkey = imr->mmkey.key;
         imr->umem = &umem_odp->umem;
-        init_waitqueue_head(&imr->q_leaf_free);
-        atomic_set(&imr->num_leaf_free, 0);
-        atomic_set(&imr->num_pending_prefetch, 0);
+        atomic_set(&imr->num_deferred_work, 0);
         xa_init(&imr->implicit_children);

         err = mlx5_ib_update_xlt(imr, 0,
@@ -477,35 +516,48 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
 {
         struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
+        struct mlx5_ib_dev *dev = imr->dev;
+        struct list_head destroy_list;
         struct mlx5_ib_mr *mtt;
+        struct mlx5_ib_mr *tmp;
         unsigned long idx;

-        mutex_lock(&odp_imr->umem_mutex);
-        xa_for_each(&imr->implicit_children, idx, mtt) {
-                struct ib_umem_odp *umem_odp = to_ib_umem_odp(mtt->umem);
+        INIT_LIST_HEAD(&destroy_list);

-                xa_erase(&imr->implicit_children, idx);
+        xa_erase(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key));
+        /*
+         * This stops the SRCU protected page fault path from touching either
+         * the imr or any children. The page fault path can only reach the
+         * children xarray via the imr.
+         */
+        synchronize_srcu(&dev->odp_srcu);

-                mutex_lock(&umem_odp->umem_mutex);
-                ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
-                                            ib_umem_end(umem_odp));
+        xa_lock(&imr->implicit_children);
+        xa_for_each(&imr->implicit_children, idx, mtt) {
+                __xa_erase(&imr->implicit_children, idx);
+                __xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mtt->mmkey.key));
+                list_add(&mtt->odp_destroy.elm, &destroy_list);
+        }
+        xa_unlock(&imr->implicit_children);

-                if (umem_odp->dying) {
-                        mutex_unlock(&umem_odp->umem_mutex);
-                        continue;
-                }
+        /* Fence access to the child pointers via the pagefault thread */
+        synchronize_srcu(&dev->odp_srcu);

-                umem_odp->dying = 1;
-                atomic_inc(&imr->num_leaf_free);
-                schedule_work(&umem_odp->work);
-                mutex_unlock(&umem_odp->umem_mutex);
+        /*
+         * num_deferred_work can only be incremented inside the odp_srcu, or
+         * under xa_lock while the child is in the xarray. Thus at this point
+         * it is only decreasing, and all work holding it is now on the wq.
+         */
+        if (atomic_read(&imr->num_deferred_work)) {
+                flush_workqueue(system_unbound_wq);
+                WARN_ON(atomic_read(&imr->num_deferred_work));
         }
-        mutex_unlock(&odp_imr->umem_mutex);

-        wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
-        WARN_ON(!xa_empty(&imr->implicit_children));
-        /* Remove any left over reserved elements */
-        xa_destroy(&imr->implicit_children);
+        list_for_each_entry_safe(mtt, tmp, &destroy_list, odp_destroy.elm)
+                free_implicit_child_mr(mtt, false);
+
+        mlx5_mr_cache_free(dev, imr);
+        ib_umem_odp_release(odp_imr);
 }

 #define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
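The rewritten mlx5_ib_free_implicit_mr() above unpublishes everything first (two synchronize_srcu() fences plus a flush of system_unbound_wq) and only then frees the children it moved onto a local list. The sketch below isolates just the collect-under-xa_lock / destroy-outside-the-lock step; the demo_* names and kfree() stand in for the driver's structures and free_implicit_child_mr(), and the SRCU/workqueue fencing from the hunk is deliberately omitted.

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/xarray.h>

struct demo_child {
        struct list_head elm;           /* mirrors mr->odp_destroy.elm */
};

/*
 * Move every child out of @children while holding the xa lock, then
 * free them with the lock dropped, since freeing may sleep.
 */
static void demo_destroy_all(struct xarray *children)
{
        struct demo_child *c, *tmp;
        unsigned long idx;
        LIST_HEAD(destroy_list);

        xa_lock(children);
        xa_for_each(children, idx, c) {
                __xa_erase(children, idx);
                list_add(&c->elm, &destroy_list);
        }
        xa_unlock(children);

        list_for_each_entry_safe(c, tmp, &destroy_list, elm)
                kfree(c);               /* driver: free_implicit_child_mr() */
}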
@@ -1579,7 +1631,7 @@ static void destroy_prefetch_work(struct prefetch_mr_work *work)
         u32 i;

         for (i = 0; i < work->num_sge; ++i)
-                atomic_dec(&work->frags[i].mr->num_pending_prefetch);
+                atomic_dec(&work->frags[i].mr->num_deferred_work);
         kvfree(work);
 }

@@ -1658,7 +1710,7 @@ static bool init_prefetch_work(struct ib_pd *pd,
                 }

                 /* Keep the MR pointer will valid outside the SRCU */
-                atomic_inc(&work->frags[i].mr->num_pending_prefetch);
+                atomic_inc(&work->frags[i].mr->num_deferred_work);
         }
         work->num_sge = num_sge;
         return true;