
Commit c7ff819

yaminfj authored and gunthorpe committed
RDMA/core: Introduce shared CQ pool API
Allow a ULP to ask the core to provide a completion queue based on a
least-used search on per-device CQ pools. The device CQ pools grow in a
lazy fashion when more CQs are requested.

This feature reduces the amount of interrupts when using many QPs.
Using shared CQs allows for more efficient completion handling. It also
reduces the amount of overhead needed for CQ contexts.

Test setup:
Intel(R) Xeon(R) Platinum 8176M CPU @ 2.10GHz servers.
Running NVMeoF 4KB read IOs over ConnectX-5EX across a Spectrum switch.
TX-depth = 32. The patch was applied in the nvme driver on both the
target and initiator. Four controllers are accessed from each core. In
the current test case we have exposed sixteen NVMe namespaces using
four different subsystems (four namespaces per subsystem) from one NVM
port. Each controller allocated X queues (RDMA QPs) and attached to Y
CQs. Before this series we had X == Y, i.e. for four controllers we
created a total of 4X QPs and 4X CQs. In the shared case, we created
4X QPs and only X CQs, which means that four controllers share a
completion queue per core. Up to fourteen cores there is no significant
change in performance, and the number of interrupts per second is less
than a million in the current case.

==================================================
|Cores|Current KIOPs  |Shared KIOPs  |improvement|
|-----|---------------|--------------|-----------|
|14   |2332           |2723          |16.7%      |
|-----|---------------|--------------|-----------|
|20   |2086           |2712          |30%        |
|-----|---------------|--------------|-----------|
|28   |1971           |2669          |35.4%      |
|=================================================
|Cores|Current avg lat|Shared avg lat|improvement|
|-----|---------------|--------------|-----------|
|14   |767us          |657us         |14.3%      |
|-----|---------------|--------------|-----------|
|20   |1225us         |943us         |23%        |
|-----|---------------|--------------|-----------|
|28   |1816us         |1341us        |26.1%      |
========================================================
|Cores|Current interrupts|Shared interrupts|improvement|
|-----|------------------|-----------------|-----------|
|14   |1.6M/sec          |0.4M/sec         |72%        |
|-----|------------------|-----------------|-----------|
|20   |2.8M/sec          |0.6M/sec         |72.4%      |
|-----|------------------|-----------------|-----------|
|28   |2.9M/sec          |0.8M/sec         |63.4%      |
====================================================================
|Cores|Current 99.99th PCTL lat|Shared 99.99th PCTL lat|improvement|
|-----|------------------------|-----------------------|-----------|
|14   |67ms                    |6ms                    |90.9%      |
|-----|------------------------|-----------------------|-----------|
|20   |5ms                     |6ms                    |-10%       |
|-----|------------------------|-----------------------|-----------|
|28   |8.7ms                   |6ms                    |25.9%      |
|===================================================================

Performance improvement with sixteen disks (sixteen CQs per core) is
comparable.

Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Yamin Friedman <[email protected]>
Reviewed-by: Or Gerlitz <[email protected]>
Reviewed-by: Max Gurtovoy <[email protected]>
Reviewed-by: Leon Romanovsky <[email protected]>
Signed-off-by: Jason Gunthorpe <[email protected]>
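For orientation, here is a minimal sketch of how a ULP might consume the new
pool API. Everything around the two pool calls (ibdev, queue_size, the QP
creation step) is hypothetical and not part of this commit:

	/* Hypothetical ULP usage (illustration only, not from this commit). */
	struct ib_cq *cq;

	cq = ib_cq_pool_get(ibdev, queue_size, -1 /* any vector */,
			    IB_POLL_SOFTIRQ);
	if (IS_ERR(cq))
		return PTR_ERR(cq);

	/* ... use cq as the send/recv CQ when creating the QP ... */

	/* On teardown, give the claimed CQ entries back to the pool. */
	ib_cq_pool_put(cq, queue_size);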
1 parent 3446cbd commit c7ff819

4 files changed: 194 additions, 1 deletion

drivers/infiniband/core/core_priv.h

Lines changed: 3 additions & 0 deletions
@@ -414,4 +414,7 @@ void rdma_umap_priv_init(struct rdma_umap_priv *priv,
 			 struct vm_area_struct *vma,
 			 struct rdma_user_mmap_entry *entry);

+void ib_cq_pool_init(struct ib_device *dev);
+void ib_cq_pool_destroy(struct ib_device *dev);
+
 #endif /* _CORE_PRIV_H */

drivers/infiniband/core/cq.c

Lines changed: 173 additions & 0 deletions
@@ -7,7 +7,11 @@
 #include <linux/slab.h>
 #include <rdma/ib_verbs.h>

+#include "core_priv.h"
+
 #include <trace/events/rdma_core.h>
+/* Max size for shared CQ, may require tuning */
+#define IB_MAX_SHARED_CQ_SZ 4096U

 /* # of WCs to poll for with a single call to ib_poll_cq */
 #define IB_POLL_BATCH 16
@@ -218,6 +222,7 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
 	cq->cq_context = private;
 	cq->poll_ctx = poll_ctx;
 	atomic_set(&cq->usecnt, 0);
+	cq->comp_vector = comp_vector;

 	cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL);
 	if (!cq->wc)
@@ -309,6 +314,8 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata)
 {
 	if (WARN_ON_ONCE(atomic_read(&cq->usecnt)))
 		return;
+	if (WARN_ON_ONCE(cq->cqe_used))
+		return;

 	switch (cq->poll_ctx) {
 	case IB_POLL_DIRECT:
@@ -334,3 +341,169 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata)
 	kfree(cq);
 }
 EXPORT_SYMBOL(ib_free_cq_user);
+
+void ib_cq_pool_init(struct ib_device *dev)
+{
+	unsigned int i;
+
+	spin_lock_init(&dev->cq_pools_lock);
+	for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++)
+		INIT_LIST_HEAD(&dev->cq_pools[i]);
+}
+
+void ib_cq_pool_destroy(struct ib_device *dev)
+{
+	struct ib_cq *cq, *n;
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++) {
+		list_for_each_entry_safe(cq, n, &dev->cq_pools[i],
+					 pool_entry) {
+			WARN_ON(cq->cqe_used);
+			cq->shared = false;
+			ib_free_cq(cq);
+		}
+	}
+}
+
+static int ib_alloc_cqs(struct ib_device *dev, unsigned int nr_cqes,
+			enum ib_poll_context poll_ctx)
+{
+	LIST_HEAD(tmp_list);
+	unsigned int nr_cqs, i;
+	struct ib_cq *cq;
+	int ret;
+
+	if (poll_ctx > IB_POLL_LAST_POOL_TYPE) {
+		WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE);
+		return -EINVAL;
+	}
+
+	/*
+	 * Allocate at least as many CQEs as requested, and otherwise
+	 * a reasonable batch size so that we can share CQs between
+	 * multiple users instead of allocating a larger number of CQs.
+	 */
+	nr_cqes = min_t(unsigned int, dev->attrs.max_cqe,
+			max(nr_cqes, IB_MAX_SHARED_CQ_SZ));
+	nr_cqs = min_t(unsigned int, dev->num_comp_vectors, num_online_cpus());
+	for (i = 0; i < nr_cqs; i++) {
+		cq = ib_alloc_cq(dev, NULL, nr_cqes, i, poll_ctx);
+		if (IS_ERR(cq)) {
+			ret = PTR_ERR(cq);
+			goto out_free_cqs;
+		}
+		cq->shared = true;
+		list_add_tail(&cq->pool_entry, &tmp_list);
+	}
+
+	spin_lock_irq(&dev->cq_pools_lock);
+	list_splice(&tmp_list, &dev->cq_pools[poll_ctx]);
+	spin_unlock_irq(&dev->cq_pools_lock);
+
+	return 0;
+
+out_free_cqs:
+	list_for_each_entry(cq, &tmp_list, pool_entry) {
+		cq->shared = false;
+		ib_free_cq(cq);
+	}
+	return ret;
+}
+
+/**
+ * ib_cq_pool_get() - Find the least used completion queue that matches
+ *   a given cpu hint (or least used for wild card affinity) and fits
+ *   nr_cqe.
+ * @dev: rdma device
+ * @nr_cqe: number of needed cqe entries
+ * @comp_vector_hint: completion vector hint (-1) for the driver to assign
+ *   a comp vector based on an internal counter
+ * @poll_ctx: cq polling context
+ *
+ * Finds a cq that satisfies @comp_vector_hint and @nr_cqe requirements and
+ * claims entries in it for us.  In case there is no available cq, allocate
+ * a new cq with the requirements and add it to the device pool.
+ * IB_POLL_DIRECT cannot be used for shared cqs so it is not a valid value
+ * for @poll_ctx.
+ */
+struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe,
+			     int comp_vector_hint,
+			     enum ib_poll_context poll_ctx)
+{
+	static unsigned int default_comp_vector;
+	unsigned int vector, num_comp_vectors;
+	struct ib_cq *cq, *found = NULL;
+	int ret;
+
+	if (poll_ctx > IB_POLL_LAST_POOL_TYPE) {
+		WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE);
+		return ERR_PTR(-EINVAL);
+	}
+
+	num_comp_vectors =
+		min_t(unsigned int, dev->num_comp_vectors, num_online_cpus());
+	/* Project the affinity to the device completion vector range */
+	if (comp_vector_hint < 0) {
+		comp_vector_hint =
+			(READ_ONCE(default_comp_vector) + 1) % num_comp_vectors;
+		WRITE_ONCE(default_comp_vector, comp_vector_hint);
+	}
+	vector = comp_vector_hint % num_comp_vectors;
+
+	/*
+	 * Find the least used CQ with correct affinity and
+	 * enough free CQ entries
+	 */
+	while (!found) {
+		spin_lock_irq(&dev->cq_pools_lock);
+		list_for_each_entry(cq, &dev->cq_pools[poll_ctx],
+				    pool_entry) {
+			/*
+			 * Check to see if we have found a CQ with the
+			 * correct completion vector
+			 */
+			if (vector != cq->comp_vector)
+				continue;
+			if (cq->cqe_used + nr_cqe > cq->cqe)
+				continue;
+			found = cq;
+			break;
+		}
+
+		if (found) {
+			found->cqe_used += nr_cqe;
+			spin_unlock_irq(&dev->cq_pools_lock);
+
+			return found;
+		}
+		spin_unlock_irq(&dev->cq_pools_lock);
+
+		/*
+		 * Didn't find a match or ran out of CQs in the device
+		 * pool, allocate a new array of CQs.
+		 */
+		ret = ib_alloc_cqs(dev, nr_cqe, poll_ctx);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+
+	return found;
+}
+EXPORT_SYMBOL(ib_cq_pool_get);
+
+/**
+ * ib_cq_pool_put - Return a CQ taken from a shared pool.
+ * @cq: The CQ to return.
+ * @nr_cqe: The max number of cqes that the user had requested.
+ */
+void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe)
+{
+	if (WARN_ON_ONCE(nr_cqe > cq->cqe_used))
+		return;
+
+	spin_lock_irq(&cq->device->cq_pools_lock);
+	cq->cqe_used -= nr_cqe;
+	spin_unlock_irq(&cq->device->cq_pools_lock);
+}
+EXPORT_SYMBOL(ib_cq_pool_put);
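A note on @comp_vector_hint as implemented above: a non-negative hint is
projected onto the device's completion vectors (hint % num_comp_vectors),
while -1 lets the core round-robin through vectors via the static
default_comp_vector counter. A hedged sketch of a per-queue caller (the
queues[] array, nr_io_queues and queue_depth are illustrative only, not part
of this commit):

	/* Illustration only: claim one pooled CQ per I/O queue, using the
	 * queue index as the affinity hint so claims spread across vectors. */
	for (i = 0; i < nr_io_queues; i++) {
		queues[i].cq = ib_cq_pool_get(ibdev, queue_depth, i,
					      IB_POLL_WORKQUEUE);
		if (IS_ERR(queues[i].cq)) {
			ret = PTR_ERR(queues[i].cq);
			goto out_put; /* unwind earlier claims with ib_cq_pool_put() */
		}
	}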

drivers/infiniband/core/device.c

Lines changed: 2 additions & 0 deletions
@@ -1393,6 +1393,7 @@ int ib_register_device(struct ib_device *device, const char *name)
 			goto dev_cleanup;
 	}

+	ib_cq_pool_init(device);
 	ret = enable_device_and_get(device);
 	dev_set_uevent_suppress(&device->dev, false);
 	/* Mark for userspace that device is ready */
@@ -1447,6 +1448,7 @@ static void __ib_unregister_device(struct ib_device *ib_dev)
 		goto out;

 	disable_device(ib_dev);
+	ib_cq_pool_destroy(ib_dev);

 	/* Expedite removing unregistered pointers from the hash table */
 	free_netdevs(ib_dev);

include/rdma/ib_verbs.h

Lines changed: 16 additions & 1 deletion
@@ -1588,10 +1588,12 @@ struct ib_ah {
 typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context);

 enum ib_poll_context {
-	IB_POLL_DIRECT,		   /* caller context, no hw completions */
 	IB_POLL_SOFTIRQ,	   /* poll from softirq context */
 	IB_POLL_WORKQUEUE,	   /* poll from workqueue */
 	IB_POLL_UNBOUND_WORKQUEUE, /* poll from unbound workqueue */
+	IB_POLL_LAST_POOL_TYPE = IB_POLL_UNBOUND_WORKQUEUE,
+
+	IB_POLL_DIRECT,		   /* caller context, no hw completions */
 };

 struct ib_cq {
@@ -1601,9 +1603,11 @@ struct ib_cq {
 	void		      (*event_handler)(struct ib_event *, void *);
 	void		       *cq_context;
 	int			cqe;
+	unsigned int		cqe_used;
 	atomic_t		usecnt; /* count number of work queues */
 	enum ib_poll_context	poll_ctx;
 	struct ib_wc		*wc;
+	struct list_head	pool_entry;
 	union {
 		struct irq_poll		iop;
 		struct work_struct	work;
@@ -1615,6 +1619,7 @@ struct ib_cq {
 	ktime_t			timestamp;
 	u8			interrupt:1;
 	u8			shared:1;
+	unsigned int		comp_vector;

 	/*
 	 * Implementation details of the RDMA core, don't use in drivers:
@@ -2734,6 +2739,10 @@ struct ib_device {
 #endif

 	u32			index;
+
+	spinlock_t		cq_pools_lock;
+	struct list_head	cq_pools[IB_POLL_LAST_POOL_TYPE + 1];
+
 	struct rdma_restrack_root *res;

 	const struct uapi_definition	*driver_def;
@@ -4037,6 +4046,12 @@ static inline int ib_req_notify_cq(struct ib_cq *cq,
 	return cq->device->ops.req_notify_cq(cq, flags);
 }

+struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe,
+			     int comp_vector_hint,
+			     enum ib_poll_context poll_ctx);
+
+void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe);
+
 /**
  * ib_req_ncomp_notif - Request completion notification when there are
  * at least the specified number of unreaped completions on the CQ.
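A brief note on the enum reshuffle above: IB_POLL_DIRECT is moved past
IB_POLL_LAST_POOL_TYPE so that the single comparison in ib_cq_pool_get() and
ib_alloc_cqs() rejects it, and so that dev->cq_pools[] only needs one list per
pooled context. Restating the values implied by the hunk (illustration only):

	/*
	 * IB_POLL_SOFTIRQ           == 0
	 * IB_POLL_WORKQUEUE         == 1
	 * IB_POLL_UNBOUND_WORKQUEUE == 2 == IB_POLL_LAST_POOL_TYPE
	 * IB_POLL_DIRECT            == 3  -> fails poll_ctx > IB_POLL_LAST_POOL_TYPE
	 * so cq_pools[IB_POLL_LAST_POOL_TYPE + 1] holds exactly three lists.
	 */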
