Skip to content

Commit 3a83f67

Browse files
mdrothmpe
authored and committed
KVM: PPC: Book3S HV: use smp_mb() when setting/clearing host_ipi flag
On a 2-socket Power9 system with 32 cores/128 threads (SMT4) and 1TB of memory running the following guest configs: guest A: - 224GB of memory - 56 VCPUs (sockets=1,cores=28,threads=2), where: VCPUs 0-1 are pinned to CPUs 0-3, VCPUs 2-3 are pinned to CPUs 4-7, ... VCPUs 54-55 are pinned to CPUs 108-111 guest B: - 4GB of memory - 4 VCPUs (sockets=1,cores=4,threads=1) with the following workloads (with KSM and THP enabled in all): guest A: stress --cpu 40 --io 20 --vm 20 --vm-bytes 512M guest B: stress --cpu 4 --io 4 --vm 4 --vm-bytes 512M host: stress --cpu 4 --io 4 --vm 2 --vm-bytes 256M the below soft-lockup traces were observed after an hour or so and persisted until the host was reset (this was found to be reliably reproducible for this configuration, for kernels 4.15, 4.18, 5.0, and 5.3-rc5): [ 1253.183290] rcu: INFO: rcu_sched self-detected stall on CPU [ 1253.183319] rcu: 124-....: (5250 ticks this GP) idle=10a/1/0x4000000000000002 softirq=5408/5408 fqs=1941 [ 1256.287426] watchdog: BUG: soft lockup - CPU#105 stuck for 23s! [CPU 52/KVM:19709] [ 1264.075773] watchdog: BUG: soft lockup - CPU#24 stuck for 23s! [worker:19913] [ 1264.079769] watchdog: BUG: soft lockup - CPU#31 stuck for 23s! [worker:20331] [ 1264.095770] watchdog: BUG: soft lockup - CPU#45 stuck for 23s! [worker:20338] [ 1264.131773] watchdog: BUG: soft lockup - CPU#64 stuck for 23s! [avocado:19525] [ 1280.408480] watchdog: BUG: soft lockup - CPU#124 stuck for 22s! [ksmd:791] [ 1316.198012] rcu: INFO: rcu_sched self-detected stall on CPU [ 1316.198032] rcu: 124-....: (21003 ticks this GP) idle=10a/1/0x4000000000000002 softirq=5408/5408 fqs=8243 [ 1340.411024] watchdog: BUG: soft lockup - CPU#124 stuck for 22s! [ksmd:791] [ 1379.212609] rcu: INFO: rcu_sched self-detected stall on CPU [ 1379.212629] rcu: 124-....: (36756 ticks this GP) idle=10a/1/0x4000000000000002 softirq=5408/5408 fqs=14714 [ 1404.413615] watchdog: BUG: soft lockup - CPU#124 stuck for 22s! 
[ksmd:791] [ 1442.227095] rcu: INFO: rcu_sched self-detected stall on CPU [ 1442.227115] rcu: 124-....: (52509 ticks this GP) idle=10a/1/0x4000000000000002 softirq=5408/5408 fqs=21403 [ 1455.111787] INFO: task worker:19907 blocked for more than 120 seconds. [ 1455.111822] Tainted: G L 5.3.0-rc5-mdr-vanilla+ #1 [ 1455.111833] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1455.111884] INFO: task worker:19908 blocked for more than 120 seconds. [ 1455.111905] Tainted: G L 5.3.0-rc5-mdr-vanilla+ #1 [ 1455.111925] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1455.111966] INFO: task worker:20328 blocked for more than 120 seconds. [ 1455.111986] Tainted: G L 5.3.0-rc5-mdr-vanilla+ #1 [ 1455.111998] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1455.112048] INFO: task worker:20330 blocked for more than 120 seconds. [ 1455.112068] Tainted: G L 5.3.0-rc5-mdr-vanilla+ #1 [ 1455.112097] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1455.112138] INFO: task worker:20332 blocked for more than 120 seconds. [ 1455.112159] Tainted: G L 5.3.0-rc5-mdr-vanilla+ #1 [ 1455.112179] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1455.112210] INFO: task worker:20333 blocked for more than 120 seconds. [ 1455.112231] Tainted: G L 5.3.0-rc5-mdr-vanilla+ #1 [ 1455.112242] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1455.112282] INFO: task worker:20335 blocked for more than 120 seconds. [ 1455.112303] Tainted: G L 5.3.0-rc5-mdr-vanilla+ #1 [ 1455.112332] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1455.112372] INFO: task worker:20336 blocked for more than 120 seconds. [ 1455.112392] Tainted: G L 5.3.0-rc5-mdr-vanilla+ #1 CPUs 45, 24, and 124 are stuck on spin locks, likely held by CPUs 105 and 31. CPUs 105 and 31 are stuck in smp_call_function_many(), waiting on target CPU 42. 
For instance: # CPU 105 registers (via xmon) R00 = c00000000020b20c R16 = 00007d1bcd800000 R01 = c00000363eaa7970 R17 = 0000000000000001 R02 = c0000000019b3a00 R18 = 000000000000006b R03 = 000000000000002a R19 = 00007d537d7aecf0 R04 = 000000000000002a R20 = 60000000000000e0 R05 = 000000000000002a R21 = 0801000000000080 R06 = c0002073fb0caa08 R22 = 0000000000000d60 R07 = c0000000019ddd78 R23 = 0000000000000001 R08 = 000000000000002a R24 = c00000000147a700 R09 = 0000000000000001 R25 = c0002073fb0ca908 R10 = c000008ffeb4e660 R26 = 0000000000000000 R11 = c0002073fb0ca900 R27 = c0000000019e2464 R12 = c000000000050790 R28 = c0000000000812b0 R13 = c000207fff623e00 R29 = c0002073fb0ca808 R14 = 00007d1bbee00000 R30 = c0002073fb0ca800 R15 = 00007d1bcd600000 R31 = 0000000000000800 pc = c00000000020b260 smp_call_function_many+0x3d0/0x460 cfar= c00000000020b270 smp_call_function_many+0x3e0/0x460 lr = c00000000020b20c smp_call_function_many+0x37c/0x460 msr = 900000010288b033 cr = 44024824 ctr = c000000000050790 xer = 0000000000000000 trap = 100 CPU 42 is running normally, doing VCPU work: # CPU 42 stack trace (via xmon) [link register ] c00800001be17188 kvmppc_book3s_radix_page_fault+0x90/0x2b0 [kvm_hv] [c000008ed3343820] c000008ed3343850 (unreliable) [c000008ed33438d0] c00800001be11b6c kvmppc_book3s_hv_page_fault+0x264/0xe30 [kvm_hv] [c000008ed33439d0] c00800001be0d7b4 kvmppc_vcpu_run_hv+0x8dc/0xb50 [kvm_hv] [c000008ed3343ae0] c00800001c10891c kvmppc_vcpu_run+0x34/0x48 [kvm] [c000008ed3343b00] c00800001c10475c kvm_arch_vcpu_ioctl_run+0x244/0x420 [kvm] [c000008ed3343b90] c00800001c0f5a78 kvm_vcpu_ioctl+0x470/0x7c8 [kvm] [c000008ed3343d00] c000000000475450 do_vfs_ioctl+0xe0/0xc70 [c000008ed3343db0] c0000000004760e4 ksys_ioctl+0x104/0x120 [c000008ed3343e00] c000000000476128 sys_ioctl+0x28/0x80 [c000008ed3343e20] c00000000000b388 system_call+0x5c/0x70 --- Exception: c00 (System Call) at 00007d545cfd7694 SP (7d53ff7edf50) is in userspace It was subsequently found that 
ipi_message[PPC_MSG_CALL_FUNCTION] was set for CPU 42 by at least 1 of the CPUs waiting in smp_call_function_many(), but somehow the corresponding call_single_queue entries were never processed by CPU 42, causing the callers to spin in csd_lock_wait() indefinitely. Nick Piggin suggested something similar to the following sequence as a possible explanation (interleaving of CALL_FUNCTION/RESCHEDULE IPI messages seems to be most common, but any mix of CALL_FUNCTION and !CALL_FUNCTION messages could trigger it): CPU X: smp_muxed_ipi_set_message(): X: smp_mb() X: message[RESCHEDULE] = 1 X: doorbell_global_ipi(42): X: kvmppc_set_host_ipi(42, 1) X: ppc_msgsnd_sync()/smp_mb() X: ppc_msgsnd() -> 42 42: doorbell_exception(): // from CPU X 42: ppc_msgsync() 105: smp_muxed_ipi_set_message(): 105: smp_mb() // STORE DEFERRED DUE TO RE-ORDERING --105: message[CALL_FUNCTION] = 1 | 105: doorbell_global_ipi(42): | 105: kvmppc_set_host_ipi(42, 1) | 42: kvmppc_set_host_ipi(42, 0) | 42: smp_ipi_demux_relaxed() | 42: // returns to executing guest | // RE-ORDERED STORE COMPLETES ->105: message[CALL_FUNCTION] = 1 105: ppc_msgsnd_sync()/smp_mb() 105: ppc_msgsnd() -> 42 42: local_paca->kvm_hstate.host_ipi == 0 // IPI ignored 105: // hangs waiting on 42 to process messages/call_single_queue This can be prevented with an smp_mb() at the beginning of kvmppc_set_host_ipi(), such that stores to message[<type>] (or other state indicated by the host_ipi flag) are ordered vs. the store to host_ipi. 
However, doing so might still allow for the following scenario (not yet observed): CPU X: smp_muxed_ipi_set_message(): X: smp_mb() X: message[RESCHEDULE] = 1 X: doorbell_global_ipi(42): X: kvmppc_set_host_ipi(42, 1) X: ppc_msgsnd_sync()/smp_mb() X: ppc_msgsnd() -> 42 42: doorbell_exception(): // from CPU X 42: ppc_msgsync() // STORE DEFERRED DUE TO RE-ORDERING -- 42: kvmppc_set_host_ipi(42, 0) | 42: smp_ipi_demux_relaxed() | 105: smp_muxed_ipi_set_message(): | 105: smp_mb() | 105: message[CALL_FUNCTION] = 1 | 105: doorbell_global_ipi(42): | 105: kvmppc_set_host_ipi(42, 1) | // RE-ORDERED STORE COMPLETES -> 42: kvmppc_set_host_ipi(42, 0) 42: // returns to executing guest 105: ppc_msgsnd_sync()/smp_mb() 105: ppc_msgsnd() -> 42 42: local_paca->kvm_hstate.host_ipi == 0 // IPI ignored 105: // hangs waiting on 42 to process messages/call_single_queue Fixing this scenario would require an smp_mb() *after* clearing the host_ipi flag in kvmppc_set_host_ipi() to order the store vs. subsequent processing of IPI messages. To handle both cases, this patch splits kvmppc_set_host_ipi() into separate set/clear functions, where we execute smp_mb() prior to setting the host_ipi flag, and after clearing the host_ipi flag. These functions pair with each other to synchronize the sender and receiver sides. With that change in place the above workload ran for 20 hours without triggering any lock-ups. Fixes: 755563b ("powerpc/powernv: Fixes for hypervisor doorbell handling") # v4.0 Signed-off-by: Michael Roth <[email protected]> Acked-by: Paul Mackerras <[email protected]> Signed-off-by: Michael Ellerman <[email protected]> Link: https://lore.kernel.org/r/[email protected]
1 parent cbd1899 commit 3a83f67

File tree

6 files changed

+108
-14
lines changed

6 files changed

+108
-14
lines changed

arch/powerpc/include/asm/kvm_ppc.h

Lines changed: 97 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -452,9 +452,100 @@ static inline u32 kvmppc_get_xics_latch(void)
452452
return xirr;
453453
}
454454

455-
static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
455+
/*
456+
* To avoid the need to unnecessarily exit fully to the host kernel, an IPI to
457+
* a CPU thread that's running/napping inside of a guest is by default regarded
458+
* as a request to wake the CPU (if needed) and continue execution within the
459+
* guest, potentially to process new state like externally-generated
460+
* interrupts or IPIs sent from within the guest itself (e.g. H_PROD/H_IPI).
461+
*
462+
* To force an exit to the host kernel, kvmppc_set_host_ipi() must be called
463+
* prior to issuing the IPI to set the corresponding 'host_ipi' flag in the
464+
* target CPU's PACA. To avoid unnecessary exits to the host, this flag should
465+
* be immediately cleared via kvmppc_clear_host_ipi() by the IPI handler on
466+
* the receiving side prior to processing the IPI work.
467+
*
468+
* NOTE:
469+
*
470+
* We currently issue an smp_mb() at the beginning of kvmppc_set_host_ipi().
471+
* This is to guard against sequences such as the following:
472+
*
473+
* CPU
474+
* X: smp_muxed_ipi_set_message():
475+
* X: smp_mb()
476+
* X: message[RESCHEDULE] = 1
477+
* X: doorbell_global_ipi(42):
478+
* X: kvmppc_set_host_ipi(42)
479+
* X: ppc_msgsnd_sync()/smp_mb()
480+
* X: ppc_msgsnd() -> 42
481+
* 42: doorbell_exception(): // from CPU X
482+
* 42: ppc_msgsync()
483+
* 105: smp_muxed_ipi_set_message():
484+
* 105: smp_mb()
485+
* // STORE DEFERRED DUE TO RE-ORDERING
486+
* --105: message[CALL_FUNCTION] = 1
487+
* | 105: doorbell_global_ipi(42):
488+
* | 105: kvmppc_set_host_ipi(42)
489+
* | 42: kvmppc_clear_host_ipi(42)
490+
* | 42: smp_ipi_demux_relaxed()
491+
* | 42: // returns to executing guest
492+
* | // RE-ORDERED STORE COMPLETES
493+
* ->105: message[CALL_FUNCTION] = 1
494+
* 105: ppc_msgsnd_sync()/smp_mb()
495+
* 105: ppc_msgsnd() -> 42
496+
* 42: local_paca->kvm_hstate.host_ipi == 0 // IPI ignored
497+
* 105: // hangs waiting on 42 to process messages/call_single_queue
498+
*
499+
* We also issue an smp_mb() at the end of kvmppc_clear_host_ipi(). This is
500+
* to guard against sequences such as the following (as well as to create
501+
* a read-side pairing with the barrier in kvmppc_set_host_ipi()):
502+
*
503+
* CPU
504+
* X: smp_muxed_ipi_set_message():
505+
* X: smp_mb()
506+
* X: message[RESCHEDULE] = 1
507+
* X: doorbell_global_ipi(42):
508+
* X: kvmppc_set_host_ipi(42)
509+
* X: ppc_msgsnd_sync()/smp_mb()
510+
* X: ppc_msgsnd() -> 42
511+
* 42: doorbell_exception(): // from CPU X
512+
* 42: ppc_msgsync()
513+
* // STORE DEFERRED DUE TO RE-ORDERING
514+
* -- 42: kvmppc_clear_host_ipi(42)
515+
* | 42: smp_ipi_demux_relaxed()
516+
* | 105: smp_muxed_ipi_set_message():
517+
* | 105: smp_mb()
518+
* | 105: message[CALL_FUNCTION] = 1
519+
* | 105: doorbell_global_ipi(42):
520+
* | 105: kvmppc_set_host_ipi(42)
521+
* | // RE-ORDERED STORE COMPLETES
522+
* -> 42: kvmppc_clear_host_ipi(42)
523+
* 42: // returns to executing guest
524+
* 105: ppc_msgsnd_sync()/smp_mb()
525+
* 105: ppc_msgsnd() -> 42
526+
* 42: local_paca->kvm_hstate.host_ipi == 0 // IPI ignored
527+
* 105: // hangs waiting on 42 to process messages/call_single_queue
528+
*/
529+
static inline void kvmppc_set_host_ipi(int cpu)
456530
{
457-
paca_ptrs[cpu]->kvm_hstate.host_ipi = host_ipi;
531+
/*
532+
* order stores of IPI messages vs. setting of host_ipi flag
533+
*
534+
* pairs with the barrier in kvmppc_clear_host_ipi()
535+
*/
536+
smp_mb();
537+
paca_ptrs[cpu]->kvm_hstate.host_ipi = 1;
538+
}
539+
540+
static inline void kvmppc_clear_host_ipi(int cpu)
541+
{
542+
paca_ptrs[cpu]->kvm_hstate.host_ipi = 0;
543+
/*
544+
* order clearing of host_ipi flag vs. processing of IPI messages
545+
*
546+
* pairs with the barrier in kvmppc_set_host_ipi()
547+
*/
548+
smp_mb();
458549
}
459550

460551
static inline void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu)
@@ -486,7 +577,10 @@ static inline u32 kvmppc_get_xics_latch(void)
486577
return 0;
487578
}
488579

489-
static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
580+
static inline void kvmppc_set_host_ipi(int cpu)
581+
{}
582+
583+
static inline void kvmppc_clear_host_ipi(int cpu)
490584
{}
491585

492586
static inline void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu)

arch/powerpc/kernel/dbell.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ void doorbell_global_ipi(int cpu)
3333
{
3434
u32 tag = get_hard_smp_processor_id(cpu);
3535

36-
kvmppc_set_host_ipi(cpu, 1);
36+
kvmppc_set_host_ipi(cpu);
3737
/* Order previous accesses vs. msgsnd, which is treated as a store */
3838
ppc_msgsnd_sync();
3939
ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag);
@@ -48,7 +48,7 @@ void doorbell_core_ipi(int cpu)
4848
{
4949
u32 tag = cpu_thread_in_core(cpu);
5050

51-
kvmppc_set_host_ipi(cpu, 1);
51+
kvmppc_set_host_ipi(cpu);
5252
/* Order previous accesses vs. msgsnd, which is treated as a store */
5353
ppc_msgsnd_sync();
5454
ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag);
@@ -84,7 +84,7 @@ void doorbell_exception(struct pt_regs *regs)
8484

8585
may_hard_irq_enable();
8686

87-
kvmppc_set_host_ipi(smp_processor_id(), 0);
87+
kvmppc_clear_host_ipi(smp_processor_id());
8888
__this_cpu_inc(irq_stat.doorbell_irqs);
8989

9090
smp_ipi_demux_relaxed(); /* already performed the barrier */

arch/powerpc/kvm/book3s_hv_rm_xics.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu)
5858
hcpu = hcore << threads_shift;
5959
kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu;
6060
smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION);
61-
kvmppc_set_host_ipi(hcpu, 1);
61+
kvmppc_set_host_ipi(hcpu);
6262
smp_mb();
6363
kvmhv_rm_send_ipi(hcpu);
6464
}

arch/powerpc/platforms/powernv/smp.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ static void pnv_smp_cpu_kill_self(void)
193193
* for coming online, which are handled via
194194
* generic_check_cpu_restart() calls.
195195
*/
196-
kvmppc_set_host_ipi(cpu, 0);
196+
kvmppc_clear_host_ipi(cpu);
197197

198198
srr1 = pnv_cpu_offline(cpu);
199199

arch/powerpc/sysdev/xics/icp-native.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ static unsigned int icp_native_get_irq(void)
140140

141141
static void icp_native_cause_ipi(int cpu)
142142
{
143-
kvmppc_set_host_ipi(cpu, 1);
143+
kvmppc_set_host_ipi(cpu);
144144
icp_native_set_qirr(cpu, IPI_PRIORITY);
145145
}
146146

@@ -179,7 +179,7 @@ void icp_native_flush_interrupt(void)
179179
if (vec == XICS_IPI) {
180180
/* Clear pending IPI */
181181
int cpu = smp_processor_id();
182-
kvmppc_set_host_ipi(cpu, 0);
182+
kvmppc_clear_host_ipi(cpu);
183183
icp_native_set_qirr(cpu, 0xff);
184184
} else {
185185
pr_err("XICS: hw interrupt 0x%x to offline cpu, disabling\n",
@@ -200,7 +200,7 @@ static irqreturn_t icp_native_ipi_action(int irq, void *dev_id)
200200
{
201201
int cpu = smp_processor_id();
202202

203-
kvmppc_set_host_ipi(cpu, 0);
203+
kvmppc_clear_host_ipi(cpu);
204204
icp_native_set_qirr(cpu, 0xff);
205205

206206
return smp_ipi_demux();

arch/powerpc/sysdev/xics/icp-opal.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -126,15 +126,15 @@ static void icp_opal_cause_ipi(int cpu)
126126
{
127127
int hw_cpu = get_hard_smp_processor_id(cpu);
128128

129-
kvmppc_set_host_ipi(cpu, 1);
129+
kvmppc_set_host_ipi(cpu);
130130
opal_int_set_mfrr(hw_cpu, IPI_PRIORITY);
131131
}
132132

133133
static irqreturn_t icp_opal_ipi_action(int irq, void *dev_id)
134134
{
135135
int cpu = smp_processor_id();
136136

137-
kvmppc_set_host_ipi(cpu, 0);
137+
kvmppc_clear_host_ipi(cpu);
138138
opal_int_set_mfrr(get_hard_smp_processor_id(cpu), 0xff);
139139

140140
return smp_ipi_demux();
@@ -157,7 +157,7 @@ void icp_opal_flush_interrupt(void)
157157
if (vec == XICS_IPI) {
158158
/* Clear pending IPI */
159159
int cpu = smp_processor_id();
160-
kvmppc_set_host_ipi(cpu, 0);
160+
kvmppc_clear_host_ipi(cpu);
161161
opal_int_set_mfrr(get_hard_smp_processor_id(cpu), 0xff);
162162
} else {
163163
pr_err("XICS: hw interrupt 0x%x to offline cpu, "

0 commit comments

Comments
 (0)