diff --git a/arch/arc/include/swap_macros.h b/arch/arc/include/swap_macros.h
index 2328ce31347a..adb3b8cfbdee 100644
--- a/arch/arc/include/swap_macros.h
+++ b/arch/arc/include/swap_macros.h
@@ -416,11 +416,11 @@ fpu_skip_load :
 .macro _store_old_thread_callee_regs
 
 	_save_callee_saved_regs
-	/* Save old thread into switch handle which is required by wait_for_switch.
+	/* Save old thread into switch handle which is required by z_sched_switch_spin.
 	 * NOTE: we shouldn't save anything related to old thread context after this point!
 	 * TODO: we should add SMP write-after-write data memory barrier here, as we want all
 	 * previous writes completed before setting switch_handle which is polled by other cores
-	 * in wait_for_switch in case of SMP. Though it's not likely that this issue
+	 * in z_sched_switch_spin in case of SMP. Though it's not likely that this issue
 	 * will reproduce in real world as there is some gap before reading switch_handle and
 	 * reading rest of the data we've stored before.
 	 */
diff --git a/arch/arm64/core/switch.S b/arch/arm64/core/switch.S
index cb721cec9e9c..142103dee2f9 100644
--- a/arch/arm64/core/switch.S
+++ b/arch/arm64/core/switch.S
@@ -75,7 +75,7 @@ SECTION_FUNC(TEXT, z_arm64_context_switch)
 #endif
 
 	/* save old thread into switch handle which is required by
-	 * wait_for_switch
+	 * z_sched_switch_spin()
 	 */
 	str	x1, [x1, #___thread_t_switch_handle_OFFSET]
 
diff --git a/kernel/include/kswap.h b/kernel/include/kswap.h
index 767e97908939..fa1e538b8e81 100644
--- a/kernel/include/kswap.h
+++ b/kernel/include/kswap.h
@@ -8,6 +8,7 @@
 
 #include <ksched.h>
 #include <zephyr/spinlock.h>
+#include <zephyr/sys/barrier.h>
 #include <kernel_arch_func.h>
 
 #ifdef CONFIG_STACK_SENTINEL
@@ -30,23 +31,38 @@ void z_smp_release_global_lock(struct k_thread *thread);
 /* context switching and scheduling-related routines */
 #ifdef CONFIG_USE_SWITCH
 
-/* There is an unavoidable SMP race when threads swap -- their thread
- * record is in the queue (and visible to other CPUs) before
- * arch_switch() finishes saving state. We must spin for the switch
- * handle before entering a new thread. See docs on arch_switch().
+/* Spin, with the scheduler lock held (!), on a thread that is known
+ * (!!) to have released the lock and be on a path where it will
+ * deterministically (!!!) reach arch_switch() in very small constant
+ * time.
  *
- * Note: future SMP architectures may need a fence/barrier or cache
- * invalidation here. Current ones don't, and sadly Zephyr doesn't
- * have a framework for that yet.
+ * This exists to treat an unavoidable SMP race when threads swap --
+ * their thread record is in the queue (and visible to other CPUs)
+ * before arch_switch() finishes saving state. We must spin for the
+ * switch handle before entering a new thread. See docs on
+ * arch_switch().
+ *
+ * Stated differently: there's a chicken and egg bug with the question
+ * of "is a thread running or not?". The thread needs to mark itself
+ * "not running" from its own context, but at that moment it obviously
+ * is still running until it reaches arch_switch()! Locking can't
+ * treat this because the scheduler lock can't be released by the
+ * switched-to thread, which is going to (obviously) be running its
+ * own code and doesn't know it was switched out.
  */
-static inline void wait_for_switch(struct k_thread *thread)
+static inline void z_sched_switch_spin(struct k_thread *thread)
 {
 #ifdef CONFIG_SMP
 	volatile void **shp = (void *)&thread->switch_handle;
 
 	while (*shp == NULL) {
-		k_busy_wait(1);
+		arch_spin_relax();
 	}
+	/* Read barrier: don't allow any subsequent loads in the
+	 * calling code to reorder before we saw switch_handle go
+	 * non-null.
+	 */
+	barrier_dmem_fence_full();
 #endif
 }
 
@@ -117,7 +133,7 @@ static ALWAYS_INLINE unsigned int do_swap(unsigned int key,
 		}
 #endif
 		z_thread_mark_switched_out();
-		wait_for_switch(new_thread);
+		z_sched_switch_spin(new_thread);
 		_current_cpu->current = new_thread;
 
 #ifdef CONFIG_TIMESLICING
@@ -131,18 +147,21 @@ static ALWAYS_INLINE unsigned int do_swap(unsigned int key,
 		arch_cohere_stacks(old_thread, NULL, new_thread);
 
 #ifdef CONFIG_SMP
-		/* Add _current back to the run queue HERE. After
-		 * wait_for_switch() we are guaranteed to reach the
-		 * context switch in finite time, avoiding a potential
-		 * deadlock.
+		/* Now add _current back to the run queue, once we are
+		 * guaranteed to reach the context switch in finite
+		 * time. See z_sched_switch_spin().
 		 */
 		z_requeue_current(old_thread);
 #endif
 		void *newsh = new_thread->switch_handle;
 
 		if (IS_ENABLED(CONFIG_SMP)) {
-			/* Active threads MUST have a null here */
+			/* Active threads must have a null here. And
+			 * it must be seen before the scheduler lock
+			 * is released!
+			 */
 			new_thread->switch_handle = NULL;
+			barrier_dmem_fence_full(); /* write barrier */
 		}
 		k_spin_release(&sched_spinlock);
 		arch_switch(newsh, &old_thread->switch_handle);
@@ -178,6 +197,11 @@ static inline void z_swap_unlocked(void)
 
 extern int arch_swap(unsigned int key);
 
+static inline void z_sched_switch_spin(struct k_thread *thread)
+{
+	ARG_UNUSED(thread);
+}
+
 static inline int z_swap_irqlock(unsigned int key)
 {
 	int ret;
diff --git a/kernel/sched.c b/kernel/sched.c
index a13f94bf3aea..2498a0727db4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1095,7 +1095,7 @@ void *z_get_next_switch_handle(void *interrupted)
 
 		if (old_thread != new_thread) {
 			update_metairq_preempt(new_thread);
-			wait_for_switch(new_thread);
+			z_sched_switch_spin(new_thread);
 			arch_cohere_stacks(old_thread, interrupted, new_thread);
 
 			_current_cpu->swap_ok = 0;
@@ -1771,6 +1771,13 @@ void z_thread_abort(struct k_thread *thread)
 			k_spin_unlock(&sched_spinlock, key);
 			while (is_aborting(thread)) {
 			}
+
+			/* Now we know it's dying, but not necessarily
+			 * dead. Wait for the switch to happen!
+			 */
+			key = k_spin_lock(&sched_spinlock);
+			z_sched_switch_spin(thread);
+			k_spin_unlock(&sched_spinlock, key);
 		} else if (active) {
 			/* Threads can join */
 			add_to_waitq_locked(_current, &thread->join_queue);
@@ -1806,6 +1813,7 @@ int z_impl_k_thread_join(struct k_thread *thread, k_timeout_t timeout)
 	SYS_PORT_TRACING_OBJ_FUNC_ENTER(k_thread, join, thread, timeout);
 
 	if ((thread->base.thread_state & _THREAD_DEAD) != 0U) {
+		z_sched_switch_spin(thread);
 		ret = 0;
 	} else if (K_TIMEOUT_EQ(timeout, K_NO_WAIT)) {
 		ret = -EBUSY;
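
Illustration only, not part of the patch above: a minimal standalone C11/pthreads sketch of the
publish/spin handshake that z_sched_switch_spin() relies on. The names (outgoing_thread,
saved_context) are invented for the example, and C11 release/acquire atomics stand in for the
kernel's write barrier before k_spin_release() and the read barrier after the spin loop; it is a
model of the ordering argument, not Zephyr code.

/* Sketch: the "outgoing" side saves its state first and publishes its
 * switch handle last (release store); the "incoming" side spins until
 * the handle appears (acquire load) and only then trusts the saved
 * state, mirroring z_sched_switch_spin() + arch_switch().
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static _Atomic(void *) switch_handle;   /* stand-in for thread->switch_handle */
static int saved_context;               /* stand-in for the context saved by arch_switch() */

static void *outgoing_thread(void *arg)
{
	(void)arg;
	saved_context = 42;                              /* "arch_switch() saves state" */
	atomic_store_explicit(&switch_handle, &saved_context,
			      memory_order_release);     /* publish the handle last */
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, outgoing_thread, NULL);

	/* z_sched_switch_spin() analogue: spin until the handle goes
	 * non-NULL; the acquire load keeps later reads from being
	 * reordered before it.
	 */
	while (atomic_load_explicit(&switch_handle, memory_order_acquire) == NULL) {
		/* arch_spin_relax() analogue: just keep polling */
	}
	printf("saved context = %d\n", saved_context);   /* guaranteed to print 42 */

	pthread_join(t, NULL);
	return 0;
}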