kernel/sched: Fix SMP must-wait-for-switch conditions in abort/join #58334

Merged: 3 commits, May 26, 2023
4 changes: 2 additions & 2 deletions arch/arc/include/swap_macros.h
@@ -416,11 +416,11 @@ fpu_skip_load :
.macro _store_old_thread_callee_regs

_save_callee_saved_regs
- /* Save old thread into switch handle which is required by wait_for_switch.
+ /* Save old thread into switch handle which is required by z_sched_switch_spin.
* NOTE: we shouldn't save anything related to old thread context after this point!
* TODO: we should add SMP write-after-write data memory barrier here, as we want all
* previous writes completed before setting switch_handle which is polled by other cores
- * in wait_for_switch in case of SMP. Though it's not likely that this issue
+ * in z_sched_switch_spin in case of SMP. Though it's not likely that this issue
* will reproduce in real world as there is some gap before reading switch_handle and
* reading rest of the data we've stored before.
*/
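As an aside on the TODO in the hunk above, the ordering it asks for (complete all old-thread context stores before publishing switch_handle) can be modeled in portable C11. This is an illustrative sketch only, not Zephyr code; struct model_thread and model_store_old_thread are made-up names, and the release fence stands in for an architecture store barrier:

#include <stdatomic.h>

/* Illustrative model: "save everything, then a write barrier,
 * then publish the handle other cores poll."
 */
struct model_thread {
	unsigned long callee_saved[16];   /* stand-in for the saved registers */
	void *_Atomic switch_handle;      /* polled by the spinning core      */
};

static void model_store_old_thread(struct model_thread *t,
				   const unsigned long *regs)
{
	for (int i = 0; i < 16; i++) {
		t->callee_saved[i] = regs[i];   /* old-thread context first */
	}

	/* The write-after-write barrier the TODO describes: the saved
	 * state must be visible before the handle becomes non-NULL.
	 */
	atomic_thread_fence(memory_order_release);

	atomic_store_explicit(&t->switch_handle, (void *)t,
			      memory_order_relaxed);
}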
2 changes: 1 addition & 1 deletion arch/arm64/core/switch.S
@@ -75,7 +75,7 @@ SECTION_FUNC(TEXT, z_arm64_context_switch)
#endif

/* save old thread into switch handle which is required by
- * wait_for_switch
+ * z_sched_switch_spin()
*/
str x1, [x1, #___thread_t_switch_handle_OFFSET]

54 changes: 39 additions & 15 deletions kernel/include/kswap.h
@@ -8,6 +8,7 @@

#include <ksched.h>
#include <zephyr/spinlock.h>
+ #include <zephyr/sys/barrier.h>
#include <kernel_arch_func.h>

#ifdef CONFIG_STACK_SENTINEL
@@ -30,23 +31,38 @@ void z_smp_release_global_lock(struct k_thread *thread);
/* context switching and scheduling-related routines */
#ifdef CONFIG_USE_SWITCH

- /* There is an unavoidable SMP race when threads swap -- their thread
- * record is in the queue (and visible to other CPUs) before
- * arch_switch() finishes saving state. We must spin for the switch
- * handle before entering a new thread. See docs on arch_switch().
- *
- * Note: future SMP architectures may need a fence/barrier or cache
- * invalidation here. Current ones don't, and sadly Zephyr doesn't
- * have a framework for that yet.
+ /* Spin, with the scheduler lock held (!), on a thread that is known
+ * (!!) to have released the lock and be on a path where it will
+ * deterministically (!!!) reach arch_switch() in very small constant
+ * time.
+ *
+ * This exists to treat an unavoidable SMP race when threads swap --
+ * their thread record is in the queue (and visible to other CPUs)
+ * before arch_switch() finishes saving state. We must spin for the
+ * switch handle before entering a new thread. See docs on
+ * arch_switch().
+ *
+ * Stated differently: there's a chicken and egg bug with the question
+ * of "is a thread running or not?". The thread needs to mark itself
+ * "not running" from its own context, but at that moment it obviously
+ * is still running until it reaches arch_switch()! Locking can't
+ * treat this because the scheduler lock can't be released by the
+ * switched-to thread, which is going to (obviously) be running its
+ * own code and doesn't know it was switched out.
 */
- static inline void wait_for_switch(struct k_thread *thread)
+ static inline void z_sched_switch_spin(struct k_thread *thread)
{
#ifdef CONFIG_SMP
volatile void **shp = (void *)&thread->switch_handle;

while (*shp == NULL) {
- k_busy_wait(1);
+ arch_spin_relax();
}
+ /* Read barrier: don't allow any subsequent loads in the
+ * calling code to reorder before we saw switch_handle go
+ * non-null.
+ */
+ barrier_dmem_fence_full();
#endif
}
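For readers unfamiliar with the pattern, the consumer side above can be modeled in plain C11. This is an illustrative sketch only, not the kernel's API: struct model_thread and model_switch_spin are made-up names, and atomic_thread_fence(memory_order_seq_cst) stands in for barrier_dmem_fence_full():

#include <stdatomic.h>
#include <stddef.h>

struct model_thread {
	unsigned long callee_saved[16];
	void *_Atomic switch_handle;
};

/* Model of z_sched_switch_spin(): wait for the outgoing CPU to publish
 * the handle, then fence so later reads of the saved context cannot be
 * satisfied with stale, pre-switch values.
 */
static void model_switch_spin(struct model_thread *t)
{
	while (atomic_load_explicit(&t->switch_handle,
				    memory_order_relaxed) == NULL) {
		/* spin; a real port would use an arch relax hint here */
	}

	atomic_thread_fence(memory_order_seq_cst);

	/* Only past this point is it safe to read t->callee_saved[]
	 * and resume the thread.
	 */
}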

@@ -117,7 +133,7 @@ static ALWAYS_INLINE unsigned int do_swap(unsigned int key,
}
#endif
z_thread_mark_switched_out();
- wait_for_switch(new_thread);
+ z_sched_switch_spin(new_thread);
_current_cpu->current = new_thread;

#ifdef CONFIG_TIMESLICING
@@ -131,18 +147,21 @@ static ALWAYS_INLINE unsigned int do_swap(unsigned int key,
arch_cohere_stacks(old_thread, NULL, new_thread);

#ifdef CONFIG_SMP
- /* Add _current back to the run queue HERE. After
- * wait_for_switch() we are guaranteed to reach the
- * context switch in finite time, avoiding a potential
- * deadlock.
+ /* Now add _current back to the run queue, once we are
+ * guaranteed to reach the context switch in finite
+ * time. See z_sched_switch_spin().
*/
z_requeue_current(old_thread);
#endif
void *newsh = new_thread->switch_handle;

if (IS_ENABLED(CONFIG_SMP)) {
- /* Active threads MUST have a null here */
+ /* Active threads must have a null here. And
+ * it must be seen before the scheduler lock
+ * is released!
+ */
new_thread->switch_handle = NULL;
+ barrier_dmem_fence_full(); /* write barrier */
Review comment from the Collaborator Author on the line above:
Hilariously, after I was the one who said we should just merge the full barrier API first for simplicity and worry about read/write (ARM, traditional) and/or acquire/release/consume (C++ terms, sigh) complexity later...

...I hit a spot where there's a clear use for finer barrier control within mere days. Still don't think it's wrong to have the conservative API though.
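For illustration, the finer-grained form alluded to here is the usual release/acquire pairing, sketched below in C11 terms. This is a hypothetical sketch, not the Zephyr barrier API; model_switch_handle, model_publish, and model_spin are invented names:

#include <stdatomic.h>
#include <stddef.h>

static void *_Atomic model_switch_handle;

/* Writer: last step of the context-save path. */
static void model_publish(void *handle)
{
	atomic_store_explicit(&model_switch_handle, handle,
			      memory_order_release);
}

/* Reader: the spin in z_sched_switch_spin(), with the acquire folded
 * into the load instead of a separate full fence after the loop.
 */
static void *model_spin(void)
{
	void *h;

	while ((h = atomic_load_explicit(&model_switch_handle,
					 memory_order_acquire)) == NULL) {
		/* spin */
	}
	return h;
}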

}
k_spin_release(&sched_spinlock);
arch_switch(newsh, &old_thread->switch_handle);
@@ -178,6 +197,11 @@ static inline void z_swap_unlocked(void)

extern int arch_swap(unsigned int key);

+ static inline void z_sched_switch_spin(struct k_thread *thread)
+ {
+ ARG_UNUSED(thread);
+ }

static inline int z_swap_irqlock(unsigned int key)
{
int ret;
10 changes: 9 additions & 1 deletion kernel/sched.c
@@ -1095,7 +1095,7 @@ void *z_get_next_switch_handle(void *interrupted)

if (old_thread != new_thread) {
update_metairq_preempt(new_thread);
- wait_for_switch(new_thread);
+ z_sched_switch_spin(new_thread);
arch_cohere_stacks(old_thread, interrupted, new_thread);

_current_cpu->swap_ok = 0;
@@ -1771,6 +1771,13 @@ void z_thread_abort(struct k_thread *thread)
k_spin_unlock(&sched_spinlock, key);
while (is_aborting(thread)) {
}

+ /* Now we know it's dying, but not necessarily
+ * dead. Wait for the switch to happen!
+ */
+ key = k_spin_lock(&sched_spinlock);
+ z_sched_switch_spin(thread);
+ k_spin_unlock(&sched_spinlock, key);
} else if (active) {
/* Threads can join */
add_to_waitq_locked(_current, &thread->join_queue);
@@ -1806,6 +1813,7 @@ int z_impl_k_thread_join(struct k_thread *thread, k_timeout_t timeout)
SYS_PORT_TRACING_OBJ_FUNC_ENTER(k_thread, join, thread, timeout);

if ((thread->base.thread_state & _THREAD_DEAD) != 0U) {
+ z_sched_switch_spin(thread);
ret = 0;
} else if (K_TIMEOUT_EQ(timeout, K_NO_WAIT)) {
ret = -EBUSY;
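Taken together, the abort/join changes above target the common "kill and recycle" pattern on SMP: once k_thread_abort() or k_thread_join() returns, the victim is intended to have fully passed through arch_switch(), so its stack and thread object can be reused. A minimal application-level sketch, with illustrative names, sizes, and priority (assumes an SMP-enabled Zephyr application):

#include <zephyr/kernel.h>

#define WORKER_STACK_SIZE 1024

K_THREAD_STACK_DEFINE(worker_stack, WORKER_STACK_SIZE);
static struct k_thread worker_thread;

static void worker_entry(void *a, void *b, void *c)
{
	ARG_UNUSED(a);
	ARG_UNUSED(b);
	ARG_UNUSED(c);

	while (1) {
		k_sleep(K_MSEC(10));
	}
}

void restart_worker(void)
{
	/* Abort the worker, possibly while it runs on another CPU... */
	k_thread_abort(&worker_thread);

	/* ...and wait until it is truly off-CPU. With the added
	 * z_sched_switch_spin() calls, reaching this point means the old
	 * context has finished arch_switch(), so reusing the same stack
	 * and thread object immediately is safe.
	 */
	k_thread_join(&worker_thread, K_FOREVER);

	k_thread_create(&worker_thread, worker_stack,
			K_THREAD_STACK_SIZEOF(worker_stack),
			worker_entry, NULL, NULL, NULL,
			K_PRIO_PREEMPT(5), 0, K_NO_WAIT);
}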