Commit b89e427

andyross authored and jgl-meta committed
kernel/sched: Rename/redocument wait_for_switch() -> z_sched_switch_spin()
This trick turns out also to be needed by the abort/join code. Promote it to a more formal-looking internal API and clean up the documentation to (hopefully) clarify the exact behavior and better explain the need.

This is one of the more... enchanted bits of the scheduler, and while the trick is IMHO pretty clean, it remains a big SMP footgun.

Signed-off-by: Andy Ross <[email protected]>
1 parent d8d119f commit b89e427
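
The abort/join need mentioned in the message can be made concrete with a rough, hypothetical sketch: before one CPU tears down a thread that may be mid-switch on another CPU, it must spin until that thread's context save has completed. Everything here except z_sched_switch_spin() itself (the lock name, halt_and_reclaim(), and the overall shape) is a placeholder, not the actual Zephyr abort path.

#include <zephyr/kernel.h>
#include <kswap.h>   /* kernel-internal header that declares z_sched_switch_spin() */

/* Assumed name for the kernel's internal scheduler lock. */
extern struct k_spinlock sched_spinlock;

/* Placeholder for the real teardown work done by the abort path. */
static void halt_and_reclaim(struct k_thread *thread)
{
	ARG_UNUSED(thread);
	/* ...release the thread's resources, wake joiners, etc... */
}

void thread_abort_sketch(struct k_thread *thread)
{
	k_spinlock_key_t key = k_spin_lock(&sched_spinlock);

	/* The victim may already be visible as "not current" to this CPU
	 * while its registers are still being saved elsewhere.  Per the
	 * contract documented in kswap.h, spin (with the scheduler lock
	 * held) until arch_switch() publishes thread->switch_handle before
	 * touching its state.
	 */
	z_sched_switch_spin(thread);

	halt_and_reclaim(thread);

	k_spin_unlock(&sched_spinlock, key);
}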

4 files changed (+32, -14 lines)

arch/arc/include/swap_macros.h

Lines changed: 2 additions & 2 deletions
@@ -416,11 +416,11 @@ fpu_skip_load :
 .macro _store_old_thread_callee_regs
 
 	_save_callee_saved_regs
-	/* Save old thread into switch handle which is required by wait_for_switch.
+	/* Save old thread into switch handle which is required by z_sched_switch_spin.
 	 * NOTE: we shouldn't save anything related to old thread context after this point!
 	 * TODO: we should add SMP write-after-write data memory barrier here, as we want all
 	 * previous writes completed before setting switch_handle which is polled by other cores
-	 * in wait_for_switch in case of SMP. Though it's not likely that this issue
+	 * in z_sched_switch_spin in case of SMP. Though it's not likely that this issue
 	 * will reproduce in real world as there is some gap before reading switch_handle and
 	 * reading rest of the data we've stored before.
 	 */

arch/arm64/core/switch.S

Lines changed: 1 addition & 1 deletion
@@ -75,7 +75,7 @@ SECTION_FUNC(TEXT, z_arm64_context_switch)
 #endif
 
 	/* save old thread into switch handle which is required by
-	 * wait_for_switch
+	 * z_sched_switch_spin()
 	 */
 	str	x1, [x1, #___thread_t_switch_handle_OFFSET]
 
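
Both architecture diffs touch the same point in the context-switch path: the outgoing thread's switch handle must be published only after all of its state has been saved, because other CPUs poll that field in z_sched_switch_spin(). The following is a rough C-level sketch of that ordering contract; arch_switch_sketch() and the save/restore helpers are placeholders standing in for the real per-architecture assembly, not Zephyr functions.

#include <zephyr/kernel.h>

/* Placeholders for the per-architecture register save/restore assembly
 * (e.g. _save_callee_saved_regs on ARC, the store/load sequences on arm64).
 */
extern void save_outgoing_context(struct k_thread *old_thread);
extern void restore_incoming_context(struct k_thread *new_thread);

void arch_switch_sketch(struct k_thread *old_thread, struct k_thread *new_thread)
{
	/* 1. Save every last piece of the outgoing thread's state. */
	save_outgoing_context(old_thread);

	/* 2. Only then publish the switch handle.  This store is what other
	 *    CPUs spin on in z_sched_switch_spin(), so nothing belonging to
	 *    the old thread may be written after it; per the TODO in the ARC
	 *    macro, weakly ordered SMP targets would also want a write
	 *    barrier immediately before this point.
	 */
	old_thread->switch_handle = old_thread;

	/* 3. Restore the incoming thread's context and resume it. */
	restore_incoming_context(new_thread);
}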

kernel/include/kswap.h

Lines changed: 28 additions & 10 deletions
@@ -30,16 +30,30 @@ void z_smp_release_global_lock(struct k_thread *thread);
 /* context switching and scheduling-related routines */
 #ifdef CONFIG_USE_SWITCH
 
-/* There is an unavoidable SMP race when threads swap -- their thread
- * record is in the queue (and visible to other CPUs) before
- * arch_switch() finishes saving state. We must spin for the switch
- * handle before entering a new thread. See docs on arch_switch().
+/* Spin, with the scheduler lock held (!), on a thread that is known
+ * (!!) to have released the lock and be on a path where it will
+ * deterministically (!!!) reach arch_switch() in very small constant
+ * time.
+ *
+ * This exists to treat an unavoidable SMP race when threads swap --
+ * their thread record is in the queue (and visible to other CPUs)
+ * before arch_switch() finishes saving state. We must spin for the
+ * switch handle before entering a new thread. See docs on
+ * arch_switch().
+ *
+ * Stated differently: there's a chicken and egg bug with the question
+ * of "is a thread running or not?". The thread needs to mark itself
+ * "not running" from its own context, but at that moment it obviously
+ * is still running until it reaches arch_switch()! Locking can't
+ * treat this because the scheduler lock can't be released by the
+ * switched-to thread, which is going to (obviously) be running its
+ * own code and doesn't know it was switched out.
  *
  * Note: future SMP architectures may need a fence/barrier or cache
  * invalidation here. Current ones don't, and sadly Zephyr doesn't
  * have a framework for that yet.
  */
-static inline void wait_for_switch(struct k_thread *thread)
+static inline void z_sched_switch_spin(struct k_thread *thread)
 {
 #ifdef CONFIG_SMP
 	volatile void **shp = (void *)&thread->switch_handle;

@@ -117,7 +131,7 @@ static ALWAYS_INLINE unsigned int do_swap(unsigned int key,
 	}
 #endif
 	z_thread_mark_switched_out();
-	wait_for_switch(new_thread);
+	z_sched_switch_spin(new_thread);
 	_current_cpu->current = new_thread;
 
 #ifdef CONFIG_TIMESLICING

@@ -131,10 +145,9 @@ static ALWAYS_INLINE unsigned int do_swap(unsigned int key,
 	arch_cohere_stacks(old_thread, NULL, new_thread);
 
 #ifdef CONFIG_SMP
-	/* Add _current back to the run queue HERE. After
-	 * wait_for_switch() we are guaranteed to reach the
-	 * context switch in finite time, avoiding a potential
-	 * deadlock.
+	/* Now add _current back to the run queue, once we are
+	 * guaranteed to reach the context switch in finite
+	 * time. See z_sched_switch_spin().
 	 */
 	z_requeue_current(old_thread);
 #endif

@@ -178,6 +191,11 @@ static inline void z_swap_unlocked(void)
 
 extern int arch_swap(unsigned int key);
 
+static inline void z_sched_switch_spin(struct k_thread *thread)
+{
+	ARG_UNUSED(thread);
+}
+
 static inline int z_swap_irqlock(unsigned int key)
 {
 	int ret;
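
For reference, the hunk above shows only the first line of the SMP body of z_sched_switch_spin(). A minimal sketch of what that body does follows; the shape of the loop is an assumption drawn from the surrounding comments, not a copy of the real function.

static inline void z_sched_switch_spin(struct k_thread *thread)
{
#ifdef CONFIG_SMP
	volatile void **shp = (void *)&thread->switch_handle;

	/* Busy-wait until the other CPU's arch_switch() publishes the
	 * switch handle, i.e. until the outgoing context is fully saved.
	 * Spinning with the scheduler lock held is tolerable only because,
	 * per the comment above, the thread is known to reach arch_switch()
	 * in very small constant time.
	 */
	while (*shp == NULL) {
		/* spin; a CPU relax hint could be added here */
	}
#else
	ARG_UNUSED(thread);
#endif
}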

kernel/sched.c

Lines changed: 1 addition & 1 deletion
@@ -1095,7 +1095,7 @@ void *z_get_next_switch_handle(void *interrupted)
 
 	if (old_thread != new_thread) {
 		update_metairq_preempt(new_thread);
-		wait_for_switch(new_thread);
+		z_sched_switch_spin(new_thread);
 		arch_cohere_stacks(old_thread, interrupted, new_thread);
 
 		_current_cpu->swap_ok = 0;
