Change the task_running() check to match mainline's implementation; the current one might have subtle races around context switching. wait_task_inactive() does not work lock-free on BFS, so its lockless busy-wait heuristic is removed and the running check is always made under the grq lock. This one could be a big one. Index: linux-2.6.31-bfs/kernel/sched_bfs.c =================================================================== --- linux-2.6.31-bfs.orig/kernel/sched_bfs.c 2009-09-23 21:57:28.581105081 +1000 +++ linux-2.6.31-bfs/kernel/sched_bfs.c 2009-09-24 00:06:13.894129792 +1000 @@ -313,11 +313,6 @@ inline void update_rq_clock(struct rq *r rq->clock = sched_clock_cpu(cpu_of(rq)); } -static inline int task_running(struct task_struct *p) -{ - return (!!p->oncpu); -} - static inline void grq_lock(void) __acquires(grq.lock) { @@ -449,7 +444,17 @@ static inline void __task_grq_unlock(voi grq_unlock(); } +static inline int task_current(struct rq *rq, struct task_struct *p) +{ + return rq->curr == p; +} + #ifndef __ARCH_WANT_UNLOCKED_CTXSW +static inline int task_running(struct rq *rq, struct task_struct *p) +{ + return task_current(rq, p); +} + static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { } @@ -471,9 +476,20 @@ static inline void finish_lock_switch(st } #else /* __ARCH_WANT_UNLOCKED_CTXSW */ +static inline int task_running(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SMP + return p->oncpu; +#else + return task_current(rq, p); +#endif +} static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { +#ifdef CONFIG_SMP + next->oncpu = 1; +#endif #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW grq_unlock_irq(); #else @@ -483,7 +499,15 @@ static inline void prepare_lock_switch(s static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { +#ifdef CONFIG_SMP + /* + * After ->oncpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. 
+ */ smp_wmb(); + prev->oncpu = 0; +#endif #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW local_irq_enable(); #endif @@ -504,9 +528,9 @@ static inline int task_queued(struct tas return (!list_empty(&p->run_list)); } -static inline int task_queued_only(struct task_struct *p) +static inline int task_queued_only(struct task_struct *p, struct rq *rq) { - return (!list_empty(&p->run_list) && !task_running(p)); + return (!list_empty(&p->run_list) && !task_running(rq, p)); } /* @@ -861,7 +885,7 @@ void wait_task_context_switch(struct tas * iteration. */ rq = task_grq_lock(p, &flags); - running = task_running(p); + running = task_running(rq, p); task_grq_unlock(&flags); if (likely(!running)) @@ -904,39 +928,9 @@ unsigned long wait_task_inactive(struct struct rq *rq; for (;;) { - /* - * We do the initial early heuristics without holding - * any task-queue locks at all. We'll only try to get - * the runqueue lock when things look like they will - * work out! - */ - rq = task_rq(p); - - /* - * If the task is actively running on another CPU - * still, just relax and busy-wait without holding - * any locks. - * - * NOTE! Since we don't hold any locks, it's not - * even sure that "rq" stays as the right runqueue! - * But we don't care, since this will - * return false if the runqueue has changed and p - * is actually now running somewhere else! - */ - while (task_running(p) && p == rq->curr) { - if (match_state && unlikely(p->state != match_state)) - return 0; - cpu_relax(); - } - - /* - * Ok, time to look more closely! We need the grq - * lock now, to be *sure*. If we're wrong, we'll - * just go back and repeat. 
- */ rq = task_grq_lock(p, &flags); trace_sched_wait_task(rq, p); - running = task_running(p); + running = task_running(rq, p); on_rq = task_queued(p); ncsw = 0; if (!match_state || p->state == match_state) @@ -1216,8 +1210,9 @@ void sched_fork(struct task_struct *p, i memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) p->oncpu = 0; - +#endif #ifdef CONFIG_PREEMPT /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; @@ -2039,10 +2034,8 @@ static void task_running_tick(struct rq /* p->time_slice <= 0. We only modify task_struct under grq lock */ grq_lock(); p = rq->curr; - if (likely(task_running(p))) { - requeue_task(p); - set_tsk_need_resched(p); - } + requeue_task(p); + set_tsk_need_resched(p); grq_unlock(); } @@ -2308,14 +2301,6 @@ need_resched_nonpreemptible: schedule_debug(prev); idle = rq->idle; - /* - * The idle thread is not allowed to schedule! - * Remove this check after it has been exercised a bit. 
- */ - if (unlikely(prev == idle) && prev->state != TASK_RUNNING) { - printk(KERN_ERR "bad: scheduling from the idle thread!\n"); - dump_stack(); - } grq_lock_irq(); update_rq_clock(rq); @@ -2366,8 +2351,6 @@ need_resched_nonpreemptible: sched_info_switch(prev, next); grq.nr_switches++; - next->oncpu = 1; - prev->oncpu = 0; rq->curr = next; ++*switch_count; @@ -2869,11 +2852,11 @@ void rt_mutex_setprio(struct task_struct rq = time_task_grq_lock(p, &flags); oldprio = p->prio; - queued = task_queued_only(p); + queued = task_queued_only(p, rq); if (queued) dequeue_task(p); p->prio = prio; - if (task_running(p) && prio > oldprio) + if (task_running(rq, p) && prio > oldprio) resched_task(p); if (queued) { enqueue_task(p); @@ -2919,7 +2902,7 @@ void set_user_nice(struct task_struct *p p->static_prio = new_static; goto out_unlock; } - queued = task_queued_only(p); + queued = task_queued_only(p, rq); /* * If p is actually running, we don't need to do anything when * changing the priority because the grq is unaffected. @@ -2937,7 +2920,7 @@ void set_user_nice(struct task_struct *p } /* Just resched the task, schedule() will know what to do. */ - if (task_running(p)) + if (task_running(rq, p)) resched_task(p); out_unlock: task_grq_unlock(&flags); @@ -3062,9 +3045,10 @@ static inline struct task_struct *find_p } /* Actually do priority change: must hold grq lock. */ -static void __setscheduler(struct task_struct *p, int policy, int prio) +static void +__setscheduler(struct task_struct *p, struct rq *rq, int policy, int prio) { - BUG_ON(task_queued_only(p)); + BUG_ON(task_queued_only(p, rq)); p->policy = policy; p->rt_priority = prio; @@ -3075,7 +3059,7 @@ static void __setscheduler(struct task_s * Reschedule if running. schedule() will know if it can continue * running or not. 
*/ - if (task_running(p)) + if (task_running(rq, p)) resched_task(p); } @@ -3207,11 +3191,11 @@ recheck: goto recheck; } update_rq_clock(rq); - queued = task_queued_only(p); + queued = task_queued_only(p, rq); if (queued) dequeue_task(p); oldprio = p->prio; - __setscheduler(p, policy, param->sched_priority); + __setscheduler(p, rq, policy, param->sched_priority); if (queued) { enqueue_task(p); try_preempt(p); @@ -3855,11 +3839,13 @@ void __cpuinit init_idle(struct task_str idle->cpus_allowed = cpumask_of_cpu(cpu); set_task_cpu(idle, cpu); rq->curr = rq->idle = idle; - idle->oncpu = 1; set_cpuidle_map(cpu); #ifdef CONFIG_HOTPLUG_CPU idle->unplugged_mask = CPU_MASK_NONE; #endif +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + idle->oncpu = 1; +#endif grq_unlock_irqrestore(&flags); /* Set the preempt count _outside_ the spinlocks! */ @@ -4039,7 +4025,7 @@ int set_cpus_allowed_ptr(struct task_str goto out; } - queued = task_queued_only(p); + queued = task_queued_only(p, rq); cpumask_copy(&p->cpus_allowed, new_mask); p->rt_nr_cpus_allowed = cpumask_weight(new_mask); @@ -4048,7 +4034,7 @@ int set_cpus_allowed_ptr(struct task_str if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - if (task_running(p)) { + if (task_running(rq, p)) { /* Task is running on the wrong cpu now, reschedule it. 
*/ set_tsk_need_resched(p); running_wrong = 1; @@ -4088,7 +4074,7 @@ void sched_idle_next(void) */ time_grq_lock(rq, &flags); - __setscheduler(idle, SCHED_FIFO, MAX_RT_PRIO - 1); + __setscheduler(idle, rq, SCHED_FIFO, MAX_RT_PRIO - 1); activate_idle_task(idle); set_tsk_need_resched(rq->curr); @@ -4413,7 +4399,7 @@ migration_call(struct notifier_block *nf remove_cpu(cpu); return_task(idle, 1); idle->static_prio = MAX_PRIO; - __setscheduler(idle, SCHED_NORMAL, 0); + __setscheduler(idle, rq, SCHED_NORMAL, 0); idle->prio = PRIO_LIMIT; set_rq_task(rq, idle); update_rq_clock(rq); @@ -6026,10 +6012,10 @@ void normalize_rt_tasks(void) rq = __task_grq_lock(p); update_rq_clock(rq); - queued = task_queued_only(p); + queued = task_queued_only(p, rq); if (queued) dequeue_task(p); - __setscheduler(p, SCHED_NORMAL, 0); + __setscheduler(p, rq, SCHED_NORMAL, 0); if (queued) { enqueue_task(p); try_preempt(p);