Looks like I was right about it being a flaky way to change CPUs. Also, waking up random idle runqueues meant they were racing to try to grab the (wrong) task. Index: linux-2.6.31-bfs/kernel/sched_bfs.c =================================================================== --- linux-2.6.31-bfs.orig/kernel/sched_bfs.c 2009-09-20 17:50:19.266614403 +1000 +++ linux-2.6.31-bfs/kernel/sched_bfs.c 2009-09-21 00:14:13.360306520 +1000 @@ -2008,11 +2008,8 @@ void scheduler_tick(void) update_cpu_clock(rq, rq->curr, 1); if (!rq_idle(rq)) task_running_tick(rq); - else { + else no_iso_tick(); - if (unlikely(queued_notrunning())) - set_tsk_need_resched(rq->idle); - } } notrace unsigned long get_parent_ip(unsigned long addr) @@ -3991,7 +3988,7 @@ void wake_up_idle_cpu(int cpu) int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { unsigned long flags; - int running = 0; + int running_wrong = 0; int queued = 0; struct rq *rq; int ret = 0; @@ -4017,10 +4014,11 @@ int set_cpus_allowed_ptr(struct task_str if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - /* Reschedule the task, schedule() will know if it can keep running */ - if (task_running(p)) - running = 1; - else + if (task_running(p)) { + /* Task is running on the wrong cpu now, reschedule it. */ + set_tsk_need_resched(p); + running_wrong = 1; + } else set_task_cpu(p, cpumask_any_and(cpu_online_mask, new_mask)); out: @@ -4028,9 +4026,9 @@ out: try_preempt(p); task_grq_unlock(&flags); - /* This might be a flaky way of changing cpus! */ - if (running) - schedule(); + if (running_wrong) + _cond_resched(); + return ret; } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);