/*
 * Patch description (free text ahead of the "Index:" header).
 *
 * Target: kernel/sched_bfs.c in the linux-2.6.30-bfs tree.
 *
 * What the hunks below change, in file order:
 *
 *  - Adds no_idle_cpus(). With CONFIG_SMP it returns
 *    cpus_empty(grq.cpu_idle_map); without CONFIG_SMP it returns 1.
 *
 *  - In the hunk that follows the wake_up_new_task() header comment
 *    (presumably the body of that function), the child-runs-first branch
 *    (set_tsk_need_resched(parent) plus rq->preempt_next = p) now also
 *    requires no_idle_cpus(); otherwise try_preempt(p, rq) is called.
 *    The two statements inside the branch are removed and re-added with
 *    the same text, so that part looks like a whitespace-only change.
 *
 *  - In the hunks near line 1870 (enclosing function name not visible
 *    here), rq->cpu is cached in a local "cpu". rq->preempt_next is only
 *    taken when it is still queued AND "cpu" is set in its cpus_allowed.
 *    The two later affinity checks use the cached value.
 *
 *  - Near line 2084 the block that handed "next" back via return_task()
 *    and kept "prev" running when idle_cpu_available(next) was true is
 *    deleted together with its comment. Only the set_cpuidle_map() /
 *    clear_cpuidle_map() calls remain.
 *
 *  - Near lines 2599, 2945 and 5716 a running task is now rescheduled
 *    (resched_task(p) when task_running(p)) right after its priority or
 *    policy is changed. In the last of these, the queued case calls
 *    try_preempt(p, rq) instead of resched_task(rq->curr).
 *
 *  - Near line 2662 the delta-dependent logic is replaced: a queued task
 *    is re-enqueued and passed to try_preempt(); a running task is always
 *    rescheduled, regardless of the sign of delta.
 *
 *  - Near line 3801 activate_task(idle) becomes activate_idle_task(idle).
 *
 *  - remove_cpu(): for each thread, cpus_remaining is computed as
 *    cpus_allowed AND cpu_online_map with the outgoing cpu cleared. Only
 *    when that is empty is cpus_allowed saved into unplugged_mask and
 *    then replaced by cpu_possible_map. The old code cleared the cpu
 *    from every task's cpus_allowed.
 *
 * NOTE(review): in this copy the hunks appear joined onto a few very long
 * lines with indentation collapsed. The original line breaks presumably
 * need to be restored before the patch can be applied - confirm against
 * the upstream patch.
 */
Index: linux-2.6.30-bfs/kernel/sched_bfs.c =================================================================== --- linux-2.6.30-bfs.orig/kernel/sched_bfs.c 2009-09-04 21:19:05.480630425 +1000 +++ linux-2.6.30-bfs/kernel/sched_bfs.c 2009-09-04 21:46:16.936724134 +1000 @@ -1071,6 +1071,18 @@ put_cpu(); } +#ifdef CONFIG_SMP +static int no_idle_cpus(void) +{ + return (cpus_empty(grq.cpu_idle_map)); +} +#else +static int no_idle_cpus(void) +{ + return 1; +} +#endif + /* * wake_up_new_task - wake up a newly created task for the first time. * @@ -1090,17 +1102,15 @@ activate_task(p); trace_sched_wakeup_new(rq, p, 1); - if (!(clone_flags & CLONE_VM) && rq->curr == parent) { + if (!(clone_flags & CLONE_VM) && rq->curr == parent && + no_idle_cpus()) { /* * The VM isn't cloned, so we're in a good position to * do child-runs-first in anticipation of an exec. This - * usually avoids a lot of COW overhead. The parent is the - * one time that a task that is descheduled on SMP does not - * immediately get to look for a cpu here, so do it in - * schedule(). + * usually avoids a lot of COW overhead. 
*/ - set_tsk_need_resched(parent); - rq->preempt_next = p; + set_tsk_need_resched(parent); + rq->preempt_next = p; } else try_preempt(p, rq); task_grq_unlock(&flags); @@ -1860,13 +1870,15 @@ { unsigned long long_deadline, shortest_deadline; struct task_struct *edt, *p; + unsigned int cpu = rq->cpu; struct list_head *queue; int idx = 0; if (rq->preempt_next) { - if (likely(task_queued(rq->preempt_next))) { - edt = rq->preempt_next; - goto out_take; + if (likely(task_queued(rq->preempt_next) && + cpu_isset(cpu, rq->preempt_next->cpus_allowed))) { + edt = rq->preempt_next; + goto out_take; } } retry: @@ -1875,7 +1887,7 @@ if (idx < MAX_RT_PRIO) { /* We found rt tasks */ list_for_each_entry(p, queue, run_list) { - if (cpu_isset(rq->cpu, p->cpus_allowed)) { + if (cpu_isset(cpu, p->cpus_allowed)) { edt = p; goto out_take; } @@ -1896,7 +1908,7 @@ list_for_each_entry(p, queue, run_list) { unsigned long deadline_diff; /* Make sure cpu affinity is ok */ - if (!cpu_isset(rq->cpu, p->cpus_allowed)) + if (!cpu_isset(cpu, p->cpus_allowed)) continue; deadline_diff = p->deadline - jiffies; @@ -2072,25 +2084,10 @@ schedstat_inc(rq, sched_goidle); } - /* - * If we find an idle cpu that we can wake next onto, we wake that one - * up and move next to that cpu. This allows prev to stay on this cpu - * for cache benefits. This is also where the parent from - * wake_up_new_task doesn't miss an opportunity to schedule its child - * onto another cpu. Optimised out on !SMP. 
- */ if (next == rq->idle) set_cpuidle_map(cpu); - else { + else clear_cpuidle_map(cpu); - if (prev != next && prev != rq->idle && !deactivate && - cpu_isset(cpu, prev->cpus_allowed) && - idle_cpu_available(next)) { - return_task(next, 0); - next = prev; - take_task(rq, next); - } - } rq->preempt_next = NULL; @@ -2599,6 +2596,8 @@ if (queued) dequeue_task(p); p->prio = prio; + if (task_running(p)) + resched_task(p); if (queued) { enqueue_task(p); try_preempt(p, rq); @@ -2662,16 +2661,14 @@ p->static_prio = new_static; p->prio = effective_prio(p); - if (queued) + if (queued) { enqueue_task(p); - /* - * If the task increased its priority or is running and - * lowered its priority, then reschedule its CPU: - */ - if (task_running(p) && delta > 0) - resched_task(p); - else if (queued && !task_running(p) && delta < 0) try_preempt(p, rq); + } + + /* Just resched the task, schedule() will know what to do. */ + if (task_running(p)) + resched_task(p); out_unlock: task_grq_unlock(&flags); } @@ -2945,6 +2942,8 @@ dequeue_task(p); oldprio = p->prio; __setscheduler(p, policy, param->sched_priority); + if (task_running(p)) + resched_task(p); if (queued) { enqueue_task(p); try_preempt(p, rq); @@ -3801,7 +3800,7 @@ __setscheduler(idle, SCHED_FIFO, MAX_RT_PRIO - 1); update_rq_clock(rq); - activate_task(idle); + activate_idle_task(idle); rq->preempt_next = idle; resched_task(rq->curr); @@ -4010,9 +4009,11 @@ #ifdef CONFIG_HOTPLUG_CPU /* - * This cpu is going down, so remove it from the cpus_allowed. No need to do - * anything special since they'll just move on next reschedule if they're - * running, and they're not on a cpu if they're in the global queue. + * This cpu is going down, so walk over the tasklist and find tasks that can + * only run on this cpu and remove their affinity. Store their value in + * unplugged_mask so it can be restored once their correct cpu is online. No + * need to do anything special since they'll just move on next reschedule if + * they're running. 
*/ static void remove_cpu(unsigned long cpu) { @@ -4021,16 +4022,14 @@ read_lock(&tasklist_lock); do_each_thread(t, p) { - /* - * Store the "real" affinity in unplugged_mask. Copy the - * allowed cpus if it has not yet been set. - */ - if (cpus_empty(p->unplugged_mask)) + cpumask_t cpus_remaining; + + cpus_and(cpus_remaining, p->cpus_allowed, cpu_online_map); + cpu_clear(cpu, cpus_remaining); + if (cpus_empty(cpus_remaining)) { p->unplugged_mask = p->cpus_allowed; - cpu_clear(cpu, p->cpus_allowed); - if (cpus_empty(p->cpus_allowed)) - cpus_andnot(p->cpus_allowed, cpu_online_map, - cpumask_of_cpu(cpu)); + p->cpus_allowed = cpu_possible_map; + } } while_each_thread(t, p); read_unlock(&tasklist_lock); @@ -5716,9 +5715,11 @@ if (queued) dequeue_task(p); __setscheduler(p, SCHED_NORMAL, 0); + if (task_running(p)) + resched_task(p); if (queued) { enqueue_task(p); - resched_task(rq->curr); + try_preempt(p, rq); } __task_grq_unlock();