Fragile boot only helped decrease the rate of failed boots; it didn't fix the problem. Remove it. There is a theoretical way a task can be affined to only one cpu and not wake that cpu up. Fix it. Remove 0 as an option for rr_interval. It's pointless anyway and would just have needed more tests for divide-by-zero. Change try_preempt to try the task's previous runqueue first instead of the runqueue of the calling cpu, to maintain a bit of extra cache warmth. Index: linux-2.6.31-bfs/init/main.c =================================================================== --- linux-2.6.31-bfs.orig/init/main.c 2009-09-15 22:50:37.645347487 +1000 +++ linux-2.6.31-bfs/init/main.c 2009-09-15 22:51:02.984350856 +1000 @@ -829,8 +829,6 @@ kernel_execve(init_filename, argv_init, envp_init); } -int fragile_boot __read_mostly = 1; - /* This is a non __init function. Force it to be noinline otherwise gcc * makes it inline to init() and it becomes part of init.text section */ @@ -853,9 +851,6 @@ current->signal->flags |= SIGNAL_UNKILLABLE; - printk(KERN_INFO "Disabling Fragile boot.\n"); - fragile_boot = 0; - if (ramdisk_execute_command) { run_init_process(ramdisk_execute_command); printk(KERN_WARNING "Failed to execute %s\n", Index: linux-2.6.31-bfs/kernel/sched_bfs.c =================================================================== --- linux-2.6.31-bfs.orig/kernel/sched_bfs.c 2009-09-15 22:50:37.616347206 +1000 +++ linux-2.6.31-bfs/kernel/sched_bfs.c 2009-09-15 23:18:49.065950223 +1000 @@ -166,7 +166,6 @@ #ifdef CONFIG_SMP cpumask_t cpu_idle_map; #endif - void (*wunt)(struct task_struct *, struct rq *, unsigned long); }; static struct global_rq grq; @@ -939,14 +938,16 @@ /* * Wake up *any* suitable cpu to schedule this task. 
*/ -static void try_preempt(struct task_struct *p, struct rq *this_rq) +static void try_preempt(struct task_struct *p) { + struct rq *highest_prio_rq, *this_rq; unsigned long latest_deadline, cpu; - struct rq *highest_prio_rq; int highest_prio; cpumask_t tmp; - /* Use this_rq as baseline and fall back on */ + /* Try the task's previous rq first and as a fallback */ + this_rq = task_rq(p); + if (cpu_isset(this_rq->cpu, p->cpus_allowed)) { highest_prio_rq = this_rq; /* If this_rq is idle, use that. */ @@ -1059,7 +1060,7 @@ * instead waiting for current to deschedule. */ if (!sync || (sync && !no_idle_cpus())) - try_preempt(p, rq); + try_preempt(p); success = 1; out_running: @@ -1166,14 +1167,15 @@ * that must be done for every newly created context, then puts the task * on the runqueue and wakes it. */ -static void -normal_wunt(struct task_struct *p, struct rq *rq, unsigned long clone_flags) +void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { struct task_struct *parent = p->parent; + unsigned long flags; + struct rq *rq; BUG_ON(p->state != TASK_RUNNING); - set_task_cpu(p, task_cpu(parent)); + rq = time_task_grq_lock(p, &flags); activate_task(p, rq); trace_sched_wakeup_new(rq, p, 1); if (!(clone_flags & CLONE_VM) && rq->curr == parent && @@ -1186,41 +1188,8 @@ set_tsk_need_resched(parent); rq->preempt_next = p; } else - try_preempt(p, rq); -} - -extern int fragile_boot; - -/* Fragile version to not wake to other cpus during boot */ -static void -fb_wunt(struct task_struct *p, struct rq *rq, unsigned long clone_flags) -{ - struct task_struct *parent = p->parent; - - BUG_ON(p->state != TASK_RUNNING); - set_task_cpu(p, task_cpu(parent)); - - activate_task(p, rq); - trace_sched_wakeup_new(rq, p, 1); - /* Child always runs first */ - set_tsk_need_resched(parent); - rq->preempt_next = p; - /* - * fragile_boot is set initially and unset only once just before - * init so we change to normal wunt from here onwards, the ->wunt - * pointer is protected by 
grq lock. - */ - if (!fragile_boot) - grq.wunt = normal_wunt; -} - -void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) -{ - unsigned long flags; - - struct rq *rq = time_task_grq_lock(p, &flags); - grq.wunt(p, rq, clone_flags); - grq_unlock_irqrestore(&flags); + try_preempt(p); + task_grq_unlock(&flags); } /* @@ -2928,7 +2897,7 @@ resched_task(p); if (queued) { enqueue_task(p); - try_preempt(p, rq); + try_preempt(p); } /* @@ -2991,7 +2960,7 @@ if (queued) { enqueue_task(p); - try_preempt(p, rq); + try_preempt(p); } /* Just resched the task, schedule() will know what to do. */ @@ -3274,7 +3243,7 @@ resched_task(p); if (queued) { enqueue_task(p); - try_preempt(p, rq); + try_preempt(p); } __task_grq_unlock(); spin_unlock_irqrestore(&p->pi_lock, flags); @@ -4083,9 +4052,10 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { unsigned long flags; + int running = 0; + int queued = 0; struct rq *rq; int ret = 0; - int running = 0; rq = task_grq_lock(p, &flags); if (!cpumask_intersects(new_mask, cpu_online_mask)) { @@ -4099,6 +4069,8 @@ goto out; } + queued = task_queued(p); + cpumask_copy(&p->cpus_allowed, new_mask); p->rt_nr_cpus_allowed = cpumask_weight(new_mask); @@ -4113,6 +4085,8 @@ set_task_cpu(p, cpumask_any_and(cpu_online_mask, new_mask)); out: + if (queued) + try_preempt(p); task_grq_unlock(&flags); /* This might be a flaky way of changing cpus! 
*/ @@ -5967,7 +5941,6 @@ cpus_clear(grq.cpu_idle_map); #endif spin_lock_init(&grq.lock); - grq.wunt = fb_wunt; for_each_possible_cpu(i) { struct rq *rq; @@ -6085,7 +6058,7 @@ resched_task(p); if (queued) { enqueue_task(p); - try_preempt(p, rq); + try_preempt(p); } __task_grq_unlock(); Index: linux-2.6.31-bfs/Documentation/sysctl/kernel.txt =================================================================== --- linux-2.6.31-bfs.orig/Documentation/sysctl/kernel.txt 2009-09-15 23:19:31.470075641 +1000 +++ linux-2.6.31-bfs/Documentation/sysctl/kernel.txt 2009-09-15 23:19:53.210068612 +1000 @@ -353,11 +353,9 @@ overall. Conversely decreasing it will decrease average and maximum latencies but at the expense of throughput. This value is in milliseconds and the default value chosen depends on the number of -cpus available at scheduler initialisation with a minimum of 6. The -value can be set to 0 which means no more than one tick (limited -by HZ resolution). +cpus available at scheduler initialisation with a minimum of 6. -Valid values are from 0-5000. +Valid values are from 1-5000. ============================================================== Index: linux-2.6.31-bfs/kernel/sysctl.c =================================================================== --- linux-2.6.31-bfs.orig/kernel/sysctl.c 2009-09-15 23:20:01.326206072 +1000 +++ linux-2.6.31-bfs/kernel/sysctl.c 2009-09-15 23:20:13.952950214 +1000 @@ -682,7 +682,7 @@ .mode = 0644, .proc_handler = &proc_dointvec_minmax, .strategy = &sysctl_intvec, - .extra1 = &zero, + .extra1 = &one, .extra2 = &five_thousand, }, {