Add a special "fragile boot" wake-up path to cope with hardware initialisation that seems to suffer from BFS's aggressive wake-to-new-CPU behaviour; the scheduler reverts to the normal wake-up path just before init. Fix up a major screwup in testing for preempt: < should have been >. Use this as an opportunity to clean up and improve try_preempt. It may also have been wasting effort on a CPU that the task couldn't be scheduled on. Index: linux-2.6.31-bfs/init/main.c =================================================================== --- linux-2.6.31-bfs.orig/init/main.c 2009-09-14 09:33:13.977147049 +1000 +++ linux-2.6.31-bfs/init/main.c 2009-09-14 15:00:57.003282292 +1000 @@ -829,6 +829,8 @@ kernel_execve(init_filename, argv_init, envp_init); } +int fragile_boot __read_mostly = 1; + /* This is a non __init function. Force it to be noinline otherwise gcc * makes it inline to init() and it becomes part of init.text section */ @@ -851,6 +853,9 @@ current->signal->flags |= SIGNAL_UNKILLABLE; + printk(KERN_INFO "Disabling Fragile boot.\n"); + fragile_boot = 0; + if (ramdisk_execute_command) { run_init_process(ramdisk_execute_command); printk(KERN_WARNING "Failed to execute %s\n", Index: linux-2.6.31-bfs/kernel/sched_bfs.c =================================================================== --- linux-2.6.31-bfs.orig/kernel/sched_bfs.c 2009-09-14 09:33:13.997144999 +1000 +++ linux-2.6.31-bfs/kernel/sched_bfs.c 2009-09-14 16:31:09.329783746 +1000 @@ -166,6 +166,7 @@ #ifdef CONFIG_SMP cpumask_t cpu_idle_map; #endif + void (*wunt)(struct task_struct *, struct rq *, unsigned long); }; static struct global_rq grq; @@ -930,7 +931,7 @@ if (p->prio < curr->prio) preempts = 1; else if (p->policy == SCHED_NORMAL && (p->prio == curr->prio && - p->deadline < rq->rq_deadline)) + time_before(p->deadline, rq->rq_deadline))) preempts = 1; return preempts; } @@ -941,52 +942,50 @@ static void try_preempt(struct task_struct *p, struct rq *this_rq) { unsigned long latest_deadline, cpu; - struct rq *lowest_prio_rq; - int 
lowest_prio; + struct rq *highest_prio_rq; + int highest_prio; cpumask_t tmp; /* Use this_rq as baseline and fall back on */ - lowest_prio_rq = this_rq; - - if (cpu_isset(this_rq->cpu, p->cpus_allowed) && rq_idle(this_rq)) { - /* this_rq is idle, use that. */ - lowest_prio_rq = this_rq; - goto found_rq; - } - + if (cpu_isset(this_rq->cpu, p->cpus_allowed)) { + highest_prio_rq = this_rq; + /* If this_rq is idle, use that. */ + if (rq_idle(this_rq)) + goto found_rq; + } else + highest_prio_rq = cpu_rq(any_online_cpu(p->cpus_allowed)); latest_deadline = this_rq->rq_deadline; - lowest_prio = this_rq->rq_prio; + highest_prio = this_rq->rq_prio; cpus_and(tmp, cpu_online_map, p->cpus_allowed); + for_each_cpu_mask(cpu, tmp) { - unsigned long rq_deadline; - int rq_prio; struct rq *rq; + int rq_prio; rq = cpu_rq(cpu); if (rq_idle(rq)) { /* found an idle rq, use that one */ - lowest_prio_rq = rq; + highest_prio_rq = rq; goto found_rq; } rq_prio = rq->rq_prio; - rq_deadline = rq->rq_deadline; - if (rq_prio < lowest_prio || - (rq_prio == lowest_prio && - rq_deadline > latest_deadline)) { - lowest_prio = rq_prio; - latest_deadline = rq_deadline; - lowest_prio_rq = rq; + if (rq_prio > highest_prio || + (rq_prio == highest_prio && + time_after(rq->rq_deadline, latest_deadline))) { + highest_prio = rq_prio; + latest_deadline = rq->rq_deadline; + highest_prio_rq = rq; } } - if (!task_preempts_curr(p, lowest_prio_rq)) + if (!task_preempts_curr(p, highest_prio_rq)) return; found_rq: - resched_task(lowest_prio_rq->curr); - lowest_prio_rq->preempt_next = p; + resched_task(highest_prio_rq->curr); + highest_prio_rq->preempt_next = p; return; } @@ -1167,13 +1166,11 @@ * that must be done for every newly created context, then puts the task * on the runqueue and wakes it. 
*/ -void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) +static void +normal_wunt(struct task_struct *p, struct rq *rq, unsigned long clone_flags) { - struct task_struct *parent; - unsigned long flags; - struct rq *rq = time_task_grq_lock(p, &flags); + struct task_struct *parent = p->parent; - parent = p->parent; BUG_ON(p->state != TASK_RUNNING); set_task_cpu(p, task_cpu(parent)); @@ -1190,7 +1187,40 @@ rq->preempt_next = p; } else try_preempt(p, rq); - task_grq_unlock(&flags); +} + +extern int fragile_boot; + +/* Fragile version to not wake to other cpus during boot */ +static void +fb_wunt(struct task_struct *p, struct rq *rq, unsigned long clone_flags) +{ + struct task_struct *parent = p->parent; + + BUG_ON(p->state != TASK_RUNNING); + set_task_cpu(p, task_cpu(parent)); + + activate_task(p, rq); + trace_sched_wakeup_new(rq, p, 1); + /* Child always runs first */ + set_tsk_need_resched(parent); + rq->preempt_next = p; + /* + * fragile_boot is set initially and unset only once just before + * init so we change to normal wunt from here onwards, the ->wunt + * pointer is protected by grq lock. + */ + if (!fragile_boot) + grq.wunt = normal_wunt; +} + +void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) +{ + unsigned long flags; + + struct rq *rq = time_task_grq_lock(p, &flags); + grq.wunt(p, rq, clone_flags); + grq_unlock_irqrestore(&flags); } /* @@ -5937,6 +5967,7 @@ cpus_clear(grq.cpu_idle_map); #endif spin_lock_init(&grq.lock); + grq.wunt = fb_wunt; for_each_possible_cpu(i) { struct rq *rq;