THIS ONE'S FOR DEBUGGING STALLED BOOTS. THEY WILL HAPPEN MORE OFTEN! Change the task_running check to match mainline's. There might be subtle races there. Change wait_task_inactive to only check under the lock. Make the schedule() layout match current mainline. Slight locking optimisation in schedule(). Get rid of the scheduling-from-idle-task check. Only update the rq clock on wakeup when the task will actually be activated (performance gain). Add the missing perf counter call in schedule(). Don't try_preempt in try_to_wake_up when the task is already running. old_state does nothing in try_to_wake_up; remove it. We know this_rq on entry to try_preempt, so pass it in instead of re-evaluating it. sched_init oopses on high cpu numbers on kvm due to reaping of memory, so make it not __init because I can't be arsed putting in kzalloc calls. --- kernel/sched_bfs.c | 194 ++++++++++++++++++++++++----------------------------- 1 file changed, 89 insertions(+), 105 deletions(-) Index: linux-2.6.31-bfs/kernel/sched_bfs.c =================================================================== --- linux-2.6.31-bfs.orig/kernel/sched_bfs.c 2009-09-24 21:13:39.683129384 +1000 +++ linux-2.6.31-bfs/kernel/sched_bfs.c 2009-09-24 23:35:24.487380186 +1000 @@ -313,11 +313,6 @@ inline void update_rq_clock(struct rq *r rq->clock = sched_clock_cpu(cpu_of(rq)); } -static inline int task_running(struct task_struct *p) -{ - return (!!p->oncpu); -} - static inline void grq_lock(void) __acquires(grq.lock) { @@ -449,7 +444,17 @@ static inline void __task_grq_unlock(voi grq_unlock(); } +static inline int task_current(struct rq *rq, struct task_struct *p) +{ + return rq->curr == p; +} + #ifndef __ARCH_WANT_UNLOCKED_CTXSW +static inline int task_running(struct rq *rq, struct task_struct *p) +{ + return task_current(rq, p); +} + static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { } @@ -471,9 +476,20 @@ static inline void finish_lock_switch(st } #else /* __ARCH_WANT_UNLOCKED_CTXSW */ +static inline int task_running(struct rq *rq, struct 
task_struct *p) +{ +#ifdef CONFIG_SMP + return p->oncpu; +#else + return task_current(rq, p); +#endif +} static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { +#ifdef CONFIG_SMP + next->oncpu = 1; +#endif #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW grq_unlock_irq(); #else @@ -483,7 +499,15 @@ static inline void prepare_lock_switch(s static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { +#ifdef CONFIG_SMP + /* + * After ->oncpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ smp_wmb(); + prev->oncpu = 0; +#endif #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW local_irq_enable(); #endif @@ -504,9 +528,9 @@ static inline int task_queued(struct tas return (!list_empty(&p->run_list)); } -static inline int task_queued_only(struct task_struct *p) +static inline int task_queued_only(struct task_struct *p, struct rq *rq) { - return (!list_empty(&p->run_list) && !task_running(p)); + return (!list_empty(&p->run_list) && !task_running(rq, p)); } /* @@ -702,7 +726,10 @@ static int effective_prio(struct task_st */ static void activate_task(struct task_struct *p, struct rq *rq) { - u64 now = rq->clock; + u64 now; + + update_rq_clock(rq); + now = rq->clock; /* * Sleep time is in units of nanosecs, so shift by 20 to get a @@ -861,7 +888,7 @@ void wait_task_context_switch(struct tas * iteration. */ rq = task_grq_lock(p, &flags); - running = task_running(p); + running = task_running(rq, p); task_grq_unlock(&flags); if (likely(!running)) @@ -904,39 +931,9 @@ unsigned long wait_task_inactive(struct struct rq *rq; for (;;) { - /* - * We do the initial early heuristics without holding - * any task-queue locks at all. We'll only try to get - * the runqueue lock when things look like they will - * work out! - */ - rq = task_rq(p); - - /* - * If the task is actively running on another CPU - * still, just relax and busy-wait without holding - * any locks. 
- * - * NOTE! Since we don't hold any locks, it's not - * even sure that "rq" stays as the right runqueue! - * But we don't care, since this will - * return false if the runqueue has changed and p - * is actually now running somewhere else! - */ - while (task_running(p) && p == rq->curr) { - if (match_state && unlikely(p->state != match_state)) - return 0; - cpu_relax(); - } - - /* - * Ok, time to look more closely! We need the grq - * lock now, to be *sure*. If we're wrong, we'll - * just go back and repeat. - */ rq = task_grq_lock(p, &flags); trace_sched_wait_task(rq, p); - running = task_running(p); + running = task_running(rq, p); on_rq = task_queued(p); ncsw = 0; if (!match_state || p->state == match_state) @@ -1034,16 +1031,13 @@ static inline int task_preempts_curr(str /* * Wake up *any* suitable cpu to schedule this task. */ -static void try_preempt(struct task_struct *p) +static void try_preempt(struct task_struct *p, struct rq *this_rq) { - struct rq *highest_prio_rq, *this_rq; unsigned long latest_deadline, cpu; + struct rq *highest_prio_rq; int highest_prio; cpumask_t tmp; - /* Try the task's previous rq first and as a fallback */ - this_rq = task_rq(p); - if (cpu_isset(this_rq->cpu, p->cpus_allowed)) { highest_prio_rq = this_rq; /* If this_rq is idle, use that. */ @@ -1124,12 +1118,11 @@ static int try_to_wake_up(struct task_st { unsigned long flags; int success = 0; - long old_state; struct rq *rq; - rq = time_task_grq_lock(p, &flags); - old_state = p->state; - if (!(old_state & state)) + /* No need to update the rq clock unless we activate the task */ + rq = task_grq_lock(p, &flags); + if (!(p->state & state)) goto out_unlock; /* @@ -1147,8 +1140,9 @@ static int try_to_wake_up(struct task_st * don't trigger a preemption if there are no idle cpus, * instead waiting for current to deschedule. 
*/ - if (!sync || (sync && suitable_idle_cpus(p))) - try_preempt(p); + if ((!sync || (sync && suitable_idle_cpus(p))) && + !task_running(rq,p)) + try_preempt(p, rq); success = 1; out_running: @@ -1216,8 +1210,9 @@ void sched_fork(struct task_struct *p, i memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) p->oncpu = 0; - +#endif #ifdef CONFIG_PREEMPT /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; @@ -1259,7 +1254,7 @@ void wake_up_new_task(struct task_struct unsigned long flags; struct rq *rq; - rq = time_task_grq_lock(p, &flags); ; + rq = task_grq_lock(p, &flags); ; parent = p->parent; BUG_ON(p->state != TASK_RUNNING); set_task_cpu(p, task_cpu(parent)); @@ -1274,7 +1269,7 @@ void wake_up_new_task(struct task_struct */ resched_task(parent); } else - try_preempt(p); + try_preempt(p, rq); task_grq_unlock(&flags); } @@ -2039,10 +2034,8 @@ static void task_running_tick(struct rq /* p->time_slice <= 0. We only modify task_struct under grq lock */ grq_lock(); p = rq->curr; - if (likely(task_running(p))) { - requeue_task(p); - set_tsk_need_resched(p); - } + requeue_task(p); + set_tsk_need_resched(p); grq_unlock(); } @@ -2235,7 +2228,7 @@ out: /* * Print scheduling while atomic bug: - */ +*/ static noinline void __schedule_bug(struct task_struct *prev) { struct pt_regs *regs = get_irq_regs(); @@ -2289,16 +2282,19 @@ static inline void set_rq_task(struct rq /* * schedule() is the main scheduler function. 
*/ -asmlinkage void __sched __schedule(void) +asmlinkage void __sched schedule(void) { struct task_struct *prev, *next, *idle; + unsigned long *switch_count; int deactivate = 0, cpu; - long *switch_count; struct rq *rq; u64 now; +need_resched: + preempt_disable(); cpu = smp_processor_id(); - rq = this_rq(); + rq = cpu_rq(cpu); + idle = rq->idle; rcu_qsctr_inc(cpu); prev = rq->curr; switch_count = &prev->nivcsw; @@ -2307,21 +2303,13 @@ asmlinkage void __sched __schedule(void) need_resched_nonpreemptible: schedule_debug(prev); - idle = rq->idle; - /* - * The idle thread is not allowed to schedule! - * Remove this check after it has been exercised a bit. - */ - if (unlikely(prev == idle) && prev->state != TASK_RUNNING) { - printk(KERN_ERR "bad: scheduling from the idle thread!\n"); - dump_stack(); - } - grq_lock_irq(); + local_irq_disable(); update_rq_clock(rq); now = rq->clock; update_cpu_clock(rq, prev, 0); + grq_lock(); clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { @@ -2362,12 +2350,11 @@ need_resched_nonpreemptible: prev->timestamp = prev->last_ran = now; if (likely(prev != next)) { - set_rq_task(rq, next); - sched_info_switch(prev, next); + perf_counter_task_sched_out(prev, next, cpu); + + set_rq_task(rq, next); grq.nr_switches++; - next->oncpu = 1; - prev->oncpu = 0; rq->curr = next; ++*switch_count; @@ -2383,15 +2370,9 @@ need_resched_nonpreemptible: if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; -} -asmlinkage void __sched schedule(void) -{ -need_resched: - preempt_disable(); - __schedule(); preempt_enable_no_resched(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + if (need_resched()) goto need_resched; } EXPORT_SYMBOL(schedule); @@ -2869,15 +2850,15 @@ void rt_mutex_setprio(struct task_struct rq = time_task_grq_lock(p, &flags); oldprio = p->prio; - queued = task_queued_only(p); + queued = task_queued_only(p, rq); if (queued) dequeue_task(p); p->prio = prio; - if 
(task_running(p) && prio > oldprio) + if (task_running(rq, p) && prio > oldprio) resched_task(p); if (queued) { enqueue_task(p); - try_preempt(p); + try_preempt(p, rq); } task_grq_unlock(&flags); @@ -2919,7 +2900,7 @@ void set_user_nice(struct task_struct *p p->static_prio = new_static; goto out_unlock; } - queued = task_queued_only(p); + queued = task_queued_only(p, rq); /* * If p is actually running, we don't need to do anything when * changing the priority because the grq is unaffected. @@ -2933,11 +2914,11 @@ void set_user_nice(struct task_struct *p if (queued) { enqueue_task(p); - try_preempt(p); + try_preempt(p, rq); } /* Just resched the task, schedule() will know what to do. */ - if (task_running(p)) + if (task_running(rq, p)) resched_task(p); out_unlock: task_grq_unlock(&flags); @@ -3062,9 +3043,10 @@ static inline struct task_struct *find_p } /* Actually do priority change: must hold grq lock. */ -static void __setscheduler(struct task_struct *p, int policy, int prio) +static void +__setscheduler(struct task_struct *p, struct rq *rq, int policy, int prio) { - BUG_ON(task_queued_only(p)); + BUG_ON(task_queued_only(p, rq)); p->policy = policy; p->rt_priority = prio; @@ -3075,7 +3057,7 @@ static void __setscheduler(struct task_s * Reschedule if running. schedule() will know if it can continue * running or not. 
*/ - if (task_running(p)) + if (task_running(rq, p)) resched_task(p); } @@ -3207,14 +3189,14 @@ recheck: goto recheck; } update_rq_clock(rq); - queued = task_queued_only(p); + queued = task_queued_only(p, rq); if (queued) dequeue_task(p); oldprio = p->prio; - __setscheduler(p, policy, param->sched_priority); + __setscheduler(p, rq, policy, param->sched_priority); if (queued) { enqueue_task(p); - try_preempt(p); + try_preempt(p, rq); } __task_grq_unlock(); spin_unlock_irqrestore(&p->pi_lock, flags); @@ -3855,11 +3837,13 @@ void __cpuinit init_idle(struct task_str idle->cpus_allowed = cpumask_of_cpu(cpu); set_task_cpu(idle, cpu); rq->curr = rq->idle = idle; - idle->oncpu = 1; set_cpuidle_map(cpu); #ifdef CONFIG_HOTPLUG_CPU idle->unplugged_mask = CPU_MASK_NONE; #endif +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + idle->oncpu = 1; +#endif grq_unlock_irqrestore(&flags); /* Set the preempt count _outside_ the spinlocks! */ @@ -4039,7 +4023,7 @@ int set_cpus_allowed_ptr(struct task_str goto out; } - queued = task_queued_only(p); + queued = task_queued_only(p, rq); cpumask_copy(&p->cpus_allowed, new_mask); p->rt_nr_cpus_allowed = cpumask_weight(new_mask); @@ -4048,7 +4032,7 @@ int set_cpus_allowed_ptr(struct task_str if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - if (task_running(p)) { + if (task_running(rq, p)) { /* Task is running on the wrong cpu now, reschedule it. 
*/ set_tsk_need_resched(p); running_wrong = 1; @@ -4057,7 +4041,7 @@ int set_cpus_allowed_ptr(struct task_str out: if (queued) - try_preempt(p); + try_preempt(p, rq); task_grq_unlock(&flags); if (running_wrong) @@ -4088,7 +4072,7 @@ void sched_idle_next(void) */ time_grq_lock(rq, &flags); - __setscheduler(idle, SCHED_FIFO, MAX_RT_PRIO - 1); + __setscheduler(idle, rq, SCHED_FIFO, MAX_RT_PRIO - 1); activate_idle_task(idle); set_tsk_need_resched(rq->curr); @@ -4413,7 +4397,7 @@ migration_call(struct notifier_block *nf remove_cpu(cpu); return_task(idle, 1); idle->static_prio = MAX_PRIO; - __setscheduler(idle, SCHED_NORMAL, 0); + __setscheduler(idle, rq, SCHED_NORMAL, 0); idle->prio = PRIO_LIMIT; set_rq_task(rq, idle); update_rq_clock(rq); @@ -5904,7 +5888,7 @@ int in_sched_functions(unsigned long add && addr < (unsigned long)__sched_text_end); } -void __init sched_init(void) +void sched_init(void) { int i; int highest_cpu = 0; @@ -6026,13 +6010,13 @@ void normalize_rt_tasks(void) rq = __task_grq_lock(p); update_rq_clock(rq); - queued = task_queued_only(p); + queued = task_queued_only(p, rq); if (queued) dequeue_task(p); - __setscheduler(p, SCHED_NORMAL, 0); + __setscheduler(p, rq, SCHED_NORMAL, 0); if (queued) { enqueue_task(p); - try_preempt(p); + try_preempt(p, rq); } __task_grq_unlock();