Change the task_running check to match mainline's; there might be subtle races there. Change wait_task_inactive to only check under the lock. Slight locking optimisation in schedule(). Get rid of the scheduling-from-idle-task check. Only update the rq clock when the task will be activated in ttwu (performance gain). Add missing perf counter calls. Don't try_preempt in ttwu when the task is already running. old_state does nothing in ttwu; remove it, but cast p->state to the same type. We know this_rq on entry to try_preempt, so pass it over instead of re-evaluating it. In try_preempt we *might* have been checking a cpu the task couldn't schedule on, forcing an inappropriate reschedule. sched_init oopses on high cpu numbers on kvm due to reaping of memory, so make it not __init because I can't be arsed putting in kzalloc calls. Inline a few one-liners and single-call-site functions. Add rq_last_ran to locally store the last_ran value on the runqueue, allowing safe update without taking the grq lock. The task is always task_running in task_running_tick, so there is no need to test for it. Change switch_count to unsigned long to match mainline, along with other __schedule() changes to match. Export the __schedule symbol (even though it will go away in the future) just for completeness. Call set_rq_task from init_idle to copy all the rq_ fields. Indentation changes. 
--- kernel/sched_bfs.c | 247 ++++++++++++++++++++++++++--------------------------- 1 file changed, 123 insertions(+), 124 deletions(-) Index: linux-2.6.31-bfs/kernel/sched_bfs.c =================================================================== --- linux-2.6.31-bfs.orig/kernel/sched_bfs.c 2009-09-27 09:37:02.525762018 +1000 +++ linux-2.6.31-bfs/kernel/sched_bfs.c 2009-09-27 10:03:11.372886332 +1000 @@ -190,6 +190,7 @@ struct rq { unsigned long rq_deadline; unsigned int rq_policy; int rq_time_slice; + u64 rq_last_ran; int rq_prio; /* Accurate timekeeping data */ @@ -313,11 +314,6 @@ inline void update_rq_clock(struct rq *r rq->clock = sched_clock_cpu(cpu_of(rq)); } -static inline int task_running(struct task_struct *p) -{ - return (!!p->oncpu); -} - static inline void grq_lock(void) __acquires(grq.lock) { @@ -417,12 +413,12 @@ static inline void task_grq_unlock(unsig * This interface allows printk to be called with the runqueue lock * held and know whether or not it is OK to wake up the klogd. 
*/ -int grunqueue_is_locked(void) +inline int grunqueue_is_locked(void) { return spin_is_locked(&grq.lock); } -void grq_unlock_wait(void) +inline void grq_unlock_wait(void) __releases(grq.lock) { smp_mb(); /* spin-unlock-wait is not a full memory barrier */ @@ -449,7 +445,17 @@ static inline void __task_grq_unlock(voi grq_unlock(); } +static inline int task_current(struct rq *rq, struct task_struct *p) +{ + return rq->curr == p; +} + #ifndef __ARCH_WANT_UNLOCKED_CTXSW +static inline int task_running(struct rq *rq, struct task_struct *p) +{ + return task_current(rq, p); +} + static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { } @@ -471,9 +477,20 @@ static inline void finish_lock_switch(st } #else /* __ARCH_WANT_UNLOCKED_CTXSW */ +static inline int task_running(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SMP + return p->oncpu; +#else + return task_current(rq, p); +#endif +} static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { +#ifdef CONFIG_SMP + next->oncpu = 1; +#endif #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW grq_unlock_irq(); #else @@ -483,7 +500,15 @@ static inline void prepare_lock_switch(s static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { +#ifdef CONFIG_SMP + /* + * After ->oncpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. 
+ */ smp_wmb(); + prev->oncpu = 0; +#endif #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW local_irq_enable(); #endif @@ -504,9 +529,9 @@ static inline int task_queued(struct tas return (!list_empty(&p->run_list)); } -static inline int task_queued_only(struct task_struct *p) +static inline int task_queued_only(struct task_struct *p, struct rq *rq) { - return (!list_empty(&p->run_list) && !task_running(p)); + return (!list_empty(&p->run_list) && !task_running(rq, p)); } /* @@ -646,7 +671,7 @@ static inline void clear_cpuidle_map(uns } /* Always called from a busy cpu on UP */ -static int suitable_idle_cpus(struct task_struct *p) +static inline int suitable_idle_cpus(struct task_struct *p) { return 0; } @@ -702,7 +727,10 @@ static int effective_prio(struct task_st */ static void activate_task(struct task_struct *p, struct rq *rq) { - u64 now = rq->clock; + u64 now; + + update_rq_clock(rq); + now = rq->clock; /* * Sleep time is in units of nanosecs, so shift by 20 to get a @@ -739,6 +767,8 @@ static inline void deactivate_task(struc void set_task_cpu(struct task_struct *p, unsigned int cpu) { trace_sched_migrate_task(p, cpu); + perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, + 1, 1, NULL, 0); /* * After ->cpu is set up to a new value, task_grq_lock(p, ...) can be * successfuly executed on another CPU. We must ensure that updates of @@ -861,7 +891,7 @@ void wait_task_context_switch(struct tas * iteration. */ rq = task_grq_lock(p, &flags); - running = task_running(p); + running = task_running(rq, p); task_grq_unlock(&flags); if (likely(!running)) @@ -904,39 +934,9 @@ unsigned long wait_task_inactive(struct struct rq *rq; for (;;) { - /* - * We do the initial early heuristics without holding - * any task-queue locks at all. We'll only try to get - * the runqueue lock when things look like they will - * work out! - */ - rq = task_rq(p); - - /* - * If the task is actively running on another CPU - * still, just relax and busy-wait without holding - * any locks. - * - * NOTE! 
Since we don't hold any locks, it's not - * even sure that "rq" stays as the right runqueue! - * But we don't care, since this will - * return false if the runqueue has changed and p - * is actually now running somewhere else! - */ - while (task_running(p) && p == rq->curr) { - if (match_state && unlikely(p->state != match_state)) - return 0; - cpu_relax(); - } - - /* - * Ok, time to look more closely! We need the grq - * lock now, to be *sure*. If we're wrong, we'll - * just go back and repeat. - */ rq = task_grq_lock(p, &flags); trace_sched_wait_task(rq, p); - running = task_running(p); + running = task_running(rq, p); on_rq = task_queued(p); ncsw = 0; if (!match_state || p->state == match_state) @@ -1026,35 +1026,35 @@ static inline int task_preempts_curr(str if (p->prio < rq->rq_prio) preempts = 1; else if (p->policy == SCHED_NORMAL && (p->prio == rq->rq_prio && - time_before(p->deadline, rq->rq_deadline))) + time_before(p->deadline, rq->rq_deadline)) && + time_after(p->deadline, jiffies)) preempts = 1; + return preempts; } /* * Wake up *any* suitable cpu to schedule this task. */ -static void try_preempt(struct task_struct *p) +static void try_preempt(struct task_struct *p, struct rq *this_rq) { - struct rq *highest_prio_rq, *this_rq; unsigned long latest_deadline, cpu; + struct rq *highest_prio_rq; int highest_prio; cpumask_t tmp; - /* Try the task's previous rq first and as a fallback */ - this_rq = task_rq(p); + cpus_and(tmp, cpu_online_map, p->cpus_allowed); - if (cpu_isset(this_rq->cpu, p->cpus_allowed)) { + /* Use this_rq as fallback */ + if (likely(cpu_isset(this_rq->cpu, tmp))) { highest_prio_rq = this_rq; /* If this_rq is idle, use that. 
*/ - if (rq_idle(this_rq)) + if (rq_idle(highest_prio_rq)) goto found_rq; } else - highest_prio_rq = cpu_rq(any_online_cpu(p->cpus_allowed)); - latest_deadline = this_rq->rq_deadline; - highest_prio = this_rq->rq_prio; - - cpus_and(tmp, cpu_online_map, p->cpus_allowed); + highest_prio_rq = cpu_rq(any_online_cpu(tmp)); + latest_deadline = highest_prio_rq->rq_deadline; + highest_prio = highest_prio_rq->rq_prio; for_each_cpu_mask(cpu, tmp) { struct rq *rq; @@ -1069,12 +1069,11 @@ static void try_preempt(struct task_stru } rq_prio = rq->rq_prio; - if (rq_prio > highest_prio || - (rq_prio == highest_prio && - time_after(rq->rq_deadline, latest_deadline))) { - highest_prio = rq_prio; - latest_deadline = rq->rq_deadline; - highest_prio_rq = rq; + if (rq_prio > highest_prio || (rq_prio == highest_prio && + time_after(rq->rq_deadline, latest_deadline))) { + highest_prio = rq_prio; + latest_deadline = rq->rq_deadline; + highest_prio_rq = rq; } } @@ -1124,18 +1123,22 @@ static int try_to_wake_up(struct task_st { unsigned long flags; int success = 0; - long old_state; struct rq *rq; - rq = time_task_grq_lock(p, &flags); - old_state = p->state; - if (!(old_state & state)) + /* + * No need to do time_lock_grq as we only need to update the rq clock + * if we activate the task + */ + rq = task_grq_lock(p, &flags); + + /* state is a volatile long (why, I don't know); negate the whole mask test */ + if (!((unsigned int)p->state & state)) goto out_unlock; /* * Note this catches tasks that are running and queued, but returns * false during the context switch when they're running and no - * longer queued. + * longer queued (rare). */ if (task_queued(p)) goto out_running; @@ -1147,8 +1150,9 @@ static int try_to_wake_up(struct task_st * don't trigger a preemption if there are no idle cpus, * instead waiting for current to deschedule. 
*/ - if (!sync || (sync && suitable_idle_cpus(p))) - try_preempt(p); + if ((!sync || (sync && suitable_idle_cpus(p))) && + !task_running(rq,p)) + try_preempt(p, rq); success = 1; out_running: @@ -1216,8 +1220,9 @@ void sched_fork(struct task_struct *p, i memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) p->oncpu = 0; - +#endif #ifdef CONFIG_PREEMPT /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; @@ -1259,7 +1264,7 @@ void wake_up_new_task(struct task_struct unsigned long flags; struct rq *rq; - rq = time_task_grq_lock(p, &flags); ; + rq = task_grq_lock(p, &flags); ; parent = p->parent; BUG_ON(p->state != TASK_RUNNING); set_task_cpu(p, task_cpu(parent)); @@ -1274,7 +1279,7 @@ void wake_up_new_task(struct task_struct */ resched_task(parent); } else - try_preempt(p); + try_preempt(p, rq); task_grq_unlock(&flags); } @@ -1718,7 +1723,7 @@ static void pc_user_time(struct rq *rq, static void update_cpu_clock(struct rq *rq, struct task_struct *p, int tick) { - long time_diff = rq->clock - p->last_ran; + long time_diff = rq->clock - rq->rq_last_ran; long account_ns = rq->clock - rq->timekeep_clock; struct task_struct *idle = rq->idle; unsigned long account_pc; @@ -1760,7 +1765,7 @@ update_cpu_clock(struct rq *rq, struct t /* time_slice accounting is done in usecs to avoid overflow on 32bit */ if (rq->rq_policy != SCHED_FIFO && p != idle) rq->rq_time_slice -= time_diff / 1000; - p->last_ran = rq->timekeep_clock = rq->clock; + rq->rq_last_ran = rq->timekeep_clock = rq->clock; } /* @@ -1775,7 +1780,7 @@ static u64 do_task_delta_exec(struct tas if (p == rq->curr) { update_rq_clock(rq); - ns = rq->clock - p->last_ran; + ns = rq->clock - rq->rq_last_ran; if ((s64)ns < 0) ns = 0; } @@ -2039,10 +2044,8 @@ static void task_running_tick(struct rq /* p->time_slice <= 0. 
We only modify task_struct under grq lock */ grq_lock(); p = rq->curr; - if (likely(task_running(p))) { - requeue_task(p); - set_tsk_need_resched(p); - } + requeue_task(p); + set_tsk_need_resched(p); grq_unlock(); } @@ -2065,6 +2068,7 @@ void scheduler_tick(void) task_running_tick(rq); else no_iso_tick(); + perf_counter_task_tick(rq->curr, cpu); } notrace unsigned long get_parent_ip(unsigned long addr) @@ -2235,7 +2239,7 @@ out: /* * Print scheduling while atomic bug: - */ +*/ static noinline void __schedule_bug(struct task_struct *prev) { struct pt_regs *regs = get_irq_regs(); @@ -2282,6 +2286,7 @@ static inline void set_rq_task(struct rq { rq->rq_time_slice = p->time_slice; rq->rq_deadline = p->deadline; + rq->rq_last_ran = p->last_ran; rq->rq_policy = p->policy; rq->rq_prio = p->prio; } @@ -2292,13 +2297,14 @@ static inline void set_rq_task(struct rq asmlinkage void __sched __schedule(void) { struct task_struct *prev, *next, *idle; + unsigned long *switch_count; int deactivate = 0, cpu; - long *switch_count; struct rq *rq; u64 now; cpu = smp_processor_id(); - rq = this_rq(); + rq = cpu_rq(cpu); + idle = rq->idle; rcu_qsctr_inc(cpu); prev = rq->curr; switch_count = &prev->nivcsw; @@ -2307,21 +2313,13 @@ asmlinkage void __sched __schedule(void) need_resched_nonpreemptible: schedule_debug(prev); - idle = rq->idle; - /* - * The idle thread is not allowed to schedule! - * Remove this check after it has been exercised a bit. 
- */ - if (unlikely(prev == idle) && prev->state != TASK_RUNNING) { - printk(KERN_ERR "bad: scheduling from the idle thread!\n"); - dump_stack(); - } - grq_lock_irq(); + local_irq_disable(); update_rq_clock(rq); now = rq->clock; update_cpu_clock(rq, prev, 0); + grq_lock(); clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { @@ -2362,12 +2360,11 @@ need_resched_nonpreemptible: prev->timestamp = prev->last_ran = now; if (likely(prev != next)) { - set_rq_task(rq, next); - sched_info_switch(prev, next); + perf_counter_task_sched_out(prev, next, cpu); + + set_rq_task(rq, next); grq.nr_switches++; - next->oncpu = 1; - prev->oncpu = 0; rq->curr = next; ++*switch_count; @@ -2384,6 +2381,7 @@ need_resched_nonpreemptible: if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; } +EXPORT_SYMBOL(__schedule); asmlinkage void __sched schedule(void) { @@ -2391,8 +2389,8 @@ need_resched: preempt_disable(); __schedule(); preempt_enable_no_resched(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) - goto need_resched; + if (need_resched()) + goto need_resched; } EXPORT_SYMBOL(schedule); @@ -2869,15 +2867,15 @@ void rt_mutex_setprio(struct task_struct rq = time_task_grq_lock(p, &flags); oldprio = p->prio; - queued = task_queued_only(p); + queued = task_queued_only(p, rq); if (queued) dequeue_task(p); p->prio = prio; - if (task_running(p) && prio > oldprio) + if (task_running(rq, p) && prio > oldprio) resched_task(p); if (queued) { enqueue_task(p); - try_preempt(p); + try_preempt(p, rq); } task_grq_unlock(&flags); @@ -2889,7 +2887,7 @@ void rt_mutex_setprio(struct task_struct * Adjust the deadline for when the priority is to change, before it's * changed. 
*/ -static void adjust_deadline(struct task_struct *p, int new_prio) +static inline void adjust_deadline(struct task_struct *p, int new_prio) { p->deadline += (prio_ratios[USER_PRIO(new_prio)] - pratio(p)) * rr_interval * HZ / 1000 / 100; @@ -2919,7 +2917,7 @@ void set_user_nice(struct task_struct *p p->static_prio = new_static; goto out_unlock; } - queued = task_queued_only(p); + queued = task_queued_only(p, rq); /* * If p is actually running, we don't need to do anything when * changing the priority because the grq is unaffected. @@ -2933,11 +2931,11 @@ void set_user_nice(struct task_struct *p if (queued) { enqueue_task(p); - try_preempt(p); + try_preempt(p, rq); } /* Just resched the task, schedule() will know what to do. */ - if (task_running(p)) + if (task_running(rq, p)) resched_task(p); out_unlock: task_grq_unlock(&flags); @@ -3062,9 +3060,10 @@ static inline struct task_struct *find_p } /* Actually do priority change: must hold grq lock. */ -static void __setscheduler(struct task_struct *p, int policy, int prio) +static void +__setscheduler(struct task_struct *p, struct rq *rq, int policy, int prio) { - BUG_ON(task_queued_only(p)); + BUG_ON(task_queued_only(p, rq)); p->policy = policy; p->rt_priority = prio; @@ -3075,7 +3074,7 @@ static void __setscheduler(struct task_s * Reschedule if running. schedule() will know if it can continue * running or not. 
*/ - if (task_running(p)) + if (task_running(rq, p)) resched_task(p); } @@ -3207,14 +3206,14 @@ recheck: goto recheck; } update_rq_clock(rq); - queued = task_queued_only(p); + queued = task_queued_only(p, rq); if (queued) dequeue_task(p); oldprio = p->prio; - __setscheduler(p, policy, param->sched_priority); + __setscheduler(p, rq, policy, param->sched_priority); if (queued) { enqueue_task(p); - try_preempt(p); + try_preempt(p, rq); } __task_grq_unlock(); spin_unlock_irqrestore(&p->pi_lock, flags); @@ -3848,18 +3847,18 @@ void __cpuinit init_idle(struct task_str idle->timestamp = idle->last_ran = rq->clock; idle->state = TASK_RUNNING; /* Setting prio to illegal value shouldn't matter when never queued */ - idle->prio = rq->rq_prio = PRIO_LIMIT; - rq->rq_deadline = idle->deadline; - rq->rq_policy = idle->policy; - rq->rq_time_slice = idle->time_slice; + idle->prio = PRIO_LIMIT; + set_rq_task(rq, idle); idle->cpus_allowed = cpumask_of_cpu(cpu); set_task_cpu(idle, cpu); rq->curr = rq->idle = idle; - idle->oncpu = 1; set_cpuidle_map(cpu); #ifdef CONFIG_HOTPLUG_CPU idle->unplugged_mask = CPU_MASK_NONE; #endif +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + idle->oncpu = 1; +#endif grq_unlock_irqrestore(&flags); /* Set the preempt count _outside_ the spinlocks! */ @@ -4039,7 +4038,7 @@ int set_cpus_allowed_ptr(struct task_str goto out; } - queued = task_queued_only(p); + queued = task_queued_only(p, rq); cpumask_copy(&p->cpus_allowed, new_mask); p->rt_nr_cpus_allowed = cpumask_weight(new_mask); @@ -4048,7 +4047,7 @@ int set_cpus_allowed_ptr(struct task_str if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - if (task_running(p)) { + if (task_running(rq, p)) { /* Task is running on the wrong cpu now, reschedule it. 
*/ set_tsk_need_resched(p); running_wrong = 1; @@ -4057,7 +4056,7 @@ int set_cpus_allowed_ptr(struct task_str out: if (queued) - try_preempt(p); + try_preempt(p, rq); task_grq_unlock(&flags); if (running_wrong) @@ -4088,7 +4087,7 @@ void sched_idle_next(void) */ time_grq_lock(rq, &flags); - __setscheduler(idle, SCHED_FIFO, MAX_RT_PRIO - 1); + __setscheduler(idle, rq, SCHED_FIFO, MAX_RT_PRIO - 1); activate_idle_task(idle); set_tsk_need_resched(rq->curr); @@ -4413,7 +4412,7 @@ migration_call(struct notifier_block *nf remove_cpu(cpu); return_task(idle, 1); idle->static_prio = MAX_PRIO; - __setscheduler(idle, SCHED_NORMAL, 0); + __setscheduler(idle, rq, SCHED_NORMAL, 0); idle->prio = PRIO_LIMIT; set_rq_task(rq, idle); update_rq_clock(rq); @@ -5904,7 +5903,7 @@ int in_sched_functions(unsigned long add && addr < (unsigned long)__sched_text_end); } -void __init sched_init(void) +void sched_init(void) { int i; int highest_cpu = 0; @@ -6026,13 +6025,13 @@ void normalize_rt_tasks(void) rq = __task_grq_lock(p); update_rq_clock(rq); - queued = task_queued_only(p); + queued = task_queued_only(p, rq); if (queued) dequeue_task(p); - __setscheduler(p, SCHED_NORMAL, 0); + __setscheduler(p, rq, SCHED_NORMAL, 0); if (queued) { enqueue_task(p); - try_preempt(p); + try_preempt(p, rq); } __task_grq_unlock();