BIG BUG FIX - deactivate was not reset to 0 on the need_resched_nonpreemptible path. This could have led to all sorts of tasks disappearing and wedging. Change the task_queued check back to the simple one; the potential for a task being queued but not running was not real. Get rid of local queueing on the rq since it's no longer needed. Slight locking optimisation in schedule(). Get rid of the scheduling-from-idle-task check. Only update the rq clock when the task will be activated in ttwu (performance gain). Add missing perf counter calls. old_state does nothing in ttwu; remove it, but cast p->state to the same type. We know this_rq on entry to try_preempt, so pass it over instead of re-evaluating. In try_preempt we *might* have been checking a cpu the task couldn't schedule on, forcing an inappropriate reschedule. sched_init oopses on high cpu numbers on kvm due to reaping of memory, so make it not __init because I can't be arsed putting in kzalloc calls. Inline a few one-liners and single-call-site functions. Add rq_last_ran to locally store the last_ran value on the runqueue, allowing safe update without taking the grq lock. The task is always task_running in task_running_tick, so there is no need to test for it. Change switch_count to unsigned long to match mainline, along with other __schedule() changes to match. Inline __schedule into schedule as per mainline; __schedule is no longer used. Call set_rq_task from init_idle to copy all the rq_ fields. Indentation changes. Remove timestamp, load_weight and rt_nr_cpus_allowed, which are now redundant variables.
Fix UP build of 234 for 240 --- kernel/sched_bfs.c | 247 ++++++++++++++++++++++++++--------------------------- 1 file changed, 123 insertions(+), 124 deletions(-) Index: linux-2.6.31-bfs/kernel/sched_bfs.c =================================================================== --- linux-2.6.31-bfs.orig/kernel/sched_bfs.c 2009-09-28 11:27:32.314473654 +1000 +++ linux-2.6.31-bfs/kernel/sched_bfs.c 2009-09-28 12:36:58.278472258 +1000 @@ -184,12 +184,12 @@ struct rq { struct task_struct *curr, *idle; struct mm_struct *prev_mm; - struct list_head queue; /* Place to store currently running task */ /* Stored data about rq->curr to work outside grq lock */ unsigned long rq_deadline; unsigned int rq_policy; int rq_time_slice; + u64 rq_last_ran; int rq_prio; /* Accurate timekeeping data */ @@ -417,12 +417,12 @@ static inline void task_grq_unlock(unsig * This interface allows printk to be called with the runqueue lock * held and know whether or not it is OK to wake up the klogd. */ -int grunqueue_is_locked(void) +inline int grunqueue_is_locked(void) { return spin_is_locked(&grq.lock); } -void grq_unlock_wait(void) +inline void grq_unlock_wait(void) __releases(grq.lock) { smp_mb(); /* spin-unlock-wait is not a full memory barrier */ @@ -504,11 +504,6 @@ static inline int task_queued(struct tas return (!list_empty(&p->run_list)); } -static inline int task_queued_only(struct task_struct *p) -{ - return (!list_empty(&p->run_list) && !task_running(p)); -} - /* * Removing from the global runqueue. Enter with grq locked. 
*/ @@ -646,7 +641,7 @@ static inline void clear_cpuidle_map(uns } /* Always called from a busy cpu on UP */ -static int suitable_idle_cpus(struct task_struct *p) +static inline int suitable_idle_cpus(struct task_struct *p) { return 0; } @@ -702,7 +697,10 @@ static int effective_prio(struct task_st */ static void activate_task(struct task_struct *p, struct rq *rq) { - u64 now = rq->clock; + u64 now; + + update_rq_clock(rq); + now = rq->clock; /* * Sleep time is in units of nanosecs, so shift by 20 to get a @@ -712,11 +710,10 @@ static void activate_task(struct task_st if (unlikely(prof_on == SLEEP_PROFILING)) { if (p->state == TASK_UNINTERRUPTIBLE) profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), - (now - p->timestamp) >> 20); + (now - p->last_ran) >> 20); } p->prio = effective_prio(p); - p->timestamp = now; if (task_contributes_to_load(p)) grq.nr_uninterruptible--; enqueue_task(p); @@ -739,6 +736,7 @@ static inline void deactivate_task(struc void set_task_cpu(struct task_struct *p, unsigned int cpu) { trace_sched_migrate_task(p, cpu); + perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); /* * After ->cpu is set up to a new value, task_grq_lock(p, ...) can be * successfuly executed on another CPU. We must ensure that updates of @@ -757,7 +755,6 @@ static inline void take_task(struct rq * { set_task_cpu(p, rq->cpu); dequeue_task(p); - list_add(&p->run_list, &rq->queue); dec_qnr(); } @@ -767,7 +764,6 @@ static inline void take_task(struct rq * */ static inline void return_task(struct task_struct *p, int deactivate) { - list_del_init(&p->run_list); if (deactivate) deactivate_task(p); else { @@ -1034,27 +1030,25 @@ static inline int task_preempts_curr(str /* * Wake up *any* suitable cpu to schedule this task. 
*/ -static void try_preempt(struct task_struct *p) +static void try_preempt(struct task_struct *p, struct rq *this_rq) { - struct rq *highest_prio_rq, *this_rq; unsigned long latest_deadline, cpu; + struct rq *highest_prio_rq; int highest_prio; cpumask_t tmp; - /* Try the task's previous rq first and as a fallback */ - this_rq = task_rq(p); + cpus_and(tmp, cpu_online_map, p->cpus_allowed); - if (cpu_isset(this_rq->cpu, p->cpus_allowed)) { + /* Use this_rq as fallback */ + if (likely(cpu_isset(this_rq->cpu, tmp))) { highest_prio_rq = this_rq; /* If this_rq is idle, use that. */ - if (rq_idle(this_rq)) + if (rq_idle(highest_prio_rq)) goto found_rq; } else - highest_prio_rq = cpu_rq(any_online_cpu(p->cpus_allowed)); - latest_deadline = this_rq->rq_deadline; - highest_prio = this_rq->rq_prio; - - cpus_and(tmp, cpu_online_map, p->cpus_allowed); + highest_prio_rq = cpu_rq(any_online_cpu(tmp)); + latest_deadline = highest_prio_rq->rq_deadline; + highest_prio = highest_prio_rq->rq_prio; for_each_cpu_mask(cpu, tmp) { struct rq *rq; @@ -1069,12 +1063,11 @@ static void try_preempt(struct task_stru } rq_prio = rq->rq_prio; - if (rq_prio > highest_prio || - (rq_prio == highest_prio && - time_after(rq->rq_deadline, latest_deadline))) { - highest_prio = rq_prio; - latest_deadline = rq->rq_deadline; - highest_prio_rq = rq; + if (rq_prio > highest_prio || (rq_prio == highest_prio && + time_after(rq->rq_deadline, latest_deadline))) { + highest_prio = rq_prio; + latest_deadline = rq->rq_deadline; + highest_prio_rq = rq; } } @@ -1124,20 +1117,19 @@ static int try_to_wake_up(struct task_st { unsigned long flags; int success = 0; - long old_state; struct rq *rq; - rq = time_task_grq_lock(p, &flags); - old_state = p->state; - if (!(old_state & state)) - goto out_unlock; - /* - * Note this catches tasks that are running and queued, but returns - * false during the context switch when they're running and no - * longer queued. 
+ * No need to do time_lock_grq as we only need to update the rq clock + * if we activate the task */ - if (task_queued(p)) + rq = task_grq_lock(p, &flags); + + /* state is a volatile long, どうして、分からない */ + if (!(unsigned int)p->state & state) + goto out_unlock; + + if (task_queued(p) || task_running(p)) goto out_running; activate_task(p, rq); @@ -1148,7 +1140,7 @@ static int try_to_wake_up(struct task_st * instead waiting for current to deschedule. */ if (!sync || (sync && suitable_idle_cpus(p))) - try_preempt(p); + try_preempt(p, rq); success = 1; out_running: @@ -1259,7 +1251,7 @@ void wake_up_new_task(struct task_struct unsigned long flags; struct rq *rq; - rq = time_task_grq_lock(p, &flags); ; + rq = task_grq_lock(p, &flags); ; parent = p->parent; BUG_ON(p->state != TASK_RUNNING); set_task_cpu(p, task_cpu(parent)); @@ -1274,7 +1266,7 @@ void wake_up_new_task(struct task_struct */ resched_task(parent); } else - try_preempt(p); + try_preempt(p, rq); task_grq_unlock(&flags); } @@ -1718,7 +1710,7 @@ static void pc_user_time(struct rq *rq, static void update_cpu_clock(struct rq *rq, struct task_struct *p, int tick) { - long time_diff = rq->clock - p->last_ran; + long time_diff = rq->clock - rq->rq_last_ran; long account_ns = rq->clock - rq->timekeep_clock; struct task_struct *idle = rq->idle; unsigned long account_pc; @@ -1760,7 +1752,7 @@ update_cpu_clock(struct rq *rq, struct t /* time_slice accounting is done in usecs to avoid overflow on 32bit */ if (rq->rq_policy != SCHED_FIFO && p != idle) rq->rq_time_slice -= time_diff / 1000; - p->last_ran = rq->timekeep_clock = rq->clock; + rq->rq_last_ran = rq->timekeep_clock = rq->clock; } /* @@ -1775,7 +1767,7 @@ static u64 do_task_delta_exec(struct tas if (p == rq->curr) { update_rq_clock(rq); - ns = rq->clock - p->last_ran; + ns = rq->clock - rq->rq_last_ran; if ((s64)ns < 0) ns = 0; } @@ -2039,10 +2031,8 @@ static void task_running_tick(struct rq /* p->time_slice <= 0. 
We only modify task_struct under grq lock */ grq_lock(); p = rq->curr; - if (likely(task_running(p))) { - requeue_task(p); - set_tsk_need_resched(p); - } + requeue_task(p); + set_tsk_need_resched(p); grq_unlock(); } @@ -2065,6 +2055,7 @@ void scheduler_tick(void) task_running_tick(rq); else no_iso_tick(); + perf_counter_task_tick(rq->curr, cpu); } notrace unsigned long get_parent_ip(unsigned long addr) @@ -2282,6 +2273,7 @@ static inline void set_rq_task(struct rq { rq->rq_time_slice = p->time_slice; rq->rq_deadline = p->deadline; + rq->rq_last_ran = p->last_ran; rq->rq_policy = p->policy; rq->rq_prio = p->prio; } @@ -2289,16 +2281,20 @@ static inline void set_rq_task(struct rq /* * schedule() is the main scheduler function. */ -asmlinkage void __sched __schedule(void) +asmlinkage void __sched schedule(void) { struct task_struct *prev, *next, *idle; - int deactivate = 0, cpu; - long *switch_count; + unsigned long *switch_count; + int deactivate, cpu; struct rq *rq; u64 now; +need_resched: + preempt_disable(); + cpu = smp_processor_id(); - rq = this_rq(); + rq = cpu_rq(cpu); + idle = rq->idle; rcu_qsctr_inc(cpu); prev = rq->curr; switch_count = &prev->nivcsw; @@ -2306,22 +2302,15 @@ asmlinkage void __sched __schedule(void) release_kernel_lock(prev); need_resched_nonpreemptible: + deactivate = 0; schedule_debug(prev); - idle = rq->idle; - /* - * The idle thread is not allowed to schedule! - * Remove this check after it has been exercised a bit. 
- */ - if (unlikely(prev == idle) && prev->state != TASK_RUNNING) { - printk(KERN_ERR "bad: scheduling from the idle thread!\n"); - dump_stack(); - } - grq_lock_irq(); + local_irq_disable(); update_rq_clock(rq); now = rq->clock; update_cpu_clock(rq, prev, 0); + grq_lock(); clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { @@ -2351,23 +2340,24 @@ need_resched_nonpreemptible: schedstat_inc(rq, sched_goidle); } + prefetch(next); + prefetch_stack(next); + if (task_idle(next)) set_cpuidle_map(cpu); else clear_cpuidle_map(cpu); - prefetch(next); - prefetch_stack(next); - - prev->timestamp = prev->last_ran = now; + prev->last_ran = now; if (likely(prev != next)) { - set_rq_task(rq, next); - sched_info_switch(prev, next); + perf_counter_task_sched_out(prev, next, cpu); + + set_rq_task(rq, next); grq.nr_switches++; - next->oncpu = 1; prev->oncpu = 0; + next->oncpu = 1; rq->curr = next; ++*switch_count; @@ -2378,21 +2368,15 @@ need_resched_nonpreemptible: */ cpu = smp_processor_id(); rq = cpu_rq(cpu); + idle = rq->idle; } else grq_unlock_irq(); if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; -} - -asmlinkage void __sched schedule(void) -{ -need_resched: - preempt_disable(); - __schedule(); preempt_enable_no_resched(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) - goto need_resched; + if (need_resched()) + goto need_resched; } EXPORT_SYMBOL(schedule); @@ -2869,7 +2853,7 @@ void rt_mutex_setprio(struct task_struct rq = time_task_grq_lock(p, &flags); oldprio = p->prio; - queued = task_queued_only(p); + queued = task_queued(p); if (queued) dequeue_task(p); p->prio = prio; @@ -2877,7 +2861,7 @@ void rt_mutex_setprio(struct task_struct resched_task(p); if (queued) { enqueue_task(p); - try_preempt(p); + try_preempt(p, rq); } task_grq_unlock(&flags); @@ -2889,7 +2873,7 @@ void rt_mutex_setprio(struct task_struct * Adjust the deadline for when the priority is to change, before it's * changed. 
*/ -static void adjust_deadline(struct task_struct *p, int new_prio) +static inline void adjust_deadline(struct task_struct *p, int new_prio) { p->deadline += (prio_ratios[USER_PRIO(new_prio)] - pratio(p)) * rr_interval * HZ / 1000 / 100; @@ -2919,7 +2903,7 @@ void set_user_nice(struct task_struct *p p->static_prio = new_static; goto out_unlock; } - queued = task_queued_only(p); + queued = task_queued(p); /* * If p is actually running, we don't need to do anything when * changing the priority because the grq is unaffected. @@ -2933,7 +2917,7 @@ void set_user_nice(struct task_struct *p if (queued) { enqueue_task(p); - try_preempt(p); + try_preempt(p, rq); } /* Just resched the task, schedule() will know what to do. */ @@ -3062,9 +3046,10 @@ static inline struct task_struct *find_p } /* Actually do priority change: must hold grq lock. */ -static void __setscheduler(struct task_struct *p, int policy, int prio) +static void +__setscheduler(struct task_struct *p, struct rq *rq, int policy, int prio) { - BUG_ON(task_queued_only(p)); + BUG_ON(task_queued(p)); p->policy = policy; p->rt_priority = prio; @@ -3207,14 +3192,14 @@ recheck: goto recheck; } update_rq_clock(rq); - queued = task_queued_only(p); + queued = task_queued(p); if (queued) dequeue_task(p); oldprio = p->prio; - __setscheduler(p, policy, param->sched_priority); + __setscheduler(p, rq, policy, param->sched_priority); if (queued) { enqueue_task(p); - try_preempt(p); + try_preempt(p, rq); } __task_grq_unlock(); spin_unlock_irqrestore(&p->pi_lock, flags); @@ -3845,13 +3830,11 @@ void __cpuinit init_idle(struct task_str unsigned long flags; time_grq_lock(rq, &flags); - idle->timestamp = idle->last_ran = rq->clock; + idle->last_ran = rq->clock; idle->state = TASK_RUNNING; /* Setting prio to illegal value shouldn't matter when never queued */ - idle->prio = rq->rq_prio = PRIO_LIMIT; - rq->rq_deadline = idle->deadline; - rq->rq_policy = idle->policy; - rq->rq_time_slice = idle->time_slice; + idle->prio = 
PRIO_LIMIT; + set_rq_task(rq, idle); idle->cpus_allowed = cpumask_of_cpu(cpu); set_task_cpu(idle, cpu); rq->curr = rq->idle = idle; @@ -4039,10 +4022,9 @@ int set_cpus_allowed_ptr(struct task_str goto out; } - queued = task_queued_only(p); + queued = task_queued(p); cpumask_copy(&p->cpus_allowed, new_mask); - p->rt_nr_cpus_allowed = cpumask_weight(new_mask); /* Can the task run on the task's current CPU? If so, we're done */ if (cpumask_test_cpu(task_cpu(p), new_mask)) @@ -4057,7 +4039,7 @@ int set_cpus_allowed_ptr(struct task_str out: if (queued) - try_preempt(p); + try_preempt(p, rq); task_grq_unlock(&flags); if (running_wrong) @@ -4088,7 +4070,7 @@ void sched_idle_next(void) */ time_grq_lock(rq, &flags); - __setscheduler(idle, SCHED_FIFO, MAX_RT_PRIO - 1); + __setscheduler(idle, rq, SCHED_FIFO, MAX_RT_PRIO - 1); activate_idle_task(idle); set_tsk_need_resched(rq->curr); @@ -4318,8 +4300,6 @@ static void remove_cpu(unsigned long cpu if (cpus_empty(cpus_remaining)) { cpumask_copy(&p->unplugged_mask, &p->cpus_allowed); cpumask_copy(&p->cpus_allowed, &cpu_possible_map); - p->rt_nr_cpus_allowed = - cpumask_weight(&cpu_possible_map); } } while_each_thread(t, p); @@ -4352,8 +4332,6 @@ static void add_cpu(unsigned long cpu) * set all the cpus back. 
*/ cpumask_copy(&p->cpus_allowed, &p->unplugged_mask); - p->rt_nr_cpus_allowed = - cpumask_weight(&p->cpus_allowed); cpus_clear(p->unplugged_mask); } } @@ -4413,7 +4391,7 @@ migration_call(struct notifier_block *nf remove_cpu(cpu); return_task(idle, 1); idle->static_prio = MAX_PRIO; - __setscheduler(idle, SCHED_NORMAL, 0); + __setscheduler(idle, rq, SCHED_NORMAL, 0); idle->prio = PRIO_LIMIT; set_rq_task(rq, idle); update_rq_clock(rq); @@ -5904,7 +5882,7 @@ int in_sched_functions(unsigned long add && addr < (unsigned long)__sched_text_end); } -void __init sched_init(void) +void sched_init(void) { int i; int highest_cpu = 0; @@ -5913,16 +5891,16 @@ void __init sched_init(void) for (i = 1 ; i < PRIO_RANGE ; i++) prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; + spin_lock_init(&grq.lock); #ifdef CONFIG_SMP init_defrootdomain(); cpus_clear(grq.cpu_idle_map); + grq.qnr = 0; #endif - spin_lock_init(&grq.lock); for_each_possible_cpu(i) { struct rq *rq; rq = cpu_rq(i); - INIT_LIST_HEAD(&rq->queue); rq->rq_deadline = 0; rq->rq_prio = 0; rq->cpu = i; @@ -6026,13 +6004,13 @@ void normalize_rt_tasks(void) rq = __task_grq_lock(p); update_rq_clock(rq); - queued = task_queued_only(p); + queued = task_queued(p); if (queued) dequeue_task(p); - __setscheduler(p, SCHED_NORMAL, 0); + __setscheduler(p, rq, SCHED_NORMAL, 0); if (queued) { enqueue_task(p); - try_preempt(p); + try_preempt(p, rq); } __task_grq_unlock(); Index: linux-2.6.31-bfs/include/linux/sched.h =================================================================== --- linux-2.6.31-bfs.orig/include/linux/sched.h 2009-09-28 11:27:32.302471533 +1000 +++ linux-2.6.31-bfs/include/linux/sched.h 2009-09-28 11:30:10.346471389 +1000 @@ -1027,16 +1027,14 @@ struct task_struct { int lock_depth; /* BKL lock depth */ int oncpu; - int load_weight; /* for niceness load balancing purposes */ int prio, static_prio, normal_prio; int time_slice, first_time_slice; unsigned long deadline; struct list_head run_list; unsigned int rt_priority; - 
unsigned long long timestamp, last_ran; + u64 last_ran; u64 sched_time; /* sched_clock time spent running */ - int rt_nr_cpus_allowed; unsigned long rt_timeout; #ifdef CONFIG_PREEMPT_NOTIFIERS Index: linux-2.6.31-bfs/kernel/fork.c =================================================================== --- linux-2.6.31-bfs.orig/kernel/fork.c 2009-09-28 11:27:32.308472067 +1000 +++ linux-2.6.31-bfs/kernel/fork.c 2009-09-28 11:30:10.347471744 +1000 @@ -1193,7 +1193,6 @@ static struct task_struct *copy_process( * parent's CPU). This avoids alot of nasty races. */ p->cpus_allowed = current->cpus_allowed; - p->rt_nr_cpus_allowed = current->rt_nr_cpus_allowed; if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || !cpu_online(task_cpu(p)))) set_task_cpu(p, smp_processor_id()); Index: linux-2.6.31-bfs/kernel/kthread.c =================================================================== --- linux-2.6.31-bfs.orig/kernel/kthread.c 2009-09-28 11:27:32.321471057 +1000 +++ linux-2.6.31-bfs/kernel/kthread.c 2009-09-28 11:30:10.384472487 +1000 @@ -170,7 +170,6 @@ void kthread_bind(struct task_struct *k, } set_task_cpu(k, cpu); k->cpus_allowed = cpumask_of_cpu(cpu); - k->rt_nr_cpus_allowed = 1; k->flags |= PF_THREAD_BOUND; } EXPORT_SYMBOL(kthread_bind);