Looks like I was right about it being a flaky way to change cpus. Change set_cpus_allowed_ptr() to only conditionally reschedule. Also, waking up random idle runqueues meant they were racing to try and grab the (wrong) task. Stop waking up idle runqueues merely because queued-not-running (qnr) tasks exist. This means we need to make sure that if a task is being descheduled with affinity for only one cpu, that cpu is woken from idle. There is an incredibly rare scenario where the running task is being descheduled and the only cpu it can be scheduled to is idle, so check for that in schedule(). Carefully reintroduce minimal grq lock tweaks so as to make sure to only ever look up task_rq() under the grq lock. Suspend/resume fixes yay \o/. When the idle task was being descheduled after the cpu was disabled, it was still "queued", so fix that. Also, the runqueue-local task data (rq_*) wasn't being updated. The rq_policy field wasn't being updated on changing tasks. When setting the idle bit in cpu_idle_map, only set it when the idle task is not queued as a FIFO task for the purpose of disabling a cpu. Remove a redundant resched_task() call in normalize_rt_tasks().
--- kernel/sched_bfs.c | 170 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 106 insertions(+), 64 deletions(-) Index: linux-2.6.31-bfs/kernel/sched_bfs.c =================================================================== --- linux-2.6.31-bfs.orig/kernel/sched_bfs.c 2009-09-22 10:14:09.611476564 +1000 +++ linux-2.6.31-bfs/kernel/sched_bfs.c 2009-09-22 11:53:34.756228973 +1000 @@ -354,8 +354,8 @@ static inline void grq_unlock_irq(void) static inline void grq_lock_irqsave(unsigned long *flags) __acquires(grq.lock) { - smp_mb(); - spin_lock_irqsave(&grq.lock, *flags); + local_irq_save(*flags); + grq_lock(); } static inline void grq_unlock_irqrestore(unsigned long *flags) @@ -381,6 +381,29 @@ static inline struct rq return rq; } +static inline struct rq +*task_grq_lock_irq(struct task_struct *p) + __acquires(grq.lock) +{ + grq_lock_irq(); + return task_rq(p); +} + +static inline void +time_task_grq_lock_irq(struct task_struct *p) + __acquires(grq.lock) +{ + struct rq *rq = task_grq_lock_irq(p); + update_rq_clock(rq); +} + +static inline void +task_grq_unlock_irq(void) + __releases(grq.lock) +{ + grq_unlock_irq(); +} + static inline void task_grq_unlock(unsigned long *flags) __releases(grq.lock) { @@ -409,8 +432,8 @@ void grq_unlock_wait(void) static inline void time_grq_lock(struct rq *rq, unsigned long *flags) __acquires(grq.lock) { - spin_lock_irqsave(&grq.lock, *flags); - update_rq_clock(rq); + local_irq_save(*flags); + time_lock_grq(rq); } static inline struct rq *__task_grq_lock(struct task_struct *p) @@ -574,7 +597,23 @@ static inline int queued_notrunning(void { return grq.qnr; } -#else + +static inline void set_cpuidle_map(unsigned long cpu) +{ + cpu_set(cpu, grq.cpu_idle_map); +} + +static inline void clear_cpuidle_map(unsigned long cpu) +{ + cpu_clear(cpu, grq.cpu_idle_map); +} + +static int suitable_idle_cpus(struct task_struct *p) +{ + return (cpus_intersects(p->cpus_allowed, grq.cpu_idle_map)); +} + +#else /* CONFIG_SMP */ static 
inline void inc_qnr(void) { } @@ -587,7 +626,21 @@ static inline int queued_notrunning(void { return grq.nr_running; } -#endif + +static inline void set_cpuidle_map(unsigned long cpu) +{ +} + +static inline void clear_cpuidle_map(unsigned long cpu) +{ +} + +/* Always called from a busy cpu on UP */ +static int suitable_idle_cpus(struct task_struct *p) +{ + return 0; +} +#endif /* CONFIG_SMP */ /* * activate_idle_task - move idle task to the _front_ of runqueue. @@ -945,6 +998,7 @@ EXPORT_SYMBOL_GPL(kick_process); #endif #define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) +#define task_idle(p) ((p)->prio == PRIO_LIMIT) /* * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the @@ -1038,18 +1092,6 @@ void task_oncpu_function_call(struct tas preempt_enable(); } -#ifdef CONFIG_SMP -static int suitable_idle_cpus(struct task_struct *p) -{ - return (cpus_intersects(p->cpus_allowed, grq.cpu_idle_map)); -} -#else -static int suitable_idle_cpus(struct task_struct *p) -{ - return 0; -} -#endif - /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread @@ -1175,8 +1217,7 @@ void sched_fork(struct task_struct *p, i * matter since that's the same as being 0. current's time_slice is * actually in rq_time_slice when it's running. 
*/ - local_irq_disable(); - rq = task_rq(current); + rq = task_grq_lock_irq(current); if (likely(rq->rq_time_slice > 0)) { rq->rq_time_slice /= 2; /* @@ -1186,7 +1227,7 @@ void sched_fork(struct task_struct *p, i p->first_time_slice = 1; } p->time_slice = rq->rq_time_slice; - local_irq_enable(); + task_grq_unlock_irq(); out: put_cpu(); } @@ -2008,11 +2049,8 @@ void scheduler_tick(void) update_cpu_clock(rq, rq->curr, 1); if (!rq_idle(rq)) task_running_tick(rq); - else { + else no_iso_tick(); - if (unlikely(queued_notrunning())) - set_tsk_need_resched(rq->idle); - } } notrace unsigned long get_parent_ip(unsigned long addr) @@ -2181,26 +2219,15 @@ out: return edt; } -#ifdef CONFIG_SMP -static inline void set_cpuidle_map(unsigned long cpu) +static inline void resched_suitable_idle(struct task_struct *p) { - cpu_set(cpu, grq.cpu_idle_map); -} - -static inline void clear_cpuidle_map(unsigned long cpu) -{ - cpu_clear(cpu, grq.cpu_idle_map); -} + cpumask_t tmp; -#else /* CONFIG_SMP */ -static inline void set_cpuidle_map(unsigned long cpu) -{ -} + cpus_and(tmp, p->cpus_allowed, grq.cpu_idle_map); -static inline void clear_cpuidle_map(unsigned long cpu) -{ + if (!cpus_empty(tmp)) + wake_up_idle_cpu(first_cpu(tmp)); } -#endif /* !CONFIG_SMP */ /* * Print scheduling while atomic bug: @@ -2247,6 +2274,14 @@ static inline void schedule_debug(struct #endif } +static inline void set_rq_task(struct rq *rq, struct task_struct *p) +{ + rq->rq_time_slice = p->time_slice; + rq->rq_deadline = p->deadline; + rq->rq_policy = p->policy; + rq->rq_prio = p->prio; +} + /* * schedule() is the main scheduler function. 
*/ @@ -2299,6 +2334,10 @@ need_resched_nonpreemptible: prev->deadline = rq->rq_deadline; check_deadline(prev); return_task(prev, deactivate); + /* Task changed affinity off this cpu */ + if (unlikely(!cpus_intersects(prev->cpus_allowed, + cpumask_of_cpu(cpu)))) + resched_suitable_idle(prev); } if (likely(queued_notrunning())) { @@ -2308,7 +2347,7 @@ need_resched_nonpreemptible: schedstat_inc(rq, sched_goidle); } - if (next == rq->idle) + if (task_idle(next)) set_cpuidle_map(cpu); else clear_cpuidle_map(cpu); @@ -2319,9 +2358,7 @@ need_resched_nonpreemptible: prev->timestamp = prev->last_ran = now; if (likely(prev != next)) { - rq->rq_time_slice = next->time_slice; - rq->rq_deadline = next->deadline; - rq->rq_prio = next->prio; + set_rq_task(rq, next); sched_info_switch(prev, next); grq.nr_switches++; @@ -3499,10 +3536,9 @@ SYSCALL_DEFINE0(sched_yield) { struct task_struct *p; - grq_lock_irq(); p = current; + time_task_grq_lock_irq(p); schedstat_inc(this_rq(), yld_count); - update_rq_clock(task_rq(p)); time_slice_expired(p); requeue_task(p); @@ -3982,7 +4018,7 @@ void wake_up_idle_cpu(int cpu) int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { unsigned long flags; - int running = 0; + int running_wrong = 0; int queued = 0; struct rq *rq; int ret = 0; @@ -4008,10 +4044,11 @@ int set_cpus_allowed_ptr(struct task_str if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - /* Reschedule the task, schedule() will know if it can keep running */ - if (task_running(p)) - running = 1; - else + if (task_running(p)) { + /* Task is running on the wrong cpu now, reschedule it. */ + set_tsk_need_resched(p); + running_wrong = 1; + } else set_task_cpu(p, cpumask_any_and(cpu_online_mask, new_mask)); out: @@ -4019,9 +4056,9 @@ out: try_preempt(p); task_grq_unlock(&flags); - /* This might be a flaky way of changing cpus! 
*/ - if (running) - schedule(); + if (running_wrong) + _cond_resched(); + return ret; } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); @@ -4275,8 +4312,10 @@ static void remove_cpu(unsigned long cpu cpus_and(cpus_remaining, p->cpus_allowed, cpu_online_map); cpu_clear(cpu, cpus_remaining); if (cpus_empty(cpus_remaining)) { - p->unplugged_mask = p->cpus_allowed; - p->cpus_allowed = cpu_possible_map; + cpumask_copy(&p->unplugged_mask, &p->cpus_allowed); + cpumask_copy(&p->cpus_allowed, &cpu_possible_map); + p->rt_nr_cpus_allowed = + cpumask_weight(&cpu_possible_map); } } while_each_thread(t, p); @@ -4308,7 +4347,9 @@ static void add_cpu(unsigned long cpu) * them. Then clear the unplugged_mask as we've * set all the cpus back. */ - p->cpus_allowed = p->unplugged_mask; + cpumask_copy(&p->cpus_allowed, &p->unplugged_mask); + p->rt_nr_cpus_allowed = + cpumask_weight(&p->cpus_allowed); cpus_clear(p->unplugged_mask); } } @@ -4328,6 +4369,7 @@ static void add_cpu(unsigned long cpu) static int __cpuinit migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) { + struct task_struct *idle; int cpu = (long)hcpu; unsigned long flags; struct rq *rq; @@ -4361,13 +4403,15 @@ migration_call(struct notifier_block *nf case CPU_DEAD_FROZEN: cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */ rq = cpu_rq(cpu); + idle = rq->idle; /* Idle task back to normal (off runqueue, low prio) */ grq_lock_irq(); remove_cpu(cpu); - deactivate_task(rq->idle); - rq->idle->static_prio = MAX_PRIO; - __setscheduler(rq->idle, SCHED_NORMAL, 0); - rq->idle->prio = PRIO_LIMIT; + return_task(idle, 1); + idle->static_prio = MAX_PRIO; + __setscheduler(idle, SCHED_NORMAL, 0); + idle->prio = PRIO_LIMIT; + set_rq_task(rq, idle); update_rq_clock(rq); grq_unlock_irq(); cpuset_unlock(); @@ -5982,8 +6026,6 @@ void normalize_rt_tasks(void) if (queued) dequeue_task(p); __setscheduler(p, SCHED_NORMAL, 0); - if (task_running(p)) - resched_task(p); if (queued) { enqueue_task(p); try_preempt(p);