Don't unnecessarily preempt for a task on the wrong CPU. Cope with worker threads trying to wake themselves up due to shifting CPUs on suspend by reactivating it, instead of hitting the BUG_ON Wrap timer jiffies at 10 seconds instead of 5 minutes since 32 bit load averages don't work until the first timer wrap. Remove the last_task logic as it wasn't providing any significant performance advantage. Change the locality logic to try to reschedule on the exact same logical core instead of assuming scheduling on a sibling core or sibling thread is equivalent. This allows CPUs with a "turbo" mode (such as i7) to use that more often by using one CPU more than spreading out, and allows ondemand cpu frequency scaling to ramp up more easily when a task stays on the same CPU. It increases throughput on threaded CPUs when lightly loaded, and may offer both performance and power saving advantages on all SMP topologies with cpu frequency scaling. -ck --- include/linux/jiffies.h | 2 - include/linux/sched.h | 2 - kernel/sched_bfs.c | 76 ++++++++++++++++++++---------------------------- 3 files changed, 34 insertions(+), 46 deletions(-) Index: linux-2.6.35.9-bfs/include/linux/jiffies.h =================================================================== --- linux-2.6.35.9-bfs.orig/include/linux/jiffies.h 2010-02-25 21:51:52.000000000 +1100 +++ linux-2.6.35.9-bfs/include/linux/jiffies.h 2010-12-16 15:51:46.381737658 +1100 @@ -164,7 +164,7 @@ static inline u64 get_jiffies_64(void) * Have the 32 bit jiffies value wrap 5 minutes after boot * so jiffies wrap bugs show up earlier. */ -#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) +#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ)) /* * Change timeval to jiffies, trying to avoid the Index: linux-2.6.35.9-bfs/include/linux/sched.h =================================================================== --- linux-2.6.35.9-bfs.orig/include/linux/sched.h 2010-12-16 15:51:05.000000000 +1100 +++ linux-2.6.35.9-bfs/include/linux/sched.h 2010-12-16 15:51:46.382737665 +1100 @@ -1541,7 +1541,7 @@ static inline void tsk_cpus_current(stru static inline void print_scheduler_version(void) { - printk(KERN_INFO"BFS CPU scheduler v0.357 by Con Kolivas.\n"); + printk(KERN_INFO"BFS CPU scheduler v0.360 by Con Kolivas.\n"); } static inline int iso_task(struct task_struct *p) Index: linux-2.6.35.9-bfs/kernel/sched_bfs.c =================================================================== --- linux-2.6.35.9-bfs.orig/kernel/sched_bfs.c 2010-12-16 15:51:05.245443794 +1100 +++ linux-2.6.35.9-bfs/kernel/sched_bfs.c 2010-12-16 15:51:46.385737686 +1100 @@ -188,7 +188,6 @@ struct rq { u64 nohz_stamp; unsigned char in_nohz_recently; #endif - struct task_struct *last_task; #endif struct task_struct *curr, *idle; @@ -734,19 +733,12 @@ static int suitable_idle_cpus(struct tas static void resched_task(struct task_struct *p); -/* - * last_task stores the last non-idle task scheduled on the local rq for - * cache warmth testing. - */ -static inline void set_last_task(struct rq *rq, struct task_struct *p) -{ - rq->last_task = p; -} - -#define CPUIDLE_CACHE_BUSY (1) -#define CPUIDLE_DIFF_CPU (2) -#define CPUIDLE_THREAD_BUSY (4) -#define CPUIDLE_DIFF_NODE (8) +#define CPUIDLE_DIFF_THREAD (1) +#define CPUIDLE_DIFF_CORE (2) +#define CPUIDLE_CACHE_BUSY (4) +#define CPUIDLE_DIFF_CPU (8) +#define CPUIDLE_THREAD_BUSY (16) +#define CPUIDLE_DIFF_NODE (32) /* * The best idle CPU is chosen according to the CPUIDLE ranking above where the @@ -799,27 +791,28 @@ static void resched_best_idle(struct tas } tmp_rq = cpu_rq(cpu_tmp); - if (rq->cpu_locality[cpu_tmp]) { - /* Check rq->last_task hasn't been dereferenced */ - if (rq->last_task && p != rq->last_task) { #ifdef CONFIG_NUMA - if (rq->cpu_locality[cpu_tmp] > 1) - ranking |= CPUIDLE_DIFF_NODE; + if (rq->cpu_locality[cpu_tmp] > 3) + ranking |= CPUIDLE_DIFF_NODE; + else #endif - ranking |= CPUIDLE_DIFF_CPU; - } - } + if (rq->cpu_locality[cpu_tmp] > 2) + ranking |= CPUIDLE_DIFF_CPU; #ifdef CONFIG_SCHED_MC + if (rq->cpu_locality[cpu_tmp] == 2) + ranking |= CPUIDLE_DIFF_CORE; if (!(tmp_rq->cache_idle(cpu_tmp))) ranking |= CPUIDLE_CACHE_BUSY; #endif #ifdef CONFIG_SCHED_SMT + if (rq->cpu_locality[cpu_tmp] == 1) + ranking |= CPUIDLE_DIFF_THREAD; if (!(tmp_rq->siblings_idle(cpu_tmp))) ranking |= CPUIDLE_THREAD_BUSY; #endif if (ranking < best_ranking) { best_cpu = cpu_tmp; - if (ranking <= 1) + if (ranking == 0) break; best_ranking = ranking; } @@ -836,11 +829,11 @@ static inline void resched_suitable_idle /* * The cpu cache locality difference between CPUs is used to determine how far - * to offset the virtual deadline. "One" difference in locality means that one + * to offset the virtual deadline. <2 difference in locality means that one * timeslice difference is allowed longer for the cpu local tasks. This is * enough in the common case when tasks are up to 2* number of CPUs to keep * tasks within their shared cache CPUs only. CPUs on different nodes or not - * even in this domain (NUMA) have "3" difference, allowing 4 times longer + * even in this domain (NUMA) have "4" difference, allowing 4 times longer * deadlines before being taken onto another cpu, allowing for 2* the double * seen by separate CPUs above. * Simple summary: Virtual deadlines are equal on shared cache CPUs, double @@ -849,12 +842,11 @@ static inline void resched_suitable_idle static inline int cache_distance(struct rq *task_rq, struct rq *rq, struct task_struct *p) { - /* Check rq->last_task hasn't been dereferenced */ - if (likely(rq->last_task)) { - if (rq->last_task == p) - return 0; - } - return rq->cpu_locality[cpu_of(task_rq)] * task_timeslice(p); + int locality = rq->cpu_locality[cpu_of(task_rq)] - 2; + + if (locality > 0) + return task_timeslice(p) << locality; + return 0; } #else /* CONFIG_SMP */ static inline void inc_qnr(void) @@ -892,10 +884,6 @@ cache_distance(struct rq *task_rq, struc { return 0; } - -static inline void set_last_task(struct rq *rq, struct task_struct *p) -{ -} #endif /* CONFIG_SMP */ /* @@ -1285,10 +1273,10 @@ static void try_preempt(struct task_stru return; } - if (online_cpus(p)) + if (likely(online_cpus(p))) cpus_and(tmp, cpu_online_map, p->cpus_allowed); else - (cpumask_copy(&tmp, &cpu_online_map)); + return; latest_deadline = 0; highest_prio = -1; @@ -2677,7 +2665,7 @@ need_resched_nonpreemptible: prev->last_ran = rq->clock; /* Task changed affinity off this CPU */ - if (needs_other_cpu(prev, cpu)) + if (unlikely(!cpu_isset(cpu, prev->cpus_allowed))) resched_suitable_idle(prev); else if (!deactivate) { if (!queued_notrunning()) { @@ -2720,8 +2708,6 @@ need_resched_nonpreemptible: sched_info_switch(prev, next); perf_event_task_sched_out(prev, next); - if (prev != idle) - set_last_task(rq, prev); set_rq_task(rq, next); grq.nr_switches++; prev->oncpu = 0; @@ -6596,10 +6582,12 @@ void __init sched_init_smp(void) cpumask_set_cpu(other_cpu, &rq->cache_siblings); } #endif - if (sd->level <= SD_LV_MC) - locality = 0; - else if (sd->level <= SD_LV_NODE) + if (sd->level <= SD_LV_SIBLING) locality = 1; + else if (sd->level <= SD_LV_MC) + locality = 2; + else if (sd->level <= SD_LV_NODE) + locality = 3; else continue; @@ -6705,7 +6693,7 @@ void __init sched_init(void) if (i == j) rq->cpu_locality[j] = 0; else - rq->cpu_locality[j] = 3; + rq->cpu_locality[j] = 4; } } #endif