Improve the gradation when setting the cache_distance, taking into account different cores and different SMT siblings. Try harder to reschedule tasks on the last CPU when doing resched_best_idle, by not treating all threads and siblings as the same CPU. This improves throughput on lightly loaded threaded CPUs (e.g. i7). Remove rq->last_task and its set_last_task() helper, as tracking the last task was not significantly improving performance. -ck Index: linux-2.6.36-ck2/kernel/sched_bfs.c =================================================================== --- linux-2.6.36-ck2.orig/kernel/sched_bfs.c 2010-12-13 09:11:26.468427288 +1100 +++ linux-2.6.36-ck2/kernel/sched_bfs.c 2010-12-14 18:00:38.178783004 +1100 @@ -190,7 +190,6 @@ u64 nohz_stamp; unsigned char in_nohz_recently; #endif - struct task_struct *last_task; #endif struct task_struct *curr, *idle; @@ -762,19 +761,12 @@ static void resched_task(struct task_struct *p); -/* - * last_task stores the last non-idle task scheduled on the local rq for - * cache warmth testing. - */ -static inline void set_last_task(struct rq *rq, struct task_struct *p) -{ - rq->last_task = p; -} - -#define CPUIDLE_CACHE_BUSY (1) -#define CPUIDLE_DIFF_CPU (2) -#define CPUIDLE_THREAD_BUSY (4) -#define CPUIDLE_DIFF_NODE (8) +#define CPUIDLE_DIFF_THREAD (1) +#define CPUIDLE_DIFF_CORE (2) +#define CPUIDLE_CACHE_BUSY (4) +#define CPUIDLE_DIFF_CPU (8) +#define CPUIDLE_THREAD_BUSY (16) +#define CPUIDLE_DIFF_NODE (32) /* * The best idle CPU is chosen according to the CPUIDLE ranking above where the @@ -828,26 +820,29 @@ tmp_rq = cpu_rq(cpu_tmp); if (rq->cpu_locality[cpu_tmp]) { - /* Check rq->last_task hasn't been dereferenced */ - if (rq->last_task && p != rq->last_task) { #ifdef CONFIG_NUMA - if (rq->cpu_locality[cpu_tmp] > 1) - ranking |= CPUIDLE_DIFF_NODE; + if (rq->cpu_locality[cpu_tmp] > 3) + ranking |= CPUIDLE_DIFF_NODE; + else #endif + if (rq->cpu_locality[cpu_tmp] > 2) ranking |= CPUIDLE_DIFF_CPU; - } } #ifdef CONFIG_SCHED_MC + if (rq->cpu_locality[cpu_tmp] == 2) + ranking |= 
CPUIDLE_DIFF_CORE; if (!(tmp_rq->cache_idle(cpu_tmp))) ranking |= CPUIDLE_CACHE_BUSY; #endif #ifdef CONFIG_SCHED_SMT + if (rq->cpu_locality[cpu_tmp] == 1) + ranking |= CPUIDLE_DIFF_THREAD; if (!(tmp_rq->siblings_idle(cpu_tmp))) ranking |= CPUIDLE_THREAD_BUSY; #endif if (ranking < best_ranking) { best_cpu = cpu_tmp; - if (ranking <= 1) + if (ranking == 0) break; best_ranking = ranking; } @@ -877,11 +872,6 @@ static inline int cache_distance(struct rq *task_rq, struct rq *rq, struct task_struct *p) { - /* Check rq->last_task hasn't been dereferenced */ - if (likely(rq->last_task)) { - if (rq->last_task == p) - return 0; - } return rq->cpu_locality[cpu_of(task_rq)] * task_timeslice(p); } #else /* CONFIG_SMP */ @@ -920,10 +910,6 @@ { return 0; } - -static inline void set_last_task(struct rq *rq, struct task_struct *p) -{ -} #endif /* CONFIG_SMP */ /* @@ -2813,8 +2799,6 @@ sched_info_switch(prev, next); perf_event_task_sched_out(prev, next); - if (prev != idle) - set_last_task(rq, prev); set_rq_task(rq, next); grq.nr_switches++; prev->oncpu = 0; @@ -6566,10 +6550,12 @@ cpumask_set_cpu(other_cpu, &rq->cache_siblings); } #endif - if (sd->level <= SD_LV_MC) - locality = 0; - else if (sd->level <= SD_LV_NODE) + if (sd->level <= SD_LV_SIBLING) locality = 1; + else if (sd->level <= SD_LV_MC) + locality = 2; + else if (sd->level <= SD_LV_NODE) + locality = 3; else continue; @@ -6675,7 +6661,7 @@ if (i == j) rq->cpu_locality[j] = 0; else - rq->cpu_locality[j] = 3; + rq->cpu_locality[j] = 4; } } #endif