Scalability improvements. Will help with >2 logical CPUs with complex topology such as NUMA, multiple shared caches and especially hyperthreading. This should make real cores be used before hyperthread siblings are used. --- kernel/sched_bfs.c | 191 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 170 insertions(+), 21 deletions(-) Index: linux-2.6.31-bfs/kernel/sched_bfs.c =================================================================== --- linux-2.6.31-bfs.orig/kernel/sched_bfs.c 2009-11-03 13:02:59.782125724 +1100 +++ linux-2.6.31-bfs/kernel/sched_bfs.c 2009-11-03 17:47:25.003250696 +1100 @@ -217,6 +217,16 @@ struct rq { struct root_domain *rd; struct sched_domain *sd; unsigned long *cpu_locality; /* CPU relative cache distance */ +#ifdef CONFIG_SCHED_SMT + int (*siblings_idle)(unsigned long cpu); + /* See if all smt siblings are idle */ + cpumask_t smt_siblings; +#endif +#ifdef CONFIG_SCHED_MC + int (*cache_idle)(unsigned long cpu); + /* See if all cache siblings are idle */ + cpumask_t cache_siblings; +#endif #endif u64 clock; @@ -648,14 +658,89 @@ static int suitable_idle_cpus(struct tas return (cpus_intersects(p->cpus_allowed, grq.cpu_idle_map)); } -static inline void resched_suitable_idle(struct task_struct *p) +static void resched_task(struct task_struct *p); + +#define CPUIDLE_CACHE_BUSY (1) +#define CPUIDLE_DIFF_CPU (2) +#define CPUIDLE_THREAD_BUSY (4) +#define CPUIDLE_DIFF_NODE (8) + +/* + * The best idle CPU is chosen according to the CPUIDLE ranking above where the + * lowest value would give the most suitable CPU to schedule p onto next. We + * iterate from the last CPU upwards instead of using for_each_cpu_mask so as + * to be able to break out immediately if the last CPU is idle. The order works + * out to be the following: + * + * Same CPU, idle or busy cache, idle threads. + * Other CPU, same cache, busy cache, idle threads. + * Same node, other CPU, other cache, idle cache, idle threads. + * Same node, other CPU, other cache, busy cache, idle threads. + * Same CPU, busy cache, busy threads. + * Other CPU, same cache, busy threads. + * Same node, other CPU, other cache, busy threads. + * Other node, other CPU, idle cache, idle threads. + * Other node, other CPU, busy cache, idle threads. + * Other node, other CPU, busy threads. + */ +static void resched_best_idle(struct task_struct *p) { - cpumask_t tmp; + unsigned long cpu_tmp, best_cpu, best_ranking; + cpumask_t tmpmask; + struct rq *rq; + int iterate; - cpus_and(tmp, p->cpus_allowed, grq.cpu_idle_map); + cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map); + iterate = cpus_weight(tmpmask); + best_cpu = task_cpu(p); + /* + * Start below the last CPU and work up with next_cpu_nr as the last + * CPU might not be idle or affinity might not allow it. + */ + cpu_tmp = best_cpu - 1; + rq = cpu_rq(best_cpu); + best_ranking = ~0UL; - if (!cpus_empty(tmp)) - wake_up_idle_cpu(first_cpu(tmp)); + do { + unsigned long ranking; + struct rq *tmp_rq; + + cpu_tmp = next_cpu_nr(cpu_tmp, tmpmask); + if (cpu_tmp >= nr_cpu_ids) { + cpu_tmp = -1; + cpu_tmp = next_cpu_nr(cpu_tmp, tmpmask); + } + tmp_rq = cpu_rq(cpu_tmp); + + /* + * Note the base ranking works out to the unused CPUIDLE + * defines above. They're included, even though not used, for + * clarity. + */ + ranking = rq->cpu_locality[cpu_tmp] * 2; +#ifdef CONFIG_SCHED_MC + if (!(tmp_rq->cache_idle(cpu_tmp))) + ranking |= CPUIDLE_CACHE_BUSY; +#endif +#ifdef CONFIG_SCHED_SMT + if (!(tmp_rq->siblings_idle(cpu_tmp))) + ranking |= CPUIDLE_THREAD_BUSY; +#endif + if (ranking < best_ranking) { + best_cpu = cpu_tmp; + if (ranking <= 1) + break; + best_ranking = ranking; + } + } while (--iterate > 0); + + resched_task(cpu_rq(best_cpu)->curr); +} + +static inline void resched_suitable_idle(struct task_struct *p) +{ + if (suitable_idle_cpus(p)) + resched_best_idle(p); } /* @@ -1080,20 +1165,26 @@ EXPORT_SYMBOL_GPL(kick_process); * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the * basis of earlier deadlines. SCHED_BATCH, ISO and IDLEPRIO don't preempt * between themselves, they cooperatively multitask. An idle rq scores as - * prio PRIO_LIMIT so it is always preempted. The offset_deadline will choose - * an idle runqueue that is closer cache-wise in preference. latest_deadline - * and highest_prio_rq are initialised only to silence the compiler. When + * prio PRIO_LIMIT so it is always preempted. latest_deadline and + * highest_prio_rq are initialised only to silence the compiler. When * all else is equal, still prefer this_rq. */ #ifdef CONFIG_SMP static void try_preempt(struct task_struct *p, struct rq *this_rq) { - unsigned long latest_deadline = 0, cpu; struct rq *highest_prio_rq = this_rq; - int highest_prio = -1; + unsigned long latest_deadline, cpu; + int highest_prio; cpumask_t tmp; + if (suitable_idle_cpus(p)) { + resched_best_idle(p); + return; + } + cpus_and(tmp, cpu_online_map, p->cpus_allowed); + latest_deadline = 0; + highest_prio = -1; for_each_cpu_mask(cpu, tmp) { unsigned long offset_deadline; @@ -1105,15 +1196,8 @@ static void try_preempt(struct task_stru if (rq_prio < highest_prio) continue; - offset_deadline = -cache_distance(this_rq, rq, p); - if (rq_prio != PRIO_LIMIT) - offset_deadline += rq->rq_deadline; - else - if (rq == this_rq) { - /* this_rq is idle, use that over everything */ - highest_prio_rq = rq; - goto out; - } + offset_deadline = rq->rq_deadline - + cache_distance(this_rq, rq, p); if (rq_prio > highest_prio || (time_after(offset_deadline, latest_deadline) || @@ -1128,7 +1212,6 @@ static void try_preempt(struct task_stru p->policy == SCHED_NORMAL && !time_before(p->deadline, latest_deadline))) return; -out: /* p gets to preempt highest_prio_rq->curr */ resched_task(highest_prio_rq->curr); return; @@ -5991,6 +6074,33 @@ static int update_runtime(struct notifie } } +#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) +/* + * Cheaper version of the below functions in case support for SMT and MC is + * compiled in but CPUs have no siblings. + */ +static int sole_cpu_idle(unsigned long cpu) +{ + return rq_idle(cpu_rq(cpu)); +} +#endif +#ifdef CONFIG_SCHED_SMT +/* All this CPU's SMT siblings are idle */ +static int siblings_cpu_idle(unsigned long cpu) +{ + return cpumask_subset(&(cpu_rq(cpu)->smt_siblings), + &grq.cpu_idle_map); +} +#endif +#ifdef CONFIG_SCHED_MC +/* All this CPU's shared cache siblings are idle */ +static int cache_cpu_idle(unsigned long cpu) +{ + return cpumask_subset(&(cpu_rq(cpu)->cache_siblings), + &grq.cpu_idle_map); +} +#endif + void __init sched_init_smp(void) { struct sched_domain *sd; @@ -6036,6 +6146,7 @@ void __init sched_init_smp(void) */ rr_interval *= 1 + ilog2(num_online_cpus()); + grq_lock_irq(); /* * Set up the relative cache distance of each online cpu from each * other in a simple array for quick lookup. Locality is determined @@ -6046,11 +6157,23 @@ void __init sched_init_smp(void) * nodes) are treated as very distant. */ for_each_online_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); for_each_domain(cpu, sd) { - struct rq *rq = cpu_rq(cpu); unsigned long locality; int other_cpu; +#ifdef CONFIG_SCHED_SMT + if (sd->level == SD_LV_SIBLING) { + for_each_cpu_mask(other_cpu, *sched_domain_span(sd)) + cpumask_set_cpu(other_cpu, &rq->smt_siblings); + } +#endif +#ifdef CONFIG_SCHED_MC + if (sd->level == SD_LV_MC) { + for_each_cpu_mask(other_cpu, *sched_domain_span(sd)) + cpumask_set_cpu(other_cpu, &rq->cache_siblings); + } +#endif if (sd->level <= SD_LV_MC) locality = 0; else if (sd->level <= SD_LV_NODE) @@ -6063,7 +6186,21 @@ void __init sched_init_smp(void) rq->cpu_locality[other_cpu] = locality; } } + +/* + * Each runqueue has its own function in case it doesn't have + * siblings of its own allowing mixed topologies. + */ +#ifdef CONFIG_SCHED_SMT + if (cpus_weight(rq->smt_siblings) > 1) + rq->siblings_idle = siblings_cpu_idle; +#endif +#ifdef CONFIG_SCHED_MC + if (cpus_weight(rq->cache_siblings) > 1) + rq->cache_idle = cache_cpu_idle; +#endif } + grq_unlock_irq(); } #else void __init sched_init_smp(void) @@ -6119,6 +6256,18 @@ void __init sched_init(void) int j; rq = cpu_rq(i); +#ifdef CONFIG_SCHED_SMT + cpumask_clear(&rq->smt_siblings); + cpumask_set_cpu(i, &rq->smt_siblings); + rq->siblings_idle = sole_cpu_idle; + cpumask_set_cpu(i, &rq->smt_siblings); +#endif +#ifdef CONFIG_SCHED_MC + cpumask_clear(&rq->cache_siblings); + cpumask_set_cpu(i, &rq->cache_siblings); + rq->cache_idle = sole_cpu_idle; + cpumask_set_cpu(i, &rq->cache_siblings); +#endif rq->cpu_locality = kmalloc(nr_cpu_ids * sizeof(unsigned long), GFP_NOWAIT); for_each_possible_cpu(j) {