Use a grq variable to store the num_online_cpus() count and only update it when the count changes, avoiding the overhead of calling num_online_cpus() each time. Make sure we only unset the idle map when we don't get the idle task back from edt(). Add an alternative version of edt() that allows a CPU to go idle instead of taking a CPU-bound task from another CPU. This gives CPU-bound tasks soft affinity for one CPU, which helps when a CPU-load-dependent frequency scaling algorithm is used (like ondemand). This increases throughput when load is low, and allows tasks to complete sooner, which helps save power. Tie in the use of scaled_edt() per CPU with the cpufreq governors hooked below (conservative, ondemand and userspace). Remove meaningless warnings and checks in try_to_wake_up_local(). They've been worked around in BFS and now only add overhead. Round up this_cpu_load() since we use the queued-but-not-running amount instead of the total running, and then add whether the local rq is running or not. -ck --- drivers/cpufreq/cpufreq_conservative.c | 2 drivers/cpufreq/cpufreq_ondemand.c | 2 drivers/cpufreq/cpufreq_userspace.c | 3 include/linux/sched.h | 14 ++ kernel/sched_bfs.c | 174 ++++++++++++++++++++++++++++++--- 5 files changed, 182 insertions(+), 13 deletions(-) Index: linux-2.6.38.2-ck1/kernel/sched_bfs.c =================================================================== --- linux-2.6.38.2-ck1.orig/kernel/sched_bfs.c 2011-03-28 22:36:16.078819659 +1100 +++ linux-2.6.38.2-ck1/kernel/sched_bfs.c 2011-03-28 22:36:18.605819659 +1100 @@ -89,7 +89,7 @@ #define idleprio_task(p) unlikely((p)->policy == SCHED_IDLEPRIO) #define iso_task(p) unlikely((p)->policy == SCHED_ISO) #define iso_queue(rq) unlikely((rq)->rq_policy == SCHED_ISO) -#define ISO_PERIOD ((5 * HZ * num_online_cpus()) + 1) +#define ISO_PERIOD ((5 * HZ * grq.noc) + 1) /* * Convert user-nice values [ -20 ... 0 ... 
19 ] @@ -171,6 +171,7 @@ struct global_rq { cpumask_t cpu_idle_map; int idle_cpus; #endif + int noc; /* num_online_cpus stored and updated when it changes */ u64 niffies; /* Nanosecond jiffies */ unsigned long last_jiffy; /* Last jiffy we updated niffies */ @@ -204,6 +205,7 @@ struct rq { u64 rq_last_ran; int rq_prio; int rq_running; /* There is a task running */ + struct task_struct * (*edt)(struct rq *, struct task_struct *); /* Accurate timekeeping data */ u64 timekeep_clock; @@ -378,6 +380,31 @@ static inline void update_clocks(struct grq.niffies += ndiff; rq->last_niffy = grq.niffies; } + +static inline void set_skip_count(struct task_struct *p) +{ + p->cpu_skip = grq.noc - 1; +} + +static inline void reset_skip_count(struct task_struct *p) +{ + p->cpu_skip = 0; +} + +static inline int skip_count(struct task_struct *p) +{ + return p->cpu_skip; +} + +static inline void dec_skip_count(struct task_struct *p) +{ + p->cpu_skip--; +} + +static inline void inc_skip_count(struct task_struct *p) +{ + p->cpu_skip++; +} #else /* CONFIG_SMP */ static struct rq *uprq; #define cpu_rq(cpu) (uprq) @@ -402,7 +429,23 @@ static inline void update_clocks(struct grq.last_jiffy += jdiff; grq.niffies += ndiff; } + +static inline void set_skip_count(struct task_struct *__unused) +{ +} + +static inline void reset_skip_count(struct task_struct *__unused) +{ +} + +static inline void dec_skip_count(struct task_struct *__unused) +{ +} #endif + +EXPORT_SYMBOL_GPL(cpu_scales); +EXPORT_SYMBOL_GPL(cpu_nonscaling); + #define raw_rq() (&__raw_get_cpu_var(runqueues)) #include "sched_stats.h" @@ -999,6 +1042,7 @@ static inline void deactivate_task(struc { if (task_contributes_to_load(p)) grq.nr_uninterruptible++; + reset_skip_count(p); grq.nr_running--; } @@ -1039,6 +1083,7 @@ static inline void return_task(struct ta if (deactivate) deactivate_task(p); else { + set_skip_count(p); inc_qnr(); enqueue_task(p); } @@ -1479,10 +1524,6 @@ static void try_to_wake_up_local(struct struct rq *rq = 
task_rq(p); bool success = false; - WARN_ON(rq != this_rq()); - BUG_ON(p == current); - lockdep_assert_held(&grq.lock); - if (!(p->state & TASK_NORMAL)) return; @@ -1611,6 +1652,7 @@ void sched_fork(struct task_struct *p, i time_slice_expired(p); } p->last_ran = rq->rq_last_ran; + reset_skip_count(p); task_grq_unlock_irq(); out: put_cpu(); @@ -1921,7 +1963,7 @@ unsigned long this_cpu_load(void) { return this_rq()->rq_running + (queued_notrunning() + nr_uninterruptible()) / - (1 + num_online_cpus()); + (grq.noc ? : 1); } /* Variables and functions for calc_load */ @@ -2731,7 +2773,7 @@ static inline void check_deadline(struct * Finally if no SCHED_NORMAL tasks are found, SCHED_IDLEPRIO tasks are * selected by the earliest deadline. */ -static inline struct +static struct task_struct *earliest_deadline_task(struct rq *rq, struct task_struct *idle) { u64 dl, earliest_deadline = 0; /* Initialise to silence compiler */ @@ -2779,6 +2821,110 @@ out: return edt; } +#ifdef CONFIG_SMP +/* + * This is an alternate version of earliest_deadline_task that is used once + * this CPU has loaded a load-scaled CPU frequency governor. 
+ */ +static struct task_struct *scaled_edt(struct rq *rq, struct task_struct *idle) +{ + u64 dl, earliest_deadline = 0; + struct task_struct *p, *edt = idle; + unsigned int cpu = cpu_of(rq); + struct list_head candidates, *queue; + int idx = 0; + +retry: + idx = find_next_bit(grq.prio_bitmap, PRIO_LIMIT, idx); + if (idx >= PRIO_LIMIT) + goto out; + + INIT_LIST_HEAD(&candidates); + queue = grq.queue + idx; + list_for_each_entry(p, queue, run_list) { + /* Make sure cpu affinity is ok */ + if (needs_other_cpu(p, cpu)) + continue; + if (idx < MAX_RT_PRIO) { + /* We found an rt task */ + edt = p; + goto out_take; + } + list_add(&p->cand_list, &candidates); + } + +retry_same: + list_for_each_entry(p, &candidates, cand_list) { + dl = p->deadline + cache_distance(task_rq(p), rq, p); + + if (edt == idle || + deadline_before(dl, earliest_deadline)) { + earliest_deadline = dl; + edt = p; + } else if (task_cpu(p) == cpu) + inc_skip_count(p); + } + /* + * If this task is a CPU bound task and was not bound to this CPU last + * time it ran, we skip over it up to cpu_skip count (one per CPU that + * it's not bound to) times. This makes it much more likely to go back + * to the same CPU it came from. When we are using a scaling CPU + * governor, this will make the CPU it's bound to far more likely to + * speed up to high frequency, allowing the task to finish in less time + * and use less power overall. If we do this with fixed frequency CPUs + * it costs us in latency and performance. 
+ */ + if (task_cpu(edt) != cpu && skip_count(edt) && !rt_task(edt)) { + dec_skip_count(edt); + list_del(&edt->cand_list); + edt = idle; + if (!list_empty(&candidates)) + goto retry_same; + } + if (edt == idle) { + if (++idx < PRIO_LIMIT) + goto retry; + goto out; + } +out_take: + take_task(rq, edt); +out: + return edt; +} + +/* + * Each runqueue in SMP decides what version of earliest_deadline_task to use + * according to whether it's being managed by a CPU frequency governor that + * scales with load or is static. + */ +void cpu_scales(int cpu) +{ + unsigned long flags; + + grq_lock_irqsave(&flags); + cpu_rq(cpu)->edt = scaled_edt; + grq_unlock_irqrestore(&flags); +} + +void cpu_nonscaling(int cpu) +{ + unsigned long flags; + + grq_lock_irqsave(&flags); + cpu_rq(cpu)->edt = earliest_deadline_task; + grq_unlock_irqrestore(&flags); +} +#else +void cpu_scales(int __unused) +{ +} + +void cpu_nonscaling(int __unused) +{ +} + +#endif + /* * Print scheduling while atomic bug: */ @@ -2952,10 +3098,12 @@ need_resched_nonpreemptible: schedstat_inc(rq, sched_goidle); set_cpuidle_map(cpu); } else { - next = earliest_deadline_task(rq, idle); - prefetch(next); - prefetch_stack(next); - clear_cpuidle_map(cpu); + next = rq->edt(rq, idle); + if (likely(next->prio != PRIO_LIMIT)) { + prefetch(next); + prefetch_stack(next); + clear_cpuidle_map(cpu); + } } if (likely(prev != next)) { @@ -5059,6 +5207,7 @@ migration_call(struct notifier_block *nf set_rq_online(rq); } + grq.noc = num_online_cpus(); grq_unlock_irqrestore(&flags); break; @@ -5083,6 +5232,7 @@ migration_call(struct notifier_block *nf BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } + grq.noc = num_online_cpus(); grq_unlock_irqrestore(&flags); break; #endif @@ -6846,6 +6996,7 @@ void __init sched_init_smp(void) rq->cache_idle = cache_cpu_idle; #endif } + grq.noc = num_online_cpus(); grq_unlock_irq(); } #else @@ -6890,6 +7041,7 @@ void __init sched_init(void) rq->user_pc = rq->nice_pc = rq->softirq_pc = 
rq->system_pc = rq->iowait_pc = rq->idle_pc = 0; rq->dither = 0; + rq->edt = earliest_deadline_task; #ifdef CONFIG_SMP rq->last_niffy = 0; rq->sd = NULL; Index: linux-2.6.38.2-ck1/include/linux/sched.h =================================================================== --- linux-2.6.38.2-ck1.orig/include/linux/sched.h 2011-03-28 22:36:17.033819659 +1100 +++ linux-2.6.38.2-ck1/include/linux/sched.h 2011-03-28 22:36:18.606819659 +1100 @@ -1219,11 +1219,12 @@ struct task_struct { #ifdef CONFIG_SCHED_BFS int time_slice; u64 deadline; - struct list_head run_list; + struct list_head run_list, cand_list; u64 last_ran; u64 sched_time; /* sched_clock time spent running */ unsigned long rt_timeout; + int cpu_skip; #else /* CONFIG_SCHED_BFS */ const struct sched_class *sched_class; struct sched_entity se; @@ -1555,6 +1556,8 @@ struct task_struct { #ifdef CONFIG_SCHED_BFS extern int grunqueue_is_locked(void); extern void grq_unlock_wait(void); +extern void cpu_scales(int cpu); +extern void cpu_nonscaling(int cpu); #define tsk_seruntime(t) ((t)->sched_time) #define tsk_rttimeout(t) ((t)->rt_timeout) @@ -1566,7 +1569,7 @@ static inline void tsk_cpus_current(stru static inline void print_scheduler_version(void) { - printk(KERN_INFO"BFS CPU scheduler v0.363 by Con Kolivas.\n"); + printk(KERN_INFO"BFS CPU scheduler v0.370 by Con Kolivas.\n"); } static inline int iso_task(struct task_struct *p) @@ -1577,6 +1580,13 @@ extern void remove_cpu(unsigned long cpu extern int above_background_load(void); #else /* CFS */ extern int runqueue_is_locked(int cpu); +static inline void cpu_scales(int cpu) +{ +} + +static inline void cpu_nonscaling(int cpu) +{ +} #define tsk_seruntime(t) ((t)->se.sum_exec_runtime) #define tsk_rttimeout(t) ((t)->rt.timeout) Index: linux-2.6.38.2-ck1/drivers/cpufreq/cpufreq_conservative.c =================================================================== --- linux-2.6.38.2-ck1.orig/drivers/cpufreq/cpufreq_conservative.c 2010-08-02 11:12:24.000000000 +1000 +++ 
linux-2.6.38.2-ck1/drivers/cpufreq/cpufreq_conservative.c 2011-03-28 22:36:18.606819659 +1100 @@ -661,6 +661,7 @@ static int cpufreq_governor_dbs(struct c dbs_timer_init(this_dbs_info); + cpu_scales(cpu); break; case CPUFREQ_GOV_STOP: @@ -685,6 +686,7 @@ static int cpufreq_governor_dbs(struct c sysfs_remove_group(cpufreq_global_kobject, &dbs_attr_group); + cpu_nonscaling(cpu); break; case CPUFREQ_GOV_LIMITS: Index: linux-2.6.38.2-ck1/drivers/cpufreq/cpufreq_ondemand.c =================================================================== --- linux-2.6.38.2-ck1.orig/drivers/cpufreq/cpufreq_ondemand.c 2011-03-28 22:36:18.304819659 +1100 +++ linux-2.6.38.2-ck1/drivers/cpufreq/cpufreq_ondemand.c 2011-03-28 22:36:18.606819659 +1100 @@ -782,6 +782,7 @@ static int cpufreq_governor_dbs(struct c mutex_init(&this_dbs_info->timer_mutex); dbs_timer_init(this_dbs_info); + cpu_scales(cpu); break; case CPUFREQ_GOV_STOP: @@ -796,6 +797,7 @@ static int cpufreq_governor_dbs(struct c sysfs_remove_group(cpufreq_global_kobject, &dbs_attr_group); + cpu_nonscaling(cpu); break; case CPUFREQ_GOV_LIMITS: Index: linux-2.6.38.2-ck1/drivers/cpufreq/cpufreq_userspace.c =================================================================== --- linux-2.6.38.2-ck1.orig/drivers/cpufreq/cpufreq_userspace.c 2009-06-10 13:05:27.000000000 +1000 +++ linux-2.6.38.2-ck1/drivers/cpufreq/cpufreq_userspace.c 2011-03-28 22:36:18.606819659 +1100 @@ -23,6 +23,7 @@ #include <linux/fs.h> #include <linux/sysfs.h> #include <linux/mutex.h> +#include <linux/sched.h> /** * A few values needed by the userspace governor @@ -142,6 +143,7 @@ static int cpufreq_governor_userspace(st per_cpu(cpu_cur_freq, cpu)); mutex_unlock(&userspace_mutex); + cpu_scales(cpu); break; case CPUFREQ_GOV_STOP: mutex_lock(&userspace_mutex); @@ -158,6 +160,7 @@ static int cpufreq_governor_userspace(st per_cpu(cpu_set_freq, cpu) = 0; dprintk("managing cpu %u stopped\n", cpu); mutex_unlock(&userspace_mutex); + cpu_nonscaling(cpu); break; case CPUFREQ_GOV_LIMITS: mutex_lock(&userspace_mutex);