- Major overhaul of queued changes. - Micro-optimise multiplications/divisions to be shifts where suitable. - Change ISO calculations to have their own lock so as to not grab the grq lock during a scheduler tick. - Change all deadline accounting to use microsecond values. - Introduce local miffies variable which is updated from any runqueue using the TSC clock whenever the grq lock is taken. Use miffies to compare deadlines to. This will give much more granular deadlines when jiffies are low resolution such as 100Hz, and rarely will tasks have the same deadlines now. - Drop the "skip_clock_update" concept as we update the miffies each time we update the rq clocks, thus we want to update it more often. - Rework try_preempt as SCHED_BATCH and SCHED_IDLEPRIO were not acting as cooperative multitasking policies as they were supposed to. - Bypass rechecking deadline when we know that prev will run again in schedule. - Check to see if prev can run on an idle CPU when being descheduled as may happen when a task must use a certain CPU for affinity reasons. -ck --- kernel/sched_bfs.c | 288 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 186 insertions(+), 102 deletions(-) Index: linux-2.6.35-bfs/kernel/sched_bfs.c =================================================================== --- linux-2.6.35-bfs.orig/kernel/sched_bfs.c 2010-09-03 14:52:58.189995880 +1000 +++ linux-2.6.35-bfs/kernel/sched_bfs.c 2010-09-03 14:53:31.374451446 +1000 @@ -106,10 +106,14 @@ #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) #define SCHED_PRIO(p) ((p)+MAX_RT_PRIO) -/* Some helpers for converting to/from various scales.*/ +/* + * Some helpers for converting to/from various scales. Use shifts to get + * approximate multiples of ten for less overhead. 
+ */ #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) -#define MS_TO_NS(TIME) ((TIME) * 1000000) -#define MS_TO_US(TIME) ((TIME) * 1000) +#define MS_TO_NS(TIME) ((TIME) << 20) +#define MS_TO_US(TIME) ((TIME) << 10) +#define NS_TO_US(TIME) ((TIME) >> 10) /* * This is the time all tasks within the same priority round robin. @@ -140,8 +144,9 @@ static inline unsigned long timeslice(vo } /* - * The global runqueue data that all CPUs work off. All data is protected - * by grq.lock. + * The global runqueue data that all CPUs work off. Data is protected either + * by the global grq lock, or the discrete lock that precedes the data in this + * struct. */ struct global_rq { raw_spinlock_t lock; @@ -150,17 +155,17 @@ struct global_rq { unsigned long long nr_switches; struct list_head queue[PRIO_LIMIT]; DECLARE_BITMAP(prio_bitmap, PRIO_LIMIT + 1); - int iso_ticks; - int iso_refractory; #ifdef CONFIG_SMP unsigned long qnr; /* queued not running */ cpumask_t cpu_idle_map; int idle_cpus; #endif -#if BITS_PER_LONG < 64 - unsigned long jiffies; - u64 jiffies_64; -#endif + /* Microsecond jiffies */ + u64 miffies; + + raw_spinlock_t iso_lock; + int iso_ticks; + int iso_refractory; }; /* There can be only one */ @@ -177,7 +182,6 @@ struct rq { unsigned char in_nohz_recently; #endif #endif - unsigned int skip_clock_update; struct task_struct *curr, *idle; struct mm_struct *prev_mm; @@ -213,9 +217,10 @@ struct rq { /* See if all cache siblings are idle */ cpumask_t cache_siblings; #endif + u64 last_miffy; /* Last time this RQ updated grq.miffies */ #endif - u64 clock; + u64 clock, old_clock; #ifdef CONFIG_SCHEDSTATS /* latency stats */ @@ -286,15 +291,6 @@ struct root_domain { static struct root_domain def_root_domain; #endif -static inline int cpu_of(struct rq *rq) -{ -#ifdef CONFIG_SMP - return rq->cpu; -#else - return 0; -#endif -} - #define rcu_dereference_check_sched_domain(p) \ rcu_dereference_check((p), \ rcu_read_lock_sched_held() || \ @@ -310,17 +306,67 @@ static 
inline int cpu_of(struct rq *rq) #define for_each_domain(cpu, __sd) \ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) +static inline void update_rq_clock(struct rq *rq); + #ifdef CONFIG_SMP #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() (&__get_cpu_var(runqueues)) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +static inline int cpu_of(struct rq *rq) +{ + return rq->cpu; +} + +/* + * Miffies are a globally increasing microsecond counter. Whenever a runqueue + * clock is updated with the grq.lock held, it is an opportunity to update the + * miffies value. Any CPU can update it by adding how much its clock has + * increased since it last updated miffies, minus any added miffies by other + * CPUs. + */ +static inline void update_clocks(struct rq *rq) +{ + long udiff; + + update_rq_clock(rq); + udiff = rq->clock - rq->old_clock; + /* old_clock is only updated when we are updating miffies */ + rq->old_clock = rq->clock; + udiff = NS_TO_US(udiff); + udiff -= grq.miffies - rq->last_miffy; + /* + * Sanity check should sched_clock return bogus values or be limited to + * just jiffy resolution. Some time will always have passed. 
+ */ + if (unlikely(udiff < 1 || udiff > MS_TO_US(rr_interval))) + udiff = 1; + grq.miffies += udiff; + rq->last_miffy = grq.miffies; +} #else /* CONFIG_SMP */ static struct rq *uprq; #define cpu_rq(cpu) (uprq) #define this_rq() (uprq) #define task_rq(p) (uprq) #define cpu_curr(cpu) ((uprq)->curr) +static inline int cpu_of(struct rq *rq) +{ + return 0; +} + +static inline void update_clocks(struct rq *rq) +{ + long udiff; + + update_rq_clock(rq); + udiff = rq->clock - rq->old_clock; + rq->old_clock = rq->clock; + udiff = NS_TO_US(udiff); + if (unlikely(udiff < 1 || udiff > MS_TO_US(rr_interval))) + udiff = 1; + grq.miffies += udiff; +} #endif #define raw_rq() (&__raw_get_cpu_var(runqueues)) @@ -335,13 +381,13 @@ static struct rq *uprq; /* * All common locking functions performed on grq.lock. rq->clock is local to - * the cpu accessing it so it can be modified just with interrupts disabled, - * but looking up task_rq must be done under grq.lock to be safe. + * the cpu accessing it so it can be modified just with interrupts disabled + * when we're not updating miffies. + * Looking up task_rq must be done under grq.lock to be safe. 
*/ -inline void update_rq_clock(struct rq *rq) +static inline void update_rq_clock(struct rq *rq) { - if (!rq->skip_clock_update) - rq->clock = sched_clock_cpu(cpu_of(rq)); + rq->clock = sched_clock_cpu(cpu_of(rq)); } static inline int task_running(struct task_struct *p) @@ -370,8 +416,8 @@ static inline void grq_lock_irq(void) static inline void time_lock_grq(struct rq *rq) __acquires(grq.lock) { - update_rq_clock(rq); grq_lock(); + update_clocks(rq); } static inline void grq_unlock_irq(void) @@ -405,7 +451,7 @@ static inline struct rq __acquires(grq.lock) { struct rq *rq = task_grq_lock(p, flags); - update_rq_clock(rq); + update_clocks(rq); return rq; } @@ -420,7 +466,7 @@ static inline void time_task_grq_lock_ir __acquires(grq.lock) { struct rq *rq = task_grq_lock_irq(p); - update_rq_clock(rq); + update_clocks(rq); } static inline void task_grq_unlock_irq(void) @@ -515,33 +561,6 @@ static inline void finish_lock_switch(st } #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ -/* - * In order to have a monotonic clock that does not wrap we have a 64 bit - * unsigned long that's protected by grq.lock used in place of jiffies on - * 32 bit builds. - */ -#if BITS_PER_LONG < 64 -static inline void update_gjiffies(void) -{ - if (grq.jiffies != jiffies) { - grq_lock(); - grq.jiffies = jiffies; - grq.jiffies_64++; - grq_unlock(); - } -} - -#define gjiffies (grq.jiffies_64) - -#else /* BITS_PER_LONG < 64 */ -static inline void update_gjiffies(void) -{ -} - -#define gjiffies jiffies - -#endif /* BITS_PER_LONG < 64 */ - static inline int deadline_before(u64 deadline, u64 time) { return (deadline < time); @@ -646,11 +665,11 @@ static inline int task_prio_ratio(struct /* * task_timeslice - all tasks of all priorities get the exact same timeslice * length. CPU distribution is handled by giving different deadlines to - * tasks of different priorities. + * tasks of different priorities. Use 128 as the base value for fast shifts. 
*/ static inline int task_timeslice(struct task_struct *p) { - return (rr_interval * task_prio_ratio(p) / 100); + return (rr_interval * task_prio_ratio(p) / 128); } #ifdef CONFIG_SMP @@ -887,7 +906,7 @@ static int effective_prio(struct task_st */ static void activate_task(struct task_struct *p, struct rq *rq) { - update_rq_clock(rq); + update_clocks(rq); /* * Sleep time is in units of nanosecs, so shift by 20 to get a @@ -1159,6 +1178,32 @@ EXPORT_SYMBOL_GPL(kick_process); #define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) #define task_idle(p) ((p)->prio == PRIO_LIMIT) +/* + * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the + * basis of earlier deadlines. SCHED_BATCH, ISO and IDLEPRIO don't preempt + * between themselves, they cooperatively multitask. An idle rq scores as + * prio PRIO_LIMIT so it is always preempted. latest_deadline and + * highest_prio_rq are initialised only to silence the compiler. When + * all else is equal, still prefer this_rq. + */ +static inline int +can_preempt(struct task_struct *p, int prio, unsigned long deadline, + unsigned int policy) +{ + /* Better static priority RT task or better policy preemption */ + if (p->prio < prio) + return 1; + if (p->prio > prio) + return 0; + /* BATCH tasks cooperatively multitask */ + if (p->policy == SCHED_BATCH && policy != SCHED_IDLEPRIO) + return 0; + /* SCHED_ISO and SCHED_NORMAL will preempt based on deadline */ + if (!deadline_before(p->deadline, deadline)) + return 0; + return 1; +} +#ifdef CONFIG_SMP #ifdef CONFIG_HOTPLUG_CPU /* * Check to see if there is a task that is affined only to offline CPUs but @@ -1178,14 +1223,16 @@ static inline int online_cpus(struct tas #endif /* - * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the - * basis of earlier deadlines. SCHED_BATCH, ISO and IDLEPRIO don't preempt - * between themselves, they cooperatively multitask. An idle rq scores as - * prio PRIO_LIMIT so it is always preempted. 
latest_deadline and - * highest_prio_rq are initialised only to silence the compiler. When - * all else is equal, still prefer this_rq. + * Check to see if p can run on cpu, and if not, whether there are any online + * CPUs it can run on instead. */ -#ifdef CONFIG_SMP +static inline int needs_other_cpu(struct task_struct *p, int cpu) +{ + if (unlikely(!cpu_isset(cpu, p->cpus_allowed) && online_cpus(p))) + return 1; + return 0; +} + static void try_preempt(struct task_struct *p, struct rq *this_rq) { struct rq *highest_prio_rq = this_rq; @@ -1193,6 +1240,10 @@ static void try_preempt(struct task_stru int highest_prio; cpumask_t tmp; + /* IDLEPRIO tasks never preempt anything */ + if (p->policy == SCHED_IDLEPRIO) + return; + if (suitable_idle_cpus(p)) { resched_best_idle(p); return; @@ -1228,21 +1279,24 @@ static void try_preempt(struct task_stru } } - if (p->prio > highest_prio || (p->prio == highest_prio && - p->policy == SCHED_NORMAL && - !deadline_before(p->deadline, latest_deadline))) + if (!can_preempt(p, highest_prio, highest_prio_rq->rq_deadline, + highest_prio_rq->rq_policy)) return; - /* p gets to preempt highest_prio_rq->curr */ resched_task(highest_prio_rq->curr); - highest_prio_rq->skip_clock_update = 1; } #else /* CONFIG_SMP */ +static inline int needs_other_cpu(struct task_struct *p, int cpu) +{ + return 0; +} + static void try_preempt(struct task_struct *p, struct rq *this_rq) { - if (p->prio < uprq->rq_prio || - (p->prio == uprq->rq_prio && p->policy == SCHED_NORMAL && - deadline_before(p->deadline, uprq->rq_deadline))) + if (p->policy == SCHED_IDLEPRIO) + return; + if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline, + uprq->rq_policy)) resched_task(uprq->curr); } #endif /* CONFIG_SMP */ @@ -1981,7 +2035,7 @@ update_cpu_clock(struct rq *rq, struct t else if (unlikely(time_diff > JIFFIES_TO_NS(1))) time_diff = JIFFIES_TO_NS(1); - rq->rq_time_slice -= time_diff / 1000; + rq->rq_time_slice -= NS_TO_US(time_diff); } rq->rq_last_ran = rq->timekeep_clock 
= rq->clock; } @@ -1997,7 +2051,7 @@ static u64 do_task_delta_exec(struct tas u64 ns = 0; if (p == rq->curr) { - update_rq_clock(rq); + update_clocks(rq); ns = rq->clock - rq->rq_last_ran; if (unlikely((s64)ns < 0)) ns = 0; @@ -2171,10 +2225,22 @@ void account_idle_ticks(unsigned long ti } #endif +static inline void grq_iso_lock(void) + __acquires(grq.iso_lock) +{ + raw_spin_lock(&grq.iso_lock); +} + +static inline void grq_iso_unlock(void) + __releases(grq.iso_lock) +{ + raw_spin_unlock(&grq.iso_lock); +} + /* * Functions to test for when SCHED_ISO tasks have used their allocated * quota as real time scheduling and convert them back to SCHED_NORMAL. - * Where possible, the data is tested lockless, to avoid grabbing grq_lock + * Where possible, the data is tested lockless, to avoid grabbing iso_lock * because the occasional inaccurate result won't matter. However the * tick data is only ever modified under lock. iso_refractory is only simply * set to 0 or 1 so it's not worth grabbing the lock yet again for that. @@ -2209,21 +2275,21 @@ static unsigned int test_ret_isorefracto static void iso_tick(void) { - grq_lock(); + grq_iso_lock(); grq.iso_ticks += 100; - grq_unlock(); + grq_iso_unlock(); } /* No SCHED_ISO task was running so decrease rq->iso_ticks */ static inline void no_iso_tick(void) { if (grq.iso_ticks) { - grq_lock(); + grq_iso_lock(); grq.iso_ticks -= grq.iso_ticks / ISO_PERIOD + 1; if (unlikely(grq.iso_refractory && grq.iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128))) clear_iso_refractory(); - grq_unlock(); + grq_iso_unlock(); } } @@ -2286,9 +2352,9 @@ void scheduler_tick(void) struct rq *rq = cpu_rq(cpu); sched_clock_tick(); + /* grq lock not grabbed, so only update rq clock */ update_rq_clock(rq); update_cpu_clock(rq, rq->curr, 1); - update_gjiffies(); if (!rq_idle(rq)) task_running_tick(rq); else @@ -2354,7 +2420,7 @@ EXPORT_SYMBOL(sub_preempt_count); #endif /* - * Deadline is "now" in gjiffies + (offset by priority). 
Setting the deadline + * Deadline is "now" in miffies + (offset by priority). Setting the deadline * is the key to everything. It distributes cpu fairly amongst tasks of the * same nice value, it proportions cpu according to nice level, it means the * task that last woke up the longest ago has the earliest deadline, thus @@ -2364,7 +2430,7 @@ EXPORT_SYMBOL(sub_preempt_count); */ static inline int prio_deadline_diff(int user_prio) { - return (prio_ratios[user_prio] * rr_interval * HZ / (1000 * 100)) ? : 1; + return (prio_ratios[user_prio] * rr_interval * (1024 / 128)); } static inline int task_deadline_diff(struct task_struct *p) @@ -2390,7 +2456,7 @@ static inline void time_slice_expired(st { reset_first_time_slice(p); p->time_slice = timeslice(); - p->deadline = gjiffies + task_deadline_diff(p); + p->deadline = grq.miffies + task_deadline_diff(p); } static inline void check_deadline(struct task_struct *p) @@ -2433,7 +2499,7 @@ retry: queue = grq.queue + idx; list_for_each_entry(p, queue, run_list) { /* Make sure cpu affinity is ok */ - if (online_cpus(p) && !cpu_isset(cpu, p->cpus_allowed)) + if (needs_other_cpu(p, cpu)) continue; if (idx < MAX_RT_PRIO) { /* We found an rt task */ @@ -2560,12 +2626,10 @@ need_resched_nonpreemptible: deactivate = 0; schedule_debug(prev); - local_irq_disable(); - update_rq_clock(rq); + grq_lock_irq(); + update_clocks(rq); update_cpu_clock(rq, prev, 0); - rq->skip_clock_update = 0; - grq_lock(); clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { @@ -2581,13 +2645,31 @@ need_resched_nonpreemptible: prev->time_slice = rq->rq_time_slice; prev->deadline = rq->rq_deadline; check_deadline(prev); - return_task(prev, deactivate); - /* Task changed affinity off this cpu */ - if (unlikely(!cpus_intersects(prev->cpus_allowed, - cpumask_of_cpu(cpu)))) { - if (online_cpus(prev)) + prev->last_ran = rq->clock; + + /* Task changed affinity off this CPU */ + if (needs_other_cpu(prev, cpu)) + 
resched_suitable_idle(prev); + else if (!deactivate) { + if (!queued_notrunning()) { + /* + * We now know prev is the only thing that is + * awaiting CPU so we can bypass rechecking for + * the earliest deadline task and just run it + * again. + */ + grq_unlock_irq(); + goto rerun_prev_unlocked; + } else { + /* + * If prev got kicked off by a task that has to + * run on this CPU for affinity reasons then + * there may be an idle CPU it can go to. + */ resched_suitable_idle(prev); } + } + return_task(prev, deactivate); } if (likely(queued_notrunning())) { @@ -2605,8 +2687,6 @@ need_resched_nonpreemptible: else clear_cpuidle_map(cpu); - prev->last_ran = rq->clock; - if (likely(prev != next)) { sched_info_switch(prev, next); perf_event_task_sched_out(prev, next); @@ -2629,6 +2709,7 @@ need_resched_nonpreemptible: } else grq_unlock_irq(); +rerun_prev_unlocked: if (unlikely(reacquire_kernel_lock(current) < 0)) { prev = rq->curr; switch_count = &prev->nivcsw; @@ -3324,7 +3405,7 @@ int task_prio(const struct task_struct * if (prio <= 0) goto out; - delta = p->deadline - gjiffies; + delta = p->deadline - grq.miffies; delta = delta * 40 / longest_deadline_diff(); if (delta > 0 && delta <= 80) prio += delta; @@ -3533,7 +3614,7 @@ recheck: raw_spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } - update_rq_clock(rq); + update_clocks(rq); p->sched_reset_on_fork = reset_on_fork; queued = task_queued(p); @@ -4835,7 +4916,7 @@ migration_call(struct notifier_block *nf __setscheduler(idle, rq, SCHED_NORMAL, 0); idle->prio = PRIO_LIMIT; set_rq_task(rq, idle); - update_rq_clock(rq); + update_clocks(rq); grq_unlock_irq(); break; @@ -6531,12 +6612,14 @@ void __init sched_init(void) int i; struct rq *rq; - prio_ratios[0] = 100; + prio_ratios[0] = 128; for (i = 1 ; i < PRIO_RANGE ; i++) prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; raw_spin_lock_init(&grq.lock); grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0; + grq.miffies = 0; + raw_spin_lock_init(&grq.iso_lock); 
grq.iso_ticks = grq.iso_refractory = 0; #ifdef CONFIG_SMP init_defrootdomain(); @@ -6550,6 +6633,7 @@ void __init sched_init(void) rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc = rq->iowait_pc = rq->idle_pc = 0; #ifdef CONFIG_SMP + rq->last_miffy = 0; rq->sd = NULL; rq->rd = NULL; rq->online = 0;