- Remove the expiration of old deadlines that has always been there to cope
  with wrapping jiffy counts on 32 bit by using a local 64 bit jiffy
  equivalent on 32 bit builds. This will make a difference for tasks that sit
  idle for a long time, and under conditions of extreme load (>8 x CPUs) on
  all builds.

- Advance the 64 bit jiffy equivalent by the number of jiffies that actually
  elapsed, computed under grq.lock, so that CPUs ticking at the same time do
  not count a tick twice and skipped ticks are not lost.

- Use gjiffies rather than jiffies in task_prio(), since deadlines are now
  based on gjiffies.

- Update documentation to reflect changes.

---
 Documentation/scheduler/sched-BFS.txt |    9 ---
 include/linux/sched.h                 |    4 -
 kernel/sched_bfs.c                    |   89 +++++++++++++++++++++++++---------
 3 files changed, 72 insertions(+), 30 deletions(-)

Index: linux-2.6.35-ck1/include/linux/sched.h
===================================================================
--- linux-2.6.35-ck1.orig/include/linux/sched.h	2010-08-27 09:03:20.385510183 +1000
+++ linux-2.6.35-ck1/include/linux/sched.h	2010-08-27 09:04:55.073532842 +1000
@@ -1197,7 +1197,7 @@ struct task_struct {
 	unsigned int rt_priority;
 #ifdef CONFIG_SCHED_BFS
 	int time_slice, first_time_slice;
-	unsigned long deadline;
+	u64 deadline;
 	struct list_head run_list;
 	u64 last_ran;
 	u64 sched_time; /* sched_clock time spent running */
@@ -1546,7 +1546,7 @@ static inline void tsk_cpus_current(stru
 
 static inline void print_scheduler_version(void)
 {
-	printk(KERN_INFO"BFS CPU scheduler v0.323 by Con Kolivas.\n");
+	printk(KERN_INFO"BFS CPU scheduler v0.330 by Con Kolivas.\n");
 }
 
 static inline int iso_task(struct task_struct *p)
Index: linux-2.6.35-ck1/kernel/sched_bfs.c
===================================================================
--- linux-2.6.35-ck1.orig/kernel/sched_bfs.c	2010-08-27 09:03:20.397509927 +1000
+++ linux-2.6.35-ck1/kernel/sched_bfs.c	2010-08-27 15:24:17.391432526 +1000
@@ -156,6 +156,10 @@ struct global_rq {
 	unsigned long qnr; /* queued not running */
 	cpumask_t cpu_idle_map;
 #endif
+#if BITS_PER_LONG < 64
+	unsigned long jiffies;
+	u64 jiffies_64;
+#endif
 };
 
 /* There can be only one */
@@ -531,6 +535,60 @@ static inline void finish_lock_switch(st
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
 /*
+ * In order to have a monotonic clock that does not wrap we keep a 64 bit
+ * tick counter, protected by grq.lock, that is used in place of jiffies on
+ * 32 bit builds.
+ */
+#if BITS_PER_LONG < 64
+static inline void update_gjiffies(void)
+{
+	unsigned long now;
+
+	/* Unlocked fast path; the real accounting happens under grq.lock. */
+	if (grq.jiffies == jiffies)
+		return;
+
+	grq_lock();
+	now = jiffies;
+	/*
+	 * Fold in every tick that has elapsed since the last update. If
+	 * another CPU got here first the difference is zero, so no tick is
+	 * counted twice. Unsigned subtraction stays correct when the 32 bit
+	 * jiffies value wraps.
+	 * NOTE(review): the first call adds jiffies minus grq.jiffies'
+	 * initial value in one step; confirm how grq.jiffies is initialised.
+	 */
+	grq.jiffies_64 += now - grq.jiffies;
+	grq.jiffies = now;
+	grq_unlock();
+}
+
+#define gjiffies (grq.jiffies_64)
+
+#else /* BITS_PER_LONG < 64 */
+static inline void update_gjiffies(void)
+{
+}
+
+#define gjiffies jiffies
+
+#endif /* BITS_PER_LONG < 64 */
+
+/*
+ * Deadlines are stored as u64 values based on gjiffies, so they are compared
+ * directly rather than with the wrap-aware time_before()/time_after().
+ */
+static inline int deadline_before(u64 deadline, u64 time)
+{
+	return (deadline < time);
+}
+
+static inline int deadline_after(u64 deadline, u64 time)
+{
+	return (deadline > time);
+}
+
+/*
  * A task that is queued but not running will be on the grq run list.
  * A task that is not running or queued will not be on the grq run list.
  * A task that is currently running will have ->oncpu set but not on the
@@ -1191,7 +1249,7 @@ static void try_preempt(struct task_stru
 				  cache_distance(this_rq, rq, p);
 
 		if (rq_prio > highest_prio ||
-		    (time_after(offset_deadline, latest_deadline) ||
+		    (deadline_after(offset_deadline, latest_deadline) ||
 		    (offset_deadline == latest_deadline && this_rq == rq))) {
 			latest_deadline = offset_deadline;
 			highest_prio = rq_prio;
@@ -1200,7 +1258,8 @@ static void try_preempt(struct task_stru
 	}
 
 	if (p->prio > highest_prio || (p->prio == highest_prio &&
-	    p->policy == SCHED_NORMAL && !time_before(p->deadline, latest_deadline)))
+	    p->policy == SCHED_NORMAL &&
+	    !deadline_before(p->deadline, latest_deadline)))
 		return;
 
 	/* p gets to preempt highest_prio_rq->curr */
@@ -1212,7 +1271,7 @@ static void try_preempt(struct task_stru
 {
 	if (p->prio < uprq->rq_prio ||
 	    (p->prio == uprq->rq_prio && p->policy == SCHED_NORMAL &&
-	     time_before(p->deadline, uprq->rq_deadline)))
+	     deadline_before(p->deadline, uprq->rq_deadline)))
 		resched_task(uprq->curr);
 }
 #endif /* CONFIG_SMP */
@@ -2258,6 +2317,7 @@ void scheduler_tick(void)
 	sched_clock_tick();
 	update_rq_clock(rq);
 	update_cpu_clock(rq, rq->curr, 1);
+	update_gjiffies();
 	if (!rq_idle(rq))
 		task_running_tick(rq);
 	else
@@ -2323,7 +2383,7 @@ EXPORT_SYMBOL(sub_preempt_count);
 #endif
 
 /*
- * Deadline is "now" in jiffies + (offset by priority). Setting the deadline
+ * Deadline is "now" in gjiffies + (offset by priority). Setting the deadline
  * is the key to everything. It distributes cpu fairly amongst tasks of the
  * same nice value, it proportions cpu according to nice level, it means the
  * task that last woke up the longest ago has the earliest deadline, thus
@@ -2359,7 +2419,7 @@ static inline void time_slice_expired(st
 {
 	reset_first_time_slice(p);
 	p->time_slice = timeslice();
-	p->deadline = jiffies + task_deadline_diff(p);
+	p->deadline = gjiffies + task_deadline_diff(p);
 }
 
 static inline void check_deadline(struct task_struct *p)
@@ -2385,11 +2445,6 @@ static inline void check_deadline(struct
  * earliest deadline.
  * Finally if no SCHED_NORMAL tasks are found, SCHED_IDLEPRIO tasks are
  * selected by the earliest deadline.
- * Once deadlines are expired (jiffies has passed it) tasks are chosen in FIFO
- * order. Note that very few tasks will be FIFO for very long because they
- * only end up that way if they sleep for long or if if there are enough fully
- * cpu bound tasks to push the load to ~8 higher than the number of CPUs for
- * nice 0.
  */
 static inline struct
 task_struct *earliest_deadline_task(struct rq *rq, struct task_struct *idle)
@@ -2419,21 +2474,12 @@ retry:
 		dl = p->deadline + cache_distance(task_rq(p), rq, p);
 
 		/*
-		 * Look for tasks with old deadlines and pick them in FIFO
-		 * order, taking the first one found.
-		 */
-		if (time_is_before_jiffies(dl)) {
-			edt = p;
-			goto out_take;
-		}
-
-		/*
 		 * No rt tasks. Find the earliest deadline task. Now we're in
 		 * O(n) territory. This is what we silenced the compiler for:
 		 * edt will always start as idle.
 		 */
 		if (edt == idle ||
-		    time_before(dl, earliest_deadline)) {
+		    deadline_before(dl, earliest_deadline)) {
 			earliest_deadline = dl;
 			edt = p;
 		}
@@ -3308,7 +3354,8 @@ int task_prio(const struct task_struct *
 	if (prio <= 0)
 		goto out;
 
-	delta = (p->deadline - jiffies) * 40 / longest_deadline_diff();
+	delta = p->deadline - gjiffies;
+	delta = delta * 40 / longest_deadline_diff();
 	if (delta > 0 && delta <= 80)
 		prio += delta;
 	if (idleprio_task(p))
Index: linux-2.6.35-ck1/Documentation/scheduler/sched-BFS.txt
===================================================================
--- linux-2.6.35-ck1.orig/Documentation/scheduler/sched-BFS.txt	2010-08-27 13:37:20.128593008 +1000
+++ linux-2.6.35-ck1/Documentation/scheduler/sched-BFS.txt	2010-08-27 15:08:19.143829226 +1000
@@ -118,12 +118,7 @@ that it has an earlier virtual deadline
 earlier deadline is the key to which task is next chosen for the first and
 second cases. Once a task is descheduled, it is put back on the queue, and an
 O(n) lookup of all queued-but-not-running tasks is done to determine which has
-the earliest deadline and that task is chosen to receive CPU next. The one
-caveat to this is that if a deadline has already passed (jiffies is greater
-than the deadline), the tasks are chosen in FIFO (first in first out) order as
-the deadlines are old and their absolute value becomes decreasingly relevant
-apart from being a flag that they have been asleep and deserve CPU time ahead
-of all later deadlines.
+the earliest deadline and that task is chosen to receive CPU next.
 
 The CPU proportion of different nice tasks works out to be approximately the
 
@@ -353,4 +348,4 @@ of total wall clock time taken and total
 "cpu usage".
 
 
-Con Kolivas Thu Dec 3 2009
+Con Kolivas Fri Aug 27 2010