--- Documentation/sched-design.txt | 11 Documentation/sysctl/kernel.txt | 14 fs/pipe.c | 7 fs/proc/array.c | 2 include/linux/init_task.h | 7 include/linux/sched.h | 29 - kernel/kthread.c | 1 kernel/sched.c | 1124 ++++++++++++---------------------------- kernel/softirq.c | 1 kernel/sysctl.c | 26 kernel/workqueue.c | 2 11 files changed, 423 insertions(+), 801 deletions(-) Index: linux-2.6.22-bfs/include/linux/sched.h =================================================================== --- linux-2.6.22-bfs.orig/include/linux/sched.h 2009-08-13 14:39:47.332620640 +1000 +++ linux-2.6.22-bfs/include/linux/sched.h 2009-08-13 14:40:31.230033654 +1000 @@ -150,8 +150,7 @@ #define EXIT_ZOMBIE 16 #define EXIT_DEAD 32 /* in tsk->state again */ -#define TASK_NONINTERACTIVE 64 -#define TASK_DEAD 128 +#define TASK_DEAD 64 #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) @@ -537,8 +536,11 @@ #define MAX_USER_RT_PRIO 100 #define MAX_RT_PRIO MAX_USER_RT_PRIO +#define PRIO_RANGE (40) +#define NORMAL_PRIO MAX_RT_PRIO +#define PRIO_LIMIT ((NORMAL_PRIO) + 1) -#define MAX_PRIO (MAX_RT_PRIO + 40) +#define MAX_PRIO (MAX_RT_PRIO + PRIO_RANGE) #define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) #define rt_task(p) rt_prio((p)->prio) @@ -809,13 +811,6 @@ struct pipe_inode_info; struct uts_namespace; -enum sleep_type { - SLEEP_NORMAL, - SLEEP_NONINTERACTIVE, - SLEEP_INTERACTIVE, - SLEEP_INTERRUPTED, -}; - struct prio_array; struct task_struct { @@ -835,20 +830,24 @@ int load_weight; /* for niceness load balancing purposes */ int prio, static_prio, normal_prio; struct list_head run_list; - struct prio_array *array; + unsigned long deadline; unsigned short ioprio; #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif - unsigned long sleep_avg; unsigned long long timestamp, last_ran; unsigned long long sched_time; /* sched_clock time spent running */ - enum sleep_type sleep_type; unsigned int policy; cpumask_t cpus_allowed; - unsigned int time_slice, first_time_slice; + /* + * How much this task is entitled to run at the current priority + * before being requeued at a lower priority. + */ + int time_slice; + /* Is this the very first time_slice this task has ever run. */ + unsigned int first_time_slice; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; @@ -1253,7 +1252,7 @@ #endif extern void set_user_nice(struct task_struct *p, long nice); -extern int task_prio(const struct task_struct *p); +extern int task_prio(struct task_struct *p); extern int task_nice(const struct task_struct *p); extern int can_nice(const struct task_struct *p, const int nice); extern int task_curr(const struct task_struct *p); Index: linux-2.6.22-bfs/kernel/sched.c =================================================================== --- linux-2.6.22-bfs.orig/kernel/sched.c 2009-08-13 14:39:47.337622975 +1000 +++ linux-2.6.22-bfs/kernel/sched.c 2009-08-13 14:40:31.233033299 +1000 @@ -16,6 +16,7 @@ * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. 
* 2004-04-02 Scheduler domains code by Nick Piggin + * now Brainfuck deadline scheduling policy by Con Kolivas */ #include @@ -53,8 +54,9 @@ #include #include #include - +#include #include + #include /* @@ -84,148 +86,27 @@ #define USER_PRIO(p) ((p)-MAX_RT_PRIO) #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) +#define SCHED_PRIO(p) ((p)+MAX_RT_PRIO) -/* - * Some helpers for converting nanosecond timing to jiffy resolution - */ -#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) +/* Some helpers for converting to/from various scales.*/ #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +#define MS_TO_NS(TIME) ((TIME) * 1000000) +#define MS_TO_US(TIME) ((TIME) * 1000) +#define US_TO_MS(TIME) ((TIME) / 1000) -/* - * These are the 'tuning knobs' of the scheduler: - * - * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), - * default timeslice is 100 msecs, maximum timeslice is 800 msecs. - * Timeslices get refilled after they expire. - */ -#define MIN_TIMESLICE max(5 * HZ / 1000, 1) -#define DEF_TIMESLICE (100 * HZ / 1000) -#define ON_RUNQUEUE_WEIGHT 30 -#define CHILD_PENALTY 95 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) -#define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) -#define STARVATION_LIMIT (MAX_SLEEP_AVG) -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) - -/* - * If a task is 'interactive' then we reinsert it in the active - * array after it has expired its current timeslice. (it will not - * continue to run immediately, it will still roundrobin with - * other interactive tasks.) - * - * This part scales the interactivity limit depending on niceness. - * - * We scale it linearly, offset by the INTERACTIVE_DELTA delta. - * Here are a few examples of different nice levels: - * - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] - * - * (the X axis represents the possible -5 ... 0 ... +5 dynamic - * priority range a task can explore, a value of '1' means the - * task is rated interactive.) - * - * Ie. nice +19 tasks can never get 'interactive' enough to be - * reinserted into the active array. And only heavily CPU-hog nice -20 - * tasks will be expired. Default nice 0 tasks are somewhere between, - * it takes some effort for them to get interactive, but it's not - * too hard. - */ - -#define CURRENT_BONUS(p) \ - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ - MAX_SLEEP_AVG) - -#define GRANULARITY (10 * HZ / 1000 ? : 1) - -#ifdef CONFIG_SMP -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ - num_online_cpus()) -#else -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? 
: 1) - 1))) -#endif - -#define SCALE(v1,v1_max,v2_max) \ - (v1) * (v2_max) / (v1_max) - -#define DELTA(p) \ - (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ - INTERACTIVE_DELTA) - -#define TASK_INTERACTIVE(p) \ - ((p)->prio <= (p)->static_prio - DELTA(p)) - -#define INTERACTIVE_SLEEP(p) \ - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) - -#define TASK_PREEMPTS_CURR(p, rq) \ - ((p)->prio < (rq)->curr->prio) - -#define SCALE_PRIO(x, prio) \ - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) - -static unsigned int static_prio_timeslice(int static_prio) -{ - if (static_prio < NICE_TO_PRIO(0)) - return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); - else - return SCALE_PRIO(DEF_TIMESLICE, static_prio); -} - -#ifdef CONFIG_SMP -/* - * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) - * Since cpu_power is a 'constant', we can use a reciprocal divide. - */ -static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) -{ - return reciprocal_divide(load, sg->reciprocal_cpu_power); -} - -/* - * Each time a sched group cpu_power is changed, - * we must compute its reciprocal value - */ -static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) -{ - sg->__cpu_power += val; - sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); -} -#endif +#define TASK_PREEMPTS_CURR(p, curr) ((p)->prio < (curr)->prio) /* - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] - * to time slice values: [800ms ... 100ms ... 5ms] - * - * The higher a thread's priority, the bigger timeslices - * it gets during one round of execution. But even the lowest - * priority thread gets MIN_TIMESLICE worth of execution time. + * This is the time all tasks within the same priority round robin. + * Value is in ms and set to a minimum of 6ms. Scales with number of cpus. + * Tunable via /proc interface. */ +int rr_interval __read_mostly = 6; +int prio_ratios[PRIO_RANGE] __read_mostly; +int prio_multiples[PRIO_RANGE] __read_mostly; -static inline unsigned int task_timeslice(struct task_struct *p) -{ - return static_prio_timeslice(p->static_prio); -} -/* - * These are the runqueue data structures: - */ - -struct prio_array { - unsigned int nr_active; - DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ - struct list_head queue[MAX_PRIO]; -}; +struct rq; /* * This is the main, per-CPU runqueue data structure. @@ -260,14 +141,19 @@ */ unsigned long nr_uninterruptible; - unsigned long expired_timestamp; /* Cached timestamp set by update_cpu_clock() */ unsigned long long most_recent_timestamp; struct task_struct *curr, *idle; unsigned long next_balance; struct mm_struct *prev_mm; - struct prio_array *active, *expired, arrays[2]; - int best_expired_prio; + + unsigned long queued_deadline; + unsigned long latest_deadline; + /* Tasks queued at each priority */ + struct list_head queue[PRIO_LIMIT]; + DECLARE_BITMAP(prio_bitmap, PRIO_LIMIT + 1); + struct list_head *gqueue; + atomic_t nr_iowait; #ifdef CONFIG_SMP @@ -606,12 +492,9 @@ #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) /* * Called when a process is dequeued from the active array and given - * the cpu. We should note that with the exception of interactive - * tasks, the expired queue will become the active queue after the active - * queue is empty, without explicitly dequeuing and requeuing tasks in the - * expired queue. 
(Interactive tasks may be requeued directly to the - * active queue, thus delaying tasks in the expired queue from running; - * see scheduler_tick()). + * the cpu. We should note that the expired queue will become the active + * queue after the active queue is empty, without explicitly dequeuing and + * requeuing tasks in the expired queue. * * This function is only called from sched_info_arrive(), rather than * dequeue_task(). Even though a task may be queued and dequeued multiple @@ -709,71 +592,71 @@ #define sched_info_switch(t, next) do { } while (0) #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ +static inline int task_queued(struct task_struct *task) +{ + return !list_empty(&task->run_list); +} + /* - * Adding/removing a task to/from a priority array: + * Removing from a runqueue. */ -static void dequeue_task(struct task_struct *p, struct prio_array *array) +static void dequeue_task(struct task_struct *p, struct rq *rq) { - array->nr_active--; - list_del(&p->run_list); - if (list_empty(array->queue + p->prio)) - __clear_bit(p->prio, array->bitmap); + list_del_init(&p->run_list); + if (list_empty(rq->queue + p->prio)) + __clear_bit(p->prio, rq->prio_bitmap); } -static void enqueue_task(struct task_struct *p, struct prio_array *array) +static inline void reset_first_time_slice(struct task_struct *p) { - sched_info_queued(p); - list_add_tail(&p->run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); - array->nr_active++; - p->array = array; + if (unlikely(p->first_time_slice)) + p->first_time_slice = 0; } /* - * Put task to the end of the run list without the overhead of dequeue - * followed by enqueue. + * Adding to a runqueue. time_slice is refilled here. */ -static void requeue_task(struct task_struct *p, struct prio_array *array) +static inline void __enqueue_task(struct task_struct *p, struct rq *rq) { - list_move_tail(&p->run_list, array->queue + p->prio); + /* Check against jiffies in case of wrap */ + if (p->deadline > rq->latest_deadline && + p->deadline > jiffies) + rq->latest_deadline = p->deadline; + __set_bit(p->prio, rq->prio_bitmap); + sched_info_queued(p); } -static inline void -enqueue_task_head(struct task_struct *p, struct prio_array *array) +static void enqueue_task(struct task_struct *p, struct rq *rq) { - list_add(&p->run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); - array->nr_active++; - p->array = array; + __enqueue_task(p, rq); + list_add_tail(&p->run_list, rq->queue + p->prio); +} + +/* Only idle task does this as a real time task*/ +static inline void enqueue_task_head(struct task_struct *p, struct rq *rq) +{ + __enqueue_task(p, rq); + list_add(&p->run_list, rq->queue + p->prio); } /* - * __normal_prio - return the priority that is based on the static - * priority but is modified by bonuses/penalties. - * - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] - * into the -5 ... 0 ... +5 bonus/penalty range. - * - * We use 25% of the full 0...39 priority range so that: - * - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. - * - * Both properties are important to certain workloads. + * On the global queue for SCHED_NORMAL, the list position doesn't matter. 
*/ - -static inline int __normal_prio(struct task_struct *p) +static void requeue_task(struct rq *rq, struct task_struct *p) { - int bonus, prio; + if (rt_task(p)) + list_move_tail(&p->run_list, rq->queue + p->prio); + sched_info_queued(p); +} - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; +static inline int prio_ratio(struct task_struct *p) +{ + return prio_ratios[USER_PRIO(p->static_prio)]; +} - prio = p->static_prio - bonus; - if (prio < MAX_RT_PRIO) - prio = MAX_RT_PRIO; - if (prio > MAX_PRIO-1) - prio = MAX_PRIO-1; - return prio; +static inline int prio_multiple(struct task_struct *p) +{ + return prio_multiples[USER_PRIO(p->static_prio)]; } /* @@ -786,17 +669,19 @@ */ /* - * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE - * If static_prio_timeslice() is ever changed to break this assumption then - * this code will need modification - */ -#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE -#define LOAD_WEIGHT(lp) \ - (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) -#define PRIO_TO_LOAD_WEIGHT(prio) \ - LOAD_WEIGHT(static_prio_timeslice(prio)) -#define RTPRIO_TO_LOAD_WEIGHT(rp) \ - (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) + * task_timeslice + */ +static int task_timeslice(struct task_struct *p) +{ + return (rr_interval * prio_ratio(p) / 100); +} + +/* + * The load weight is basically the task_timeslice in ms. Realtime tasks are + * special cased to be proportionately larger than nice -20 by their + * rt_priority. The weight for rt tasks can only be arbitrary at best. + */ +#define RTPRIO_TO_LOAD_WEIGHT(rp) (rr_interval * (40 + rp)) static void set_load_weight(struct task_struct *p) { @@ -813,7 +698,7 @@ #endif p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); } else - p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); + p->load_weight = task_timeslice(p); } static inline void @@ -841,28 +726,34 @@ } /* - * Calculate the expected normal priority: i.e. priority - * without taking RT-inheritance into account. Might be - * boosted by interactivity modifiers. Changes upon fork, - * setprio syscalls, and whenever the interactivity - * estimator recalculates. + * __activate_task - move a task to the runqueue. */ -static inline int normal_prio(struct task_struct *p) +static inline void __activate_task(struct task_struct *p, struct rq *rq) { - int prio; + enqueue_task(p, rq); + inc_nr_running(p, rq); +} +/* + * __activate_idle_task - move idle task to the _front_ of runqueue. + */ +static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) +{ + enqueue_task_head(p, rq); + inc_nr_running(p, rq); +} + +static inline int normal_prio(struct task_struct *p) +{ if (has_rt_policy(p)) - prio = MAX_RT_PRIO-1 - p->rt_priority; - else - prio = __normal_prio(p); - return prio; + return MAX_RT_PRIO - 1 - p->rt_priority; + return NORMAL_PRIO; } /* * Calculate the current priority, i.e. the priority * taken into account by the scheduler. This value might - * be boosted by RT tasks, or might be boosted by - * interactivity modifiers. Will be RT if the task got + * be boosted by RT tasks as it will be RT if the task got * RT-boosted. If not then it returns p->normal_prio. */ static int effective_prio(struct task_struct *p) @@ -879,111 +770,12 @@ } /* - * __activate_task - move a task to the runqueue. 
- */ -static void __activate_task(struct task_struct *p, struct rq *rq) -{ - struct prio_array *target = rq->active; - - if (batch_task(p)) - target = rq->expired; - enqueue_task(p, target); - inc_nr_running(p, rq); -} - -/* - * __activate_idle_task - move idle task to the _front_ of runqueue. - */ -static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) -{ - enqueue_task_head(p, rq->active); - inc_nr_running(p, rq); -} - -/* - * Recalculate p->normal_prio and p->prio after having slept, - * updating the sleep-average too: - */ -static int recalc_task_prio(struct task_struct *p, unsigned long long now) -{ - /* Caller must always ensure 'now >= p->timestamp' */ - unsigned long sleep_time = now - p->timestamp; - - if (batch_task(p)) - sleep_time = 0; - - if (likely(sleep_time > 0)) { - /* - * This ceiling is set to the lowest priority that would allow - * a task to be reinserted into the active array on timeslice - * completion. - */ - unsigned long ceiling = INTERACTIVE_SLEEP(p); - - if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { - /* - * Prevents user tasks from achieving best priority - * with one single large enough sleep. - */ - p->sleep_avg = ceiling; - /* - * Using INTERACTIVE_SLEEP() as a ceiling places a - * nice(0) task 1ms sleep away from promotion, and - * gives it 700ms to round-robin with no chance of - * being demoted. This is more than generous, so - * mark this sleep as non-interactive to prevent the - * on-runqueue bonus logic from intervening should - * this task not receive cpu immediately. - */ - p->sleep_type = SLEEP_NONINTERACTIVE; - } else { - /* - * Tasks waking from uninterruptible sleep are - * limited in their sleep_avg rise as they - * are likely to be waiting on I/O - */ - if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { - if (p->sleep_avg >= ceiling) - sleep_time = 0; - else if (p->sleep_avg + sleep_time >= - ceiling) { - p->sleep_avg = ceiling; - sleep_time = 0; - } - } - - /* - * This code gives a bonus to interactive tasks. - * - * The boost works by updating the 'average sleep time' - * value here, based on ->timestamp. The more time a - * task spends sleeping, the higher the average gets - - * and the higher the priority boost gets as well. - */ - p->sleep_avg += sleep_time; - - } - if (p->sleep_avg > NS_MAX_SLEEP_AVG) - p->sleep_avg = NS_MAX_SLEEP_AVG; - } - - return effective_prio(p); -} - -/* - * activate_task - move a task to the runqueue and do priority recalculation - * - * Update all the scheduling statistics stuff. (sleep average - * calculation, priority modifiers, etc.) + * activate_task - move a task to the runqueue */ static void activate_task(struct task_struct *p, struct rq *rq, int local) { - unsigned long long now; - - if (rt_task(p)) - goto out; + unsigned long long now = sched_clock(); - now = sched_clock(); #ifdef CONFIG_SMP if (!local) { /* Compensate for drifting sched_clock */ @@ -1004,32 +796,9 @@ (now - p->timestamp) >> 20); } - p->prio = recalc_task_prio(p, now); - - /* - * This checks to make sure it's not an uninterruptible task - * that is now waking up. - */ - if (p->sleep_type == SLEEP_NORMAL) { - /* - * Tasks which were woken up by interrupts (ie. hw events) - * are most likely of interactive nature. 
So we give them - * the credit of extending their sleep time to the period - * of time they spend on the runqueue, waiting for execution - * on a CPU, first time around: - */ - if (in_interrupt()) - p->sleep_type = SLEEP_INTERRUPTED; - else { - /* - * Normal first-time wakeups get a credit too for - * on-runqueue time, but it will be weighted down: - */ - p->sleep_type = SLEEP_INTERACTIVE; - } - } + set_load_weight(p); + p->prio = effective_prio(p); p->timestamp = now; -out: __activate_task(p, rq); } @@ -1039,8 +808,7 @@ static void deactivate_task(struct task_struct *p, struct rq *rq) { dec_nr_running(p, rq); - dequeue_task(p, p->array); - p->array = NULL; + dequeue_task(p, rq); } /* @@ -1133,7 +901,7 @@ * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. */ - if (!p->array && !task_running(rq, p)) { + if (!task_queued(p) && !task_running(rq, p)) { set_task_cpu(p, dest_cpu); return 0; } @@ -1159,7 +927,6 @@ { unsigned long flags; struct rq *rq; - struct prio_array *array; int running; repeat: @@ -1192,7 +959,6 @@ */ rq = task_rq_lock(p, &flags); running = task_running(rq, p); - array = p->array; task_rq_unlock(rq, &flags); /* @@ -1215,7 +981,7 @@ * running right now), it's preempted, and we should * yield - it could be a while. */ - if (unlikely(array)) { + if (unlikely(task_queued(p))) { yield(); goto repeat; } @@ -1294,6 +1060,25 @@ } /* + * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) + * Since cpu_power is a 'constant', we can use a reciprocal divide. + */ +static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) +{ + return reciprocal_divide(load, sg->reciprocal_cpu_power); +} + +/* + * Each time a sched group cpu_power is changed, + * we must compute its reciprocal value + */ +static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) +{ + sg->__cpu_power += val; + sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); +} + +/* * find_idlest_group finds and returns the least busy CPU group within the * domain. */ @@ -1490,6 +1275,37 @@ } #endif +/* + * We need to have a special definition for an idle runqueue when testing + * for preemption on CONFIG_HOTPLUG_CPU as the idle task may be scheduled as + * a realtime task in sched_idle_next. 
+ */ +#ifdef CONFIG_HOTPLUG_CPU +#define rq_idle(rq, curr) ((curr) == (rq)->idle && !rt_task(curr)) +#else +#define rq_idle(rq, curr) ((curr) == (rq)->idle) +#endif + +static inline int task_preempts_curr(struct task_struct *p, struct rq *rq) +{ + struct task_struct *curr = rq->curr; + int preempts = 0; + + if (rq_idle(rq, curr)) + preempts = 1; + else if (p->prio < curr->prio) + preempts = 1; + else if (p->prio == curr->prio && p->deadline < rq->queued_deadline) + preempts = 1; + return preempts; +} + +static inline void try_preempt(struct task_struct *p, struct rq *rq) +{ + if (task_preempts_curr(p, rq)) + resched_task(rq->curr); +} + /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread @@ -1521,7 +1337,7 @@ if (!(old_state & state)) goto out; - if (p->array) + if (task_queued(p)) goto out_running; cpu = task_cpu(p); @@ -1614,7 +1430,7 @@ old_state = p->state; if (!(old_state & state)) goto out; - if (p->array) + if (task_queued(p)) goto out_running; this_cpu = smp_processor_id(); @@ -1623,26 +1439,10 @@ out_activate: #endif /* CONFIG_SMP */ - if (old_state == TASK_UNINTERRUPTIBLE) { + if (old_state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; - /* - * Tasks on involuntary sleep don't earn - * sleep_avg beyond just interactive state. - */ - p->sleep_type = SLEEP_NONINTERACTIVE; - } else /* - * Tasks that have marked their sleep as noninteractive get - * woken up with their sleep average not weighted in an - * interactive way. - */ - if (old_state & TASK_NONINTERACTIVE) - p->sleep_type = SLEEP_NONINTERACTIVE; - - - activate_task(p, rq, cpu == this_cpu); - /* * Sync wakeups (i.e. those types of wakeups where the waker * has indicated that it will leave the CPU in short order) * don't trigger a preemption, if the woken up task will run on @@ -1650,10 +1450,9 @@ * the waker guarantees that the freshly woken up task is going * to be considered on this CPU.) */ - if (!sync || cpu != this_cpu) { - if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); - } + activate_task(p, rq, cpu == this_cpu); + if (!sync || cpu != this_cpu) + try_preempt(p, rq); success = 1; out_running: @@ -1676,7 +1475,6 @@ return try_to_wake_up(p, state, 0); } -static void task_running_tick(struct rq *rq, struct task_struct *p); /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. @@ -1704,7 +1502,6 @@ p->prio = current->normal_prio; INIT_LIST_HEAD(&p->run_list); - p->array = NULL; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (unlikely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -1716,30 +1513,31 @@ /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif + if (unlikely(p->policy == SCHED_FIFO)) + goto out; /* * Share the timeslice between parent and child, thus the * total amount of pending timeslices in the system doesn't change, * resulting in more scheduling fairness. */ local_irq_disable(); - p->time_slice = (current->time_slice + 1) >> 1; - /* - * The remainder of the first timeslice might be recovered by - * the parent if the child exits early enough. - */ - p->first_time_slice = 1; - current->time_slice >>= 1; - p->timestamp = sched_clock(); - if (unlikely(!current->time_slice)) { + if (current->time_slice > 0) { + current->time_slice /= 2; + if (current->time_slice) + p->time_slice = current->time_slice; + else + p->time_slice = 1; /* - * This case is rare, it happens when the parent has only - * a single jiffy left from its timeslice. 
Taking the - * runqueue lock is not a problem. + * The remainder of the first timeslice might be recovered by + * the parent if the child exits early enough. */ - current->time_slice = 1; - task_running_tick(cpu_rq(cpu), current); - } + p->first_time_slice = 1; + } else + p->time_slice = 0; + + p->timestamp = sched_clock(); local_irq_enable(); +out: put_cpu(); } @@ -1761,38 +1559,16 @@ this_cpu = smp_processor_id(); cpu = task_cpu(p); - /* - * We decrease the sleep average of forking parents - * and children as well, to keep max-interactive tasks - * from forking tasks that are max-interactive. The parent - * (current) is done further down, under its lock. - */ - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - - p->prio = effective_prio(p); - if (likely(cpu == this_cpu)) { + activate_task(p, rq, 1); if (!(clone_flags & CLONE_VM)) { /* * The VM isn't cloned, so we're in a good position to * do child-runs-first in anticipation of an exec. This * usually avoids a lot of COW overhead. */ - if (unlikely(!current->array)) - __activate_task(p, rq); - else { - p->prio = current->prio; - p->normal_prio = current->normal_prio; - list_add_tail(&p->run_list, ¤t->run_list); - p->array = current->array; - p->array->nr_active++; - inc_nr_running(p, rq); - } set_need_resched(); - } else - /* Run child last */ - __activate_task(p, rq); + } /* * We skip the following code due to cpu == this_cpu * @@ -1809,19 +1585,16 @@ */ p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) + rq->most_recent_timestamp; - __activate_task(p, rq); - if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); + activate_task(p, rq, 0); + try_preempt(p, rq); /* * Parent and child are on different CPUs, now get the - * parent runqueue to update the parent's ->sleep_avg: + * parent runqueue to update the parent's ->flags: */ task_rq_unlock(rq, &flags); this_rq = task_rq_lock(current, &flags); } - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); task_rq_unlock(this_rq, &flags); } @@ -1836,23 +1609,17 @@ */ void fastcall sched_exit(struct task_struct *p) { + struct task_struct *parent; unsigned long flags; struct rq *rq; - /* - * If the child was a (relative-) CPU hog then decrease - * the sleep_avg of the parent as well. - */ - rq = task_rq_lock(p->parent, &flags); - if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { - p->parent->time_slice += p->time_slice; - if (unlikely(p->parent->time_slice > task_timeslice(p))) - p->parent->time_slice = task_timeslice(p); - } - if (p->sleep_avg < p->parent->sleep_avg) - p->parent->sleep_avg = p->parent->sleep_avg / - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / - (EXIT_WEIGHT + 1); + parent = p->parent; + rq = task_rq_lock(parent, &flags); + if (p->first_time_slice > 0 && task_cpu(p) == task_cpu(parent)) { + parent->time_slice += p->time_slice; + if (unlikely(parent->time_slice > MS_TO_US(rr_interval))) + parent->time_slice = MS_TO_US(rr_interval); + } task_rq_unlock(rq, &flags); } @@ -2184,23 +1951,15 @@ * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. 
*/ -static void pull_task(struct rq *src_rq, struct prio_array *src_array, - struct task_struct *p, struct rq *this_rq, - struct prio_array *this_array, int this_cpu) +static void pull_task(struct rq *src_rq, struct task_struct *p, + struct rq *this_rq, int this_cpu) { - dequeue_task(p, src_array); - dec_nr_running(p, src_rq); + deactivate_task(p, src_rq); set_task_cpu(p, this_cpu); - inc_nr_running(p, this_rq); - enqueue_task(p, this_array); + __activate_task(p, this_rq); p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) + this_rq->most_recent_timestamp; - /* - * Note that idle threads have a prio of MAX_PRIO, for this test - * to be always true for them. - */ - if (TASK_PREEMPTS_CURR(p, this_rq)) - resched_task(this_rq->curr); + try_preempt(p, this_rq); } /* @@ -2243,8 +2002,6 @@ return 1; } -#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) - /* * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted * load from busiest to this_rq, as part of a balancing operation within @@ -2257,9 +2014,7 @@ struct sched_domain *sd, enum idle_type idle, int *all_pinned) { - int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, - best_prio_seen, skip_for_load; - struct prio_array *array, *dst_array; + int idx = 0, pulled = 0, pinned = 0; struct list_head *head, *curr; struct task_struct *tmp; long rem_load_move; @@ -2269,74 +2024,28 @@ rem_load_move = max_load_move; pinned = 1; - this_best_prio = rq_best_prio(this_rq); - best_prio = rq_best_prio(busiest); - /* - * Enable handling of the case where there is more than one task - * with the best priority. If the current running task is one - * of those with prio==best_prio we know it won't be moved - * and therefore it's safe to override the skip (based on load) of - * any task we find with that prio. - */ - best_prio_seen = best_prio == busiest->curr->prio; - - /* - * We first consider expired tasks. Those will likely not be - * executed in the near future, and they are most likely to - * be cache-cold, thus switching CPUs has the least effect - * on them. - */ - if (busiest->expired->nr_active) { - array = busiest->expired; - dst_array = this_rq->expired; - } else { - array = busiest->active; - dst_array = this_rq->active; - } - -new_array: - /* Start searching at priority 0: */ - idx = 0; skip_bitmap: - if (!idx) - idx = sched_find_first_bit(array->bitmap); - else - idx = find_next_bit(array->bitmap, MAX_PRIO, idx); - if (idx >= MAX_PRIO) { - if (array == busiest->expired && busiest->active->nr_active) { - array = busiest->active; - dst_array = this_rq->active; - goto new_array; - } + idx = find_next_bit(busiest->prio_bitmap, PRIO_LIMIT, idx); + if (idx >= PRIO_LIMIT) goto out; - } + head = busiest->queue + idx; + - head = array->queue + idx; curr = head->prev; skip_queue: tmp = list_entry(curr, struct task_struct, run_list); curr = curr->prev; - /* - * To help distribute high priority tasks accross CPUs we don't - * skip a task if it will be the highest priority task (i.e. 
smallest - * prio value) on its new queue regardless of its load weight - */ - skip_for_load = tmp->load_weight > rem_load_move; - if (skip_for_load && idx < this_best_prio) - skip_for_load = !best_prio_seen && idx == best_prio; - if (skip_for_load || - !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { - best_prio_seen |= idx == best_prio; if (curr != head) goto skip_queue; idx++; goto skip_bitmap; } - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); + pull_task(busiest, tmp, this_rq, this_cpu); pulled++; rem_load_move -= tmp->load_weight; @@ -2345,8 +2054,6 @@ * and the prescribed amount of weighted load. */ if (pulled < max_nr_move && rem_load_move > 0) { - if (idx < this_best_prio) - this_best_prio = idx; if (curr != head) goto skip_queue; idx++; @@ -2743,6 +2450,7 @@ schedstat_add(sd, lb_imbalance[idle], imbalance); nr_moved = 0; + if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found @@ -3297,11 +3005,36 @@ /* * This is called on clock ticks and on context switches. * Bank in p->sched_time the ns elapsed since the last tick or switch. + * CPU scheduler quota accounting is also performed here in microseconds. + * The value returned from sched_clock() occasionally gives bogus values so + * some sanity checking is required. */ -static inline void -update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) +static void +update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now, + int tick) { - p->sched_time += now - p->last_ran; + long time_diff = now - p->last_ran; + + if (tick) { + /* + * Called from scheduler_tick() there should be less than two + * jiffies worth, and not negative/overflow. + */ + if (time_diff > JIFFIES_TO_NS(2) || time_diff < 0) + time_diff = JIFFIES_TO_NS(1); + } else { + /* + * Called from context_switch there should be less than one + * jiffy worth, and not negative/overflow. There should be + * some time banked here so use a nominal 1us. + */ + if (time_diff > JIFFIES_TO_NS(1) || time_diff < 1) + time_diff = 1000; + } + /* time_slice accounting is done in usecs to avoid overflow on 32bit */ + if (p != rq->idle && p->policy != SCHED_FIFO) + p->time_slice -= time_diff / 1000; + p->sched_time += time_diff; p->last_ran = rq->most_recent_timestamp = now; } @@ -3322,27 +3055,6 @@ } /* - * We place interactive tasks back into the active array, if possible. - * - * To guarantee that this does not starve expired tasks we ignore the - * interactivity of a task if the first expired task had to wait more - * than a 'reasonable' amount of time. This deadline timeout is - * load-dependent, as the frequency of array switched decreases with - * increasing number of running tasks. We also ignore the interactivity - * if a better static_prio task has expired: - */ -static inline int expired_starving(struct rq *rq) -{ - if (rq->curr->static_prio > rq->best_expired_prio) - return 1; - if (!STARVATION_LIMIT || !rq->expired_timestamp) - return 0; - if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) - return 1; - return 0; -} - -/* * Account user cpu time to a process. 
* @p: the process that the cpu time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() @@ -3415,87 +3127,23 @@ cpustat->steal = cputime64_add(cpustat->steal, tmp); } +/* This manages tasks that have run out of timeslice during a scheduler_tick */ static void task_running_tick(struct rq *rq, struct task_struct *p) { - if (p->array != rq->active) { - /* Task has expired but was not scheduled yet */ - set_tsk_need_resched(p); + /* SCHED_FIFO tasks never run out of timeslice. */ + if (p->time_slice > 0 || p->policy == SCHED_FIFO) return; - } + /* p->time_slice <= 0 */ spin_lock(&rq->lock); - /* - * The task was running during this tick - update the - * time slice counter. Note: we do not update a thread's - * priority until it either goes to sleep or uses up its - * timeslice. This makes it possible for interactive tasks - * to use up their timeslices at their highest priority levels. - */ - if (rt_task(p)) { - /* - * RR tasks need a special form of timeslice management. - * FIFO tasks have no timeslices. - */ - if ((p->policy == SCHED_RR) && !--p->time_slice) { - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - set_tsk_need_resched(p); - - /* put it at the end of the queue: */ - requeue_task(p, rq->active); - } - goto out_unlock; - } - if (!--p->time_slice) { - dequeue_task(p, rq->active); - set_tsk_need_resched(p); - p->prio = effective_prio(p); - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - - if (!rq->expired_timestamp) - rq->expired_timestamp = jiffies; - if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { - enqueue_task(p, rq->expired); - if (p->static_prio < rq->best_expired_prio) - rq->best_expired_prio = p->static_prio; - } else - enqueue_task(p, rq->active); - } else { - /* - * Prevent a too long timeslice allowing a task to monopolize - * the CPU. We do this by splitting up the timeslice into - * smaller pieces. - * - * Note: this does not mean the task's timeslices expire or - * get lost in any way, they just might be preempted by - * another task of equal priority. (one with higher - * priority would have preempted this task already.) We - * requeue this task to the end of the list on this priority - * level, which is in essence a round-robin of tasks with - * equal priority. - * - * This only applies to tasks in the interactive - * delta range with at least TIMESLICE_GRANULARITY to requeue. - */ - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - - p->time_slice) % TIMESLICE_GRANULARITY(p)) && - (p->time_slice >= TIMESLICE_GRANULARITY(p)) && - (p->array == rq->active)) { - - requeue_task(p, rq->active); - set_tsk_need_resched(p); - } - } -out_unlock: + if (likely(task_queued(p))) + requeue_task(rq, p); + set_tsk_need_resched(p); spin_unlock(&rq->lock); } /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. - * - * It also gets called by the fork code, when changing the parent's - * timeslices. */ void scheduler_tick(void) { @@ -3505,7 +3153,7 @@ int idle_at_tick = idle_cpu(cpu); struct rq *rq = cpu_rq(cpu); - update_cpu_clock(p, rq, now); + update_cpu_clock(p, rq, now, 1); if (!idle_at_tick) task_running_tick(rq, p); @@ -3554,10 +3202,36 @@ #endif -static inline int interactive_sleep(enum sleep_type sleep_type) +/* + * Deadline is "now" in jiffies + (offset by priority). 
+ */ +static inline void set_deadline(struct task_struct *p) { - return (sleep_type == SLEEP_INTERACTIVE || - sleep_type == SLEEP_INTERRUPTED); + if (p->time_slice > 0 || rt_task(p)) + return; + + reset_first_time_slice(p); + p->time_slice = MS_TO_US(rr_interval); + p->deadline = jiffies + (prio_ratio(p) * rr_interval / 100); +} + +static inline struct task_struct *earliest_deadline_task(struct rq *rq) +{ + struct list_head *tmp; + unsigned long earliest_deadline; + struct task_struct *edt, *p; + + + edt = list_entry(rq->gqueue->next, struct task_struct, run_list); + earliest_deadline = edt->deadline; + list_for_each(tmp, rq->gqueue) { + p = list_entry(tmp, struct task_struct, run_list); + if (p->deadline < earliest_deadline) { + earliest_deadline = p->deadline; + edt = p; + } + } + return edt; } /* @@ -3566,13 +3240,10 @@ asmlinkage void __sched schedule(void) { struct task_struct *prev, *next; - struct prio_array *array; - struct list_head *queue; unsigned long long now; - unsigned long run_time; - int cpu, idx, new_prio; long *switch_count; struct rq *rq; + int cpu, idx; /* * Test if we are atomic. Since do_exit() needs to call into @@ -3608,20 +3279,9 @@ schedstat_inc(rq, sched_cnt); now = sched_clock(); - if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { - run_time = now - prev->timestamp; - if (unlikely((long long)(now - prev->timestamp) < 0)) - run_time = 0; - } else - run_time = NS_MAX_SLEEP_AVG; - - /* - * Tasks charged proportionately less run_time at high sleep_avg to - * delay them losing their interactive status - */ - run_time /= (CURRENT_BONUS(prev) ? : 1); spin_lock_irq(&rq->lock); + set_deadline(prev); switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { @@ -3641,59 +3301,26 @@ idle_balance(cpu, rq); if (!rq->nr_running) { next = rq->idle; - rq->expired_timestamp = 0; goto switch_tasks; } } - array = rq->active; - if (unlikely(!array->nr_active)) { - /* - * Switch the active and expired arrays. 
- */ - schedstat_inc(rq, sched_switch); - rq->active = rq->expired; - rq->expired = array; - array = rq->active; - rq->expired_timestamp = 0; - rq->best_expired_prio = MAX_PRIO; - } - - idx = sched_find_first_bit(array->bitmap); - queue = array->queue + idx; - next = list_entry(queue->next, struct task_struct, run_list); - - if (!rt_task(next) && interactive_sleep(next->sleep_type)) { - unsigned long long delta = now - next->timestamp; - if (unlikely((long long)(now - next->timestamp) < 0)) - delta = 0; - - if (next->sleep_type == SLEEP_INTERACTIVE) - delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; - - array = next->array; - new_prio = recalc_task_prio(next, next->timestamp + delta); - - if (unlikely(next->prio != new_prio)) { - dequeue_task(next, array); - next->prio = new_prio; - enqueue_task(next, array); - } - } - next->sleep_type = SLEEP_NORMAL; + idx = find_first_bit(rq->prio_bitmap, PRIO_LIMIT); + if (likely(idx == NORMAL_PRIO)) + next = earliest_deadline_task(rq); + else + next = list_entry((rq->queue + idx)->next, struct task_struct, run_list); switch_tasks: if (next == rq->idle) schedstat_inc(rq, sched_goidle); + else + rq->queued_deadline = next->deadline; prefetch(next); prefetch_stack(next); clear_tsk_need_resched(prev); rcu_qsctr_inc(task_cpu(prev)); - update_cpu_clock(prev, rq, now); - - prev->sleep_avg -= run_time; - if ((long)prev->sleep_avg <= 0) - prev->sleep_avg = 0; + update_cpu_clock(prev, rq, now, 0); prev->timestamp = prev->last_ran = now; sched_info_switch(prev, next); @@ -3714,7 +3341,6 @@ finish_task_switch(this_rq(), prev); } else spin_unlock_irq(&rq->lock); - prev = current; if (unlikely(reacquire_kernel_lock(prev) < 0)) goto need_resched_nonpreemptible; @@ -4129,29 +3755,22 @@ */ void rt_mutex_setprio(struct task_struct *p, int prio) { - struct prio_array *array; unsigned long flags; + int queued, oldprio; struct rq *rq; - int oldprio; BUG_ON(prio < 0 || prio > MAX_PRIO); rq = task_rq_lock(p, &flags); oldprio = p->prio; - array = p->array; - if (array) - dequeue_task(p, array); + queued = task_queued(p); + if (queued) + dequeue_task(p, rq); p->prio = prio; - if (array) { - /* - * If changing to an RT priority then queue it - * in the active array! - */ - if (rt_task(p)) - array = rq->active; - enqueue_task(p, array); + if (queued) { + enqueue_task(p, rq); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on @@ -4160,8 +3779,8 @@ if (task_running(rq, p)) { if (p->prio > oldprio) resched_task(rq->curr); - } else if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); + } else + try_preempt(p, rq); } task_rq_unlock(rq, &flags); } @@ -4170,8 +3789,7 @@ void set_user_nice(struct task_struct *p, long nice) { - struct prio_array *array; - int old_prio, delta; + int queued, old_prio,delta; unsigned long flags; struct rq *rq; @@ -4192,20 +3810,20 @@ p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } - array = p->array; - if (array) { - dequeue_task(p, array); + queued = task_queued(p); + if (queued) { + dequeue_task(p, rq); dec_raw_weighted_load(rq, p); } p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p); old_prio = p->prio; p->prio = effective_prio(p); + set_load_weight(p); delta = p->prio - old_prio; - if (array) { - enqueue_task(p, array); + if (queued) { + enqueue_task(p, rq); inc_raw_weighted_load(rq, p); /* * If the task increased its priority or is running and @@ -4281,11 +3899,14 @@ * * This is the priority value as seen by users in /proc. 
* RT tasks are offset by -200. Normal tasks are centered - * around 0, value goes from -16 to +15. + * around 1, value goes from 0 to +79. Values higher than + * 39 indicate task is on the expired array. This is done + * lockless and may rarely return an active instead of + * expired value. */ -int task_prio(const struct task_struct *p) +int task_prio(struct task_struct *p) { - return p->prio - MAX_RT_PRIO; + return p->static_prio - MAX_RT_PRIO; } /** @@ -4328,18 +3949,13 @@ /* Actually do priority change: must hold rq lock. */ static void __setscheduler(struct task_struct *p, int policy, int prio) { - BUG_ON(p->array); + BUG_ON(task_queued(p)); p->policy = policy; p->rt_priority = prio; p->normal_prio = normal_prio(p); /* we are holding p->pi_lock already */ p->prio = rt_mutex_getprio(p); - /* - * SCHED_BATCH tasks are treated as perpetual CPU hogs: - */ - if (policy == SCHED_BATCH) - p->sleep_avg = 0; set_load_weight(p); } @@ -4354,8 +3970,7 @@ int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) { - int retval, oldprio, oldpolicy = -1; - struct prio_array *array; + int queued, retval, oldprio, oldpolicy = -1; unsigned long flags; struct rq *rq; @@ -4429,12 +4044,12 @@ spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } - array = p->array; - if (array) + queued = task_queued(p); + if (queued) deactivate_task(p, rq); oldprio = p->prio; __setscheduler(p, policy, param->sched_priority); - if (array) { + if (queued) { __activate_task(p, rq); /* * Reschedule if we are currently running on this runqueue and @@ -4444,8 +4059,8 @@ if (task_running(rq, p)) { if (p->prio > oldprio) resched_task(rq->curr); - } else if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); + } else + try_preempt(p, rq); } __task_rq_unlock(rq); spin_unlock_irqrestore(&p->pi_lock, flags); @@ -4716,42 +4331,20 @@ /** * sys_sched_yield - yield the current processor to other threads. - * - * This function yields the current CPU by moving the calling thread - * to the expired array. If there are no other threads running on this - * CPU then this function will return. */ asmlinkage long sys_sched_yield(void) { struct rq *rq = this_rq_lock(); - struct prio_array *array = current->array, *target = rq->expired; + struct task_struct *p = current; schedstat_inc(rq, yld_cnt); - /* - * We implement yielding by moving the task into the expired - * queue. - * - * (special rule: RT tasks will just roundrobin in the active - * array.) - */ - if (rt_task(current)) - target = rq->active; - - if (array->nr_active == 1) { - schedstat_inc(rq, yld_act_empty); - if (!rq->expired->nr_active) - schedstat_inc(rq, yld_both_empty); - } else if (!rq->expired->nr_active) - schedstat_inc(rq, yld_exp_empty); - - if (array != target) { - dequeue_task(current, array); - enqueue_task(current, target); - } else - /* - * requeue_task is cheaper so perform that if possible. - */ - requeue_task(current, array); + if (rq->nr_running == 1) + schedstat_inc(rq, yld_both_empty); + else { + if (!rt_task(p)) + p->deadline = rq->latest_deadline + 1; + requeue_task(rq, p); + } /* * Since we are going to call schedule() anyway, there's @@ -4959,8 +4552,8 @@ if (retval) goto out_unlock; - jiffies_to_timespec(p->policy == SCHED_FIFO ? - 0 : task_timeslice(p), &t); + t = ns_to_timespec(p->policy == SCHED_FIFO ? 0 : + MS_TO_NS(task_timeslice(p))); read_unlock(&tasklist_lock); retval = copy_to_user(interval, &t, sizeof(t)) ? 
-EFAULT : 0; out_nounlock: @@ -5056,10 +4649,7 @@ struct rq *rq = cpu_rq(cpu); unsigned long flags; - idle->timestamp = sched_clock(); - idle->sleep_avg = 0; - idle->array = NULL; - idle->prio = idle->normal_prio = MAX_PRIO; + idle->timestamp = idle->last_ran = sched_clock(); idle->state = TASK_RUNNING; idle->cpus_allowed = cpumask_of_cpu(cpu); set_task_cpu(idle, cpu); @@ -5178,7 +4768,7 @@ goto out; set_task_cpu(p, dest_cpu); - if (p->array) { + if (task_queued(p)) { /* * Sync timestamp with rq_dest's before activating. * The same thing could be achieved by doing this step @@ -5189,8 +4779,7 @@ + rq_dest->most_recent_timestamp; deactivate_task(p, rq_src); __activate_task(p, rq_dest); - if (TASK_PREEMPTS_CURR(p, rq_dest)) - resched_task(rq_dest->curr); + try_preempt(p, rq_dest); } ret = 1; out: @@ -5415,16 +5004,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu) { struct rq *rq = cpu_rq(dead_cpu); - unsigned int arr, i; + unsigned int i; - for (arr = 0; arr < 2; arr++) { - for (i = 0; i < MAX_PRIO; i++) { - struct list_head *list = &rq->arrays[arr].queue[i]; - - while (!list_empty(list)) - migrate_dead(dead_cpu, list_entry(list->next, - struct task_struct, run_list)); - } + for (i = 0; i < PRIO_LIMIT; i++) { + struct list_head *list = rq->queue + i; + + while (!list_empty(list)) + migrate_dead(dead_cpu, list_entry(list->next, + struct task_struct, run_list)); } } #endif /* CONFIG_HOTPLUG_CPU */ @@ -5487,7 +5074,7 @@ /* Idle task back to normal (off runqueue, low prio) */ rq = task_rq_lock(rq->idle, &flags); deactivate_task(rq->idle, rq); - rq->idle->static_prio = MAX_PRIO; + rq->idle->static_prio = NICE_TO_PRIO(0); __setscheduler(rq->idle, SCHED_NORMAL, 0); migrate_dead_tasks(cpu); task_rq_unlock(rq, &flags); @@ -7013,6 +6600,13 @@ /* Move init over to a non-isolated CPU */ if (set_cpus_allowed(current, non_isolated_cpus) < 0) BUG(); + + /* + * Assume that every added cpu gives us slightly less overall latency + * allowing us to increase the base rr_interval, but in a non linear + * fashion. 
+ */ + rr_interval *= 1 + ilog2(num_online_cpus()); } #else void __init sched_init_smp(void) @@ -7032,20 +6626,24 @@ void __init sched_init(void) { - int i, j, k; + int i, j; int highest_cpu = 0; + prio_ratios[0] = prio_multiples[39] = 100; + for (i = 1 ; i <= PRIO_RANGE ; i++) + prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; + for (i = 38 ; i >= 0 ; i--) + prio_multiples[i] = prio_multiples[i + 1] * 11 / 10; + for_each_possible_cpu(i) { - struct prio_array *array; struct rq *rq; rq = cpu_rq(i); spin_lock_init(&rq->lock); lockdep_set_class(&rq->lock, &rq->rq_lock_key); rq->nr_running = 0; - rq->active = rq->arrays; - rq->expired = rq->arrays + 1; - rq->best_expired_prio = MAX_PRIO; + rq->queued_deadline = 0; + rq->latest_deadline = 0; #ifdef CONFIG_SMP rq->sd = NULL; @@ -7059,18 +6657,14 @@ #endif atomic_set(&rq->nr_iowait, 0); - for (j = 0; j < 2; j++) { - array = rq->arrays + j; - for (k = 0; k < MAX_PRIO; k++) { - INIT_LIST_HEAD(array->queue + k); - __clear_bit(k, array->bitmap); - } - // delimiter for bitsearch - __set_bit(MAX_PRIO, array->bitmap); - } + for (j = 0; j < PRIO_LIMIT; j++) + INIT_LIST_HEAD(rq->queue + j); + bitmap_zero(rq->prio_bitmap, PRIO_LIMIT); + /* delimiter for bitsearch */ + __set_bit(PRIO_LIMIT, rq->prio_bitmap); + rq->gqueue = rq->queue + NORMAL_PRIO; highest_cpu = i; } - set_load_weight(&init_task); #ifdef CONFIG_SMP @@ -7125,10 +6719,10 @@ #ifdef CONFIG_MAGIC_SYSRQ void normalize_rt_tasks(void) { - struct prio_array *array; struct task_struct *g, *p; unsigned long flags; struct rq *rq; + int queued; read_lock_irq(&tasklist_lock); @@ -7139,11 +6733,11 @@ spin_lock_irqsave(&p->pi_lock, flags); rq = __task_rq_lock(p); - array = p->array; - if (array) + queued = task_queued(p); + if (queued) deactivate_task(p, task_rq(p)); __setscheduler(p, SCHED_NORMAL, 0); - if (array) { + if (queued) { __activate_task(p, task_rq(p)); resched_task(rq->curr); } Index: linux-2.6.22-bfs/kernel/sysctl.c =================================================================== --- linux-2.6.22-bfs.orig/kernel/sysctl.c 2009-08-13 14:39:47.341620616 +1000 +++ linux-2.6.22-bfs/kernel/sysctl.c 2009-08-13 14:40:31.237032547 +1000 @@ -78,6 +78,7 @@ extern int compat_log; extern int maps_protect; extern int sysctl_stat_interval; +extern int rr_interval; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -161,6 +162,14 @@ #endif +/* Constants for minimum and maximum testing. + We use these as one-element integer vectors. */ +static int __read_mostly zero; +static int __read_mostly one = 1; +static int __read_mostly one_hundred = 100; +static int __read_mostly five_thousand = 5000; + + /* The default sysctl tables: */ static ctl_table root_table[] = { @@ -501,6 +510,17 @@ .mode = 0444, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "rr_interval", + .data = &rr_interval, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &one, + .extra2 = &five_thousand, + }, #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { .ctl_name = KERN_UNKNOWN_NMI_PANIC, @@ -619,12 +639,6 @@ { .ctl_name = 0 } }; -/* Constants for minimum and maximum testing in vm_table. - We use these as one-element integer vectors. 
*/ -static int zero; -static int one_hundred = 100; - - static ctl_table vm_table[] = { { .ctl_name = VM_OVERCOMMIT_MEMORY, Index: linux-2.6.22-bfs/Documentation/sysctl/kernel.txt =================================================================== --- linux-2.6.22-bfs.orig/Documentation/sysctl/kernel.txt 2009-08-13 14:39:47.321626373 +1000 +++ linux-2.6.22-bfs/Documentation/sysctl/kernel.txt 2009-08-13 14:40:31.282261335 +1000 @@ -43,6 +43,7 @@ - printk - real-root-dev ==> Documentation/initrd.txt - reboot-cmd [ SPARC only ] +- rr_interval - rtsig-max - rtsig-nr - sem @@ -288,6 +289,19 @@ ============================================================== +rr_interval: + +This is the smallest duration that any cpu process scheduling unit +will run for. Increasing this value can increase throughput of cpu +bound tasks substantially but at the expense of increased latencies +overall. This value is in milliseconds and the default value chosen +depends on the number of cpus available at scheduler initialisation +with a minimum of 6. + +Valid values are from 1-5000. + +============================================================== + rtsig-max & rtsig-nr: The file rtsig-max can be used to tune the maximum number Index: linux-2.6.22-bfs/fs/pipe.c =================================================================== --- linux-2.6.22-bfs.orig/fs/pipe.c 2009-08-13 14:39:47.303620754 +1000 +++ linux-2.6.22-bfs/fs/pipe.c 2009-08-13 14:40:31.299139830 +1000 @@ -41,12 +41,7 @@ { DEFINE_WAIT(wait); - /* - * Pipes are system-local resources, so sleeping on them - * is considered a noninteractive wait: - */ - prepare_to_wait(&pipe->wait, &wait, - TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); + prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); if (pipe->inode) mutex_unlock(&pipe->inode->i_mutex); schedule(); Index: linux-2.6.22-bfs/fs/proc/array.c =================================================================== --- linux-2.6.22-bfs.orig/fs/proc/array.c 2009-08-13 14:39:47.309620486 +1000 +++ linux-2.6.22-bfs/fs/proc/array.c 2009-08-13 14:40:31.315974168 +1000 @@ -165,7 +165,6 @@ rcu_read_lock(); buffer += sprintf(buffer, "State:\t%s\n" - "SleepAVG:\t%lu%%\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" @@ -173,7 +172,6 @@ "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), - (p->sleep_avg/1024)*100/(1020000000/1024), p->tgid, p->pid, pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, pid_alive(p) && p->ptrace ? 
rcu_dereference(p->parent)->pid : 0, Index: linux-2.6.22-bfs/include/linux/init_task.h =================================================================== --- linux-2.6.22-bfs.orig/include/linux/init_task.h 2009-08-13 14:39:47.326621254 +1000 +++ linux-2.6.22-bfs/include/linux/init_task.h 2009-08-13 14:40:31.332907824 +1000 @@ -122,16 +122,17 @@ .usage = ATOMIC_INIT(2), \ .flags = 0, \ .lock_depth = -1, \ - .prio = MAX_PRIO-20, \ + .prio = NORMAL_PRIO, \ .static_prio = MAX_PRIO-20, \ - .normal_prio = MAX_PRIO-20, \ + .normal_prio = NORMAL_PRIO, \ + .deadline = 0, \ .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ .mm = NULL, \ .active_mm = &init_mm, \ .run_list = LIST_HEAD_INIT(tsk.run_list), \ .ioprio = 0, \ - .time_slice = HZ, \ + .time_slice = 10000, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ Index: linux-2.6.22-bfs/kernel/softirq.c =================================================================== --- linux-2.6.22-bfs.orig/kernel/softirq.c 2009-08-13 14:39:47.351620596 +1000 +++ linux-2.6.22-bfs/kernel/softirq.c 2009-08-13 14:43:00.214907216 +1000 @@ -488,7 +488,6 @@ static int ksoftirqd(void * __bind_cpu) { - set_user_nice(current, 19); current->flags |= PF_NOFREEZE; set_current_state(TASK_INTERRUPTIBLE); Index: linux-2.6.22-bfs/kernel/workqueue.c =================================================================== --- linux-2.6.22-bfs.orig/kernel/workqueue.c 2009-08-13 14:39:47.357620604 +1000 +++ linux-2.6.22-bfs/kernel/workqueue.c 2009-08-13 14:40:31.366595096 +1000 @@ -285,8 +285,6 @@ if (!cwq->wq->freezeable) current->flags |= PF_NOFREEZE; - set_user_nice(current, -5); - for (;;) { prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); if (!freezing(current) && Index: linux-2.6.22-bfs/kernel/kthread.c =================================================================== --- linux-2.6.22-bfs.orig/kernel/kthread.c 2009-08-13 14:39:47.346623243 +1000 +++ linux-2.6.22-bfs/kernel/kthread.c 2009-08-13 14:40:31.383463591 +1000 @@ -223,7 +223,6 @@ ignore_signals(tsk); - set_user_nice(tsk, -5); set_cpus_allowed(tsk, CPU_MASK_ALL); } Index: linux-2.6.22-bfs/Documentation/sched-design.txt =================================================================== --- linux-2.6.22-bfs.orig/Documentation/sched-design.txt 2009-08-13 14:40:41.322907863 +1000 +++ linux-2.6.22-bfs/Documentation/sched-design.txt 2009-08-13 14:42:23.943907794 +1000 @@ -163,3 +163,14 @@ code is smaller than the old one. Ingo + + +All replaced with the BrainFuck Scheduler. + +Ridiculously simple code implementing an earliest deadline first, +starvation free, strict fairness O(n) lookup policy. + +One tunable only, rr_interval in milliseconds. +see: Documentation/sysctl/kernel.txt + +-ck
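
To make the O(n) lookup above concrete: when the first set bit in the runqueue's priority bitmap is the single SCHED_NORMAL level, schedule() calls earliest_deadline_task(), which walks the global queue once and keeps the task with the smallest virtual deadline. Below is a self-contained userspace sketch of that scan; struct demo_task, its field names and the sample values are illustrative only, not the kernel's.

#include <stdio.h>

/* toy stand-in for task_struct/run_list, for illustration only */
struct demo_task {
	const char *comm;
	unsigned long deadline;		/* virtual deadline in jiffies */
	struct demo_task *next;
};

/*
 * One unsorted pass over the queued SCHED_NORMAL tasks, remembering the
 * earliest deadline seen so far -- the O(n) earliest-deadline-first pick.
 */
static struct demo_task *earliest_deadline(struct demo_task *head)
{
	struct demo_task *edt = head, *p;

	if (!head)
		return NULL;
	for (p = head->next; p; p = p->next)
		if (p->deadline < edt->deadline)
			edt = p;
	return edt;
}

int main(void)
{
	struct demo_task c = { "make", 1090, NULL };
	struct demo_task b = { "firefox", 1012, &c };
	struct demo_task a = { "bash", 1055, &b };

	printf("next: %s\n", earliest_deadline(&a)->comm);	/* firefox */
	return 0;
}

Real-time tasks never reach this scan: they occupy the lower bits of the bitmap and are taken straight off their own priority list, so only SCHED_NORMAL pays the O(n) cost.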
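
The deadlines themselves come from the single rr_interval tunable scaled by the prio_ratios[] table that sched_init() fills in (roughly 10% more per nice level). Here is a minimal userspace sketch of that arithmetic, mirroring task_timeslice() and the offset used in set_deadline(); the helper names are made up for the example.

#include <stdio.h>

#define PRIO_RANGE 40

static int rr_interval = 6;		/* the one tunable, in milliseconds */
static int prio_ratios[PRIO_RANGE];

static void init_prio_ratios(void)
{
	int i;

	prio_ratios[0] = 100;		/* nice -20 is the baseline */
	for (i = 1; i < PRIO_RANGE; i++)
		prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;
}

/* user_prio 0 == nice -20 ... 39 == nice +19 */
static int timeslice_ms(int user_prio)
{
	return rr_interval * prio_ratios[user_prio] / 100;
}

/* deadline offset added to jiffies, as in set_deadline() */
static unsigned long deadline_offset(int user_prio)
{
	return (unsigned long)prio_ratios[user_prio] * rr_interval / 100;
}

int main(void)
{
	int nice;

	init_prio_ratios();
	for (nice = -20; nice <= 19; nice++)
		printf("nice %3d: timeslice %4d ms, deadline offset %4lu\n",
		       nice, timeslice_ms(nice + 20),
		       deadline_offset(nice + 20));
	return 0;
}

With the default rr_interval of 6, nice -20 gets a 6ms slice and the nearest deadline; each step towards nice +19 pushes the deadline about 10% further out, which is how niceness is expressed without any interactivity estimation. rr_interval itself can be raised at runtime through /proc/sys/kernel/rr_interval, within the 1-5000 range the sysctl enforces.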