--- linux-2.6.5-rc2-mm2/include/linux/sched.h 2004-03-25 00:41:25.173117839 +1100 +++ linux-2.6.5-rc2-mm2-stair/include/linux/sched.h 2004-03-25 00:42:29.832829580 +1100 @@ -390,14 +390,13 @@ struct task_struct { struct list_head run_list; prio_array_t *array; - unsigned long sleep_avg; - long interactive_credit; unsigned long long timestamp; - int activated; + unsigned long runtime; + unsigned int deadline; unsigned long policy; cpumask_t cpus_allowed; - unsigned int time_slice, first_time_slice; + unsigned int slice, time_slice, first_time_slice; struct list_head tasks; struct list_head ptrace_children; --- linux-2.6.5-rc2-mm2/fs/proc/array.c 2004-03-25 00:41:25.012143455 +1100 +++ linux-2.6.5-rc2-mm2-stair/fs/proc/array.c 2004-03-25 00:42:29.831829739 +1100 @@ -154,7 +154,6 @@ static inline char * task_state(struct t read_lock(&tasklist_lock); buffer += sprintf(buffer, "State:\t%s\n" - "SleepAVG:\t%lu%%\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" @@ -162,7 +161,6 @@ static inline char * task_state(struct t "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), - (p->sleep_avg/1024)*100/(1000000000/1024), p->tgid, p->pid, p->pid ? p->real_parent->pid : 0, p->pid && p->ptrace ? p->parent->pid : 0, --- linux-2.6.5-rc2-mm2/kernel/sched.c 2004-03-25 00:41:25.212111634 +1100 +++ linux-2.6.5-rc2-mm2-stair/kernel/sched.c 2004-03-25 00:58:33.546517137 +1100 @@ -15,6 +15,9 @@ * and per-CPU runqueues. Cleanups and useful suggestions * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. + * 2004-03-19. New staircase scheduling policy by Con Kolivas with help + * from Zwane Mwaikambo and useful suggestions by + * William Lee Irwin III. */ #include @@ -64,121 +67,17 @@ #define USER_PRIO(p) ((p)-MAX_RT_PRIO) #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) -#define AVG_TIMESLICE (MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) *\ - (MAX_PRIO-1-NICE_TO_PRIO(0))/(MAX_USER_PRIO - 1))) -/* - * Some helpers for converting nanosecond timing to jiffy resolution - */ +//This is the time all tasks within the same priority round robin. +#define RR_INTERVAL (((10 * HZ / 1000) ? : 1) * num_online_cpus()) + #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) -/* - * These are the 'tuning knobs' of the scheduler: - * - * Minimum timeslice is 10 msecs, default timeslice is 100 msecs, - * maximum timeslice is 200 msecs. Timeslices get refilled after - * they expire. - */ -#define MIN_TIMESLICE ( 10 * HZ / 1000) -#define MAX_TIMESLICE (200 * HZ / 1000) -#define ON_RUNQUEUE_WEIGHT 30 -#define CHILD_PENALTY 95 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) -#define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (AVG_TIMESLICE * MAX_BONUS) -#define STARVATION_LIMIT (MAX_SLEEP_AVG) -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) -#define CREDIT_LIMIT 100 - -/* - * If a task is 'interactive' then we reinsert it in the active - * array after it has expired its current timeslice. (it will not - * continue to run immediately, it will still roundrobin with - * other interactive tasks.) - * - * This part scales the interactivity limit depending on niceness. - * - * We scale it linearly, offset by the INTERACTIVE_DELTA delta. - * Here are a few examples of different nice levels: - * - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] - * - * (the X axis represents the possible -5 ... 0 ... +5 dynamic - * priority range a task can explore, a value of '1' means the - * task is rated interactive.) - * - * Ie. nice +19 tasks can never get 'interactive' enough to be - * reinserted into the active array. And only heavily CPU-hog nice -20 - * tasks will be expired. Default nice 0 tasks are somewhere between, - * it takes some effort for them to get interactive, but it's not - * too hard. - */ - -#define CURRENT_BONUS(p) \ - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ - MAX_SLEEP_AVG) - -#ifdef CONFIG_SMP -#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ - num_online_cpus()) -#else -#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) -#endif - -#define SCALE(v1,v1_max,v2_max) \ - (v1) * (v2_max) / (v1_max) - -#define DELTA(p) \ - (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \ - INTERACTIVE_DELTA) - -#define TASK_INTERACTIVE(p) \ - ((p)->prio <= (p)->static_prio - DELTA(p)) - -#define INTERACTIVE_SLEEP(p) \ - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) - -#define HIGH_CREDIT(p) \ - ((p)->interactive_credit > CREDIT_LIMIT) - -#define LOW_CREDIT(p) \ - ((p)->interactive_credit < -CREDIT_LIMIT) - #define TASK_PREEMPTS_CURR(p, rq) \ ((p)->prio < (rq)->curr->prio) /* - * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ] - * to time slice values. - * - * The higher a thread's priority, the bigger timeslices - * it gets during one round of execution. But even the lowest - * priority thread gets MIN_TIMESLICE worth of execution time. - * - * task_timeslice() is the interface that is used by the scheduler. - */ - -#define BASE_TIMESLICE(p) (MIN_TIMESLICE + \ - ((MAX_TIMESLICE - MIN_TIMESLICE) * \ - (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1))) - -static inline unsigned int task_timeslice(task_t *p) -{ - return BASE_TIMESLICE(p); -} - -/* * These are the runqueue data structures: */ @@ -189,7 +88,7 @@ typedef struct runqueue runqueue_t; struct prio_array { unsigned int nr_active; unsigned long bitmap[BITMAP_SIZE]; - struct list_head queue[MAX_PRIO]; + struct list_head queue[MAX_PRIO + 1]; }; /* @@ -211,12 +110,11 @@ struct runqueue { unsigned long cpu_load; #endif unsigned long long nr_switches; - unsigned long expired_timestamp, nr_uninterruptible; + unsigned long nr_uninterruptible; unsigned long long timestamp_last_tick; task_t *curr, *idle; struct mm_struct *prev_mm; - prio_array_t *active, *expired, arrays[2]; - int best_expired_prio; + prio_array_t array; atomic_t nr_iowait; /* For active balancing */ @@ -305,16 +203,18 @@ static inline void rq_unlock(runqueue_t /* * Adding/removing a task to/from a priority array: */ -static inline void dequeue_task(struct task_struct *p, prio_array_t *array) +static inline void dequeue_task(struct task_struct *p, runqueue_t *rq) { + prio_array_t* array = &rq->array; array->nr_active--; list_del(&p->run_list); if (list_empty(array->queue + p->prio)) __clear_bit(p->prio, array->bitmap); } -static inline void enqueue_task(struct task_struct *p, prio_array_t *array) +static inline void enqueue_task(struct task_struct *p, runqueue_t *rq) { + prio_array_t* array = &rq->array; list_add_tail(&p->run_list, array->queue + p->prio); __set_bit(p->prio, array->bitmap); array->nr_active++; @@ -322,155 +222,101 @@ static inline void enqueue_task(struct t } /* - * effective_prio - return the priority that is based on the static - * priority but is modified by bonuses/penalties. - * - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] - * into the -5 ... 0 ... +5 bonus/penalty range. - * - * We use 25% of the full 0...39 priority range so that: - * - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. - * - * Both properties are important to certain workloads. + * __activate_task - move a task to the runqueue. */ -static int effective_prio(task_t *p) +static inline void __activate_task(task_t *p, runqueue_t *rq) { - int bonus, prio; + enqueue_task(p, rq); + nr_running_inc(rq); +} - if (rt_task(p)) - return p->prio; +// deadline - the best deadline rank a task can have. +static inline unsigned int deadline(task_t *p) +{ + unsigned int deadline; + if (unlikely(rt_task(p))) + return p->deadline; + deadline = 40 - TASK_USER_PRIO(p); + return deadline; +} - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; +// slice - the duration a task runs before losing a deadline rank. +static inline unsigned int slice(task_t *p) +{ + unsigned int slice = RR_INTERVAL; + if (likely(!rt_task(p))) + slice *= deadline(p); + return slice; +} - prio = p->static_prio - bonus; +// effective_prio - dynamic priority dependent on deadline rank. +static inline int effective_prio(task_t *p) +{ + unsigned int prio; + if (unlikely(rt_task(p))) + return p->prio; + + prio = MAX_PRIO - 1 - p->deadline; if (prio < MAX_RT_PRIO) prio = MAX_RT_PRIO; - if (prio > MAX_PRIO-1) - prio = MAX_PRIO-1; return prio; } /* - * __activate_task - move a task to the runqueue. + * first_time_slice - is the duration a task spends at the highest step + * on the "staircase" of the scheduler. If it's deadline rank is less + * than the best rank the duration is longer to maintain appropriate + * cpu distribution. */ -static inline void __activate_task(task_t *p, runqueue_t *rq) +static inline unsigned int first_time_slice(task_t *p) { - enqueue_task(p, rq->active); - nr_running_inc(rq); + unsigned int time_slice = RR_INTERVAL; + if (likely(!rt_task(p))) + time_slice *= ((deadline(p) - p->deadline) + 1); + return time_slice; } -static void recalc_task_prio(task_t *p, unsigned long long now) +static inline int apparent_prio(task_t *p) { - unsigned long long __sleep_time = now - p->timestamp; - unsigned long sleep_time; - - if (__sleep_time > NS_MAX_SLEEP_AVG) - sleep_time = NS_MAX_SLEEP_AVG; - else - sleep_time = (unsigned long)__sleep_time; - - if (likely(sleep_time > 0)) { - /* - * User tasks that sleep a long time are categorised as - * idle and will get just interactive status to stay active & - * prevent them suddenly becoming cpu hogs and starving - * other processes. - */ - if (p->mm && p->activated != -1 && - sleep_time > INTERACTIVE_SLEEP(p)) { - p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - - AVG_TIMESLICE); - if (!HIGH_CREDIT(p)) - p->interactive_credit++; - } else { - /* - * The lower the sleep avg a task has the more - * rapidly it will rise with sleep time. - */ - sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; - - /* - * Tasks with low interactive_credit are limited to - * one timeslice worth of sleep avg bonus. - */ - if (LOW_CREDIT(p) && - sleep_time > JIFFIES_TO_NS(task_timeslice(p))) - sleep_time = JIFFIES_TO_NS(task_timeslice(p)); - - /* - * Non high_credit tasks waking from uninterruptible - * sleep are limited in their sleep_avg rise as they - * are likely to be cpu hogs waiting on I/O - */ - if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm) { - if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) - sleep_time = 0; - else if (p->sleep_avg + sleep_time >= - INTERACTIVE_SLEEP(p)) { - p->sleep_avg = INTERACTIVE_SLEEP(p); - sleep_time = 0; - } - } - - /* - * This code gives a bonus to interactive tasks. - * - * The boost works by updating the 'average sleep time' - * value here, based on ->timestamp. The more time a - * task spends sleeping, the higher the average gets - - * and the higher the priority boost gets as well. - */ - p->sleep_avg += sleep_time; - - if (p->sleep_avg > NS_MAX_SLEEP_AVG) { - p->sleep_avg = NS_MAX_SLEEP_AVG; - if (!HIGH_CREDIT(p)) - p->interactive_credit++; - } - } - } - - p->prio = effective_prio(p); + return (MAX_PRIO - 1 - (p->slice / RR_INTERVAL)); } /* * activate_task - move a task to the runqueue and do priority recalculation - * - * Update all the scheduling statistics stuff. (sleep average - * calculation, priority modifiers, etc.) */ static inline void activate_task(task_t *p, runqueue_t *rq) { unsigned long long now = sched_clock(); - - recalc_task_prio(p, now); - - /* - * This checks to make sure it's not an uninterruptible task - * that is now waking up. - */ - if (!p->activated) { + unsigned long run_time, sleep_time; + unsigned int full_slice = slice(p); + if (likely(!rt_task(p))) { /* - * Tasks which were woken up by interrupts (ie. hw events) - * are most likely of interactive nature. So we give them - * the credit of extending their sleep time to the period - * of time they spend on the runqueue, waiting for execution - * on a CPU, first time around: + * This ensures that tasks running for less than one tick are + * treated the same as longer running tasks. */ - if (in_interrupt()) - p->activated = 2; - else { - /* - * Normal first-time wakeups get a credit too for - * on-runqueue time, but it will be weighted down: - */ - p->activated = 1; + sleep_time = now - p->timestamp; + if (sleep_time > p->runtime) + p->runtime = 0; + else + p->runtime -= sleep_time; + run_time = NS_TO_JIFFIES(p->runtime); + + if (unlikely(run_time >= full_slice)) { + p->slice = full_slice; + p->runtime = 0; + } else if (run_time) { + p->slice = full_slice - run_time; + p->prio = apparent_prio(p); + } else { + p->slice = full_slice; + if (p->deadline < deadline(p)) + p->deadline++; + p->prio = effective_prio(p); } - } + } else + p->slice = full_slice; + p->time_slice = first_time_slice(p); p->timestamp = now; - __activate_task(p, rq); } @@ -480,9 +326,13 @@ static inline void activate_task(task_t static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) { nr_running_dec(rq); - if (p->state == TASK_UNINTERRUPTIBLE) + if (p->state == TASK_UNINTERRUPTIBLE) { rq->nr_uninterruptible++; - dequeue_task(p, p->array); + if (p->deadline && p->deadline > (deadline(p) - 2) && p->mm) + // maximum deadline rank limited while waiting on i/o. + p->deadline--; + } + dequeue_task(p, rq); p->array = NULL; } @@ -760,14 +610,8 @@ repeat_lock_task: out_activate: #endif /* CONFIG_SMP */ - if (old_state == TASK_UNINTERRUPTIBLE) { + if (old_state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; - /* - * Tasks on involuntary sleep don't earn - * sleep_avg beyond just interactive state. - */ - p->activated = -1; - } if (sync && cpu == this_cpu) { __activate_task(p, rq); @@ -830,12 +674,14 @@ void fastcall sched_fork(task_t *p) */ local_irq_disable(); p->time_slice = (current->time_slice + 1) >> 1; + p->slice = (current->slice + 1) >> 1; /* * The remainder of the first timeslice might be recovered by * the parent if the child exits early enough. */ p->first_time_slice = 1; current->time_slice >>= 1; + current->slice >>= 1; p->timestamp = sched_clock(); if (!current->time_slice) { /* @@ -863,33 +709,20 @@ void fastcall wake_up_forked_process(tas unsigned long flags; runqueue_t *rq = task_rq_lock(current, &flags); + // Forked process gets a lower deadline rank to prevent fork bombs. + if (p->deadline) + p->deadline--; + + // Deadline rank on kernel threads is fixed at best. + if (unlikely(!p->mm)) + p->deadline = deadline(p); BUG_ON(p->state != TASK_RUNNING); - /* - * We decrease the sleep average of forking parents - * and children as well, to keep max-interactive tasks - * from forking tasks that are max-interactive. - */ - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - - p->interactive_credit = 0; - - p->prio = effective_prio(p); set_task_cpu(p, smp_processor_id()); - if (unlikely(!current->array)) - __activate_task(p, rq); - else { - p->prio = current->prio; - list_add_tail(&p->run_list, ¤t->run_list); - p->array = current->array; - p->array->nr_active++; - nr_running_inc(rq); - } + p->prio = effective_prio(p); + p->runtime = 0; + __activate_task(p, rq); task_rq_unlock(rq, &flags); } @@ -910,19 +743,11 @@ void fastcall sched_exit(task_t * p) local_irq_save(flags); if (p->first_time_slice) { p->parent->time_slice += p->time_slice; - if (unlikely(p->parent->time_slice > MAX_TIMESLICE)) - p->parent->time_slice = MAX_TIMESLICE; + if (unlikely(p->parent->time_slice > slice(p->parent))) + p->parent->time_slice = slice(p->parent); } local_irq_restore(flags); - /* - * If the child was a (relative-) CPU hog then decrease - * the sleep_avg of the parent as well. - */ rq = task_rq_lock(p->parent, &flags); - if (p->sleep_avg < p->parent->sleep_avg) - p->parent->sleep_avg = p->parent->sleep_avg / - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / - (EXIT_WEIGHT + 1); task_rq_unlock(rq, &flags); } @@ -1205,15 +1030,14 @@ static inline void double_lock_balance(r * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. */ -static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, - task_t *p, runqueue_t *this_rq, prio_array_t *this_array, - int this_cpu) +static inline void pull_task(runqueue_t *src_rq, task_t *p, + runqueue_t *this_rq, int this_cpu) { - dequeue_task(p, src_array); + dequeue_task(p, src_rq); nr_running_dec(src_rq); set_task_cpu(p, this_cpu); nr_running_inc(this_rq); - enqueue_task(p, this_array); + enqueue_task(p, this_rq); p->timestamp = sched_clock() - (src_rq->timestamp_last_tick - p->timestamp); /* @@ -1266,28 +1090,15 @@ static int move_tasks(runqueue_t *this_r { int idx; int pulled = 0; - prio_array_t *array, *dst_array; + prio_array_t* array, *dst_array; struct list_head *head, *curr; task_t *tmp; if (max_nr_move <= 0 || busiest->nr_running <= 1) goto out; + array = &busiest->array; + dst_array = &this_rq->array; - /* - * We first consider expired tasks. Those will likely not be - * executed in the near future, and they are most likely to - * be cache-cold, thus switching CPUs has the least effect - * on them. - */ - if (busiest->expired->nr_active) { - array = busiest->expired; - dst_array = this_rq->expired; - } else { - array = busiest->active; - dst_array = this_rq->active; - } - -new_array: /* Start searching at priority 0: */ idx = 0; skip_bitmap: @@ -1295,14 +1106,8 @@ skip_bitmap: idx = sched_find_first_bit(array->bitmap); else idx = find_next_bit(array->bitmap, MAX_PRIO, idx); - if (idx >= MAX_PRIO) { - if (array == busiest->expired) { - array = busiest->active; - dst_array = this_rq->active; - goto new_array; - } + if (idx >= MAX_PRIO) goto out; - } head = array->queue + idx; curr = head->prev; @@ -1317,7 +1122,7 @@ skip_queue: idx++; goto skip_bitmap; } - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); + pull_task(busiest, tmp, this_rq, this_cpu); pulled++; /* We only want to steal up to the prescribed number of tasks. */ @@ -1783,21 +1588,6 @@ DEFINE_PER_CPU(struct kernel_stat, kstat EXPORT_PER_CPU_SYMBOL(kstat); -/* - * We place interactive tasks back into the active array, if possible. - * - * To guarantee that this does not starve expired tasks we ignore the - * interactivity of a task if the first expired task had to wait more - * than a 'reasonable' amount of time. This deadline timeout is - * load-dependent, as the frequency of array switched decreases with - * increasing number of running tasks. We also ignore the interactivity - * if a better static_prio task has expired: - */ -#define EXPIRED_STARVING(rq) \ - ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ - (jiffies - (rq)->expired_timestamp >= \ - STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ - ((rq)->curr->static_prio > (rq)->best_expired_prio)) /* * This function gets called by the timer code, with HZ frequency. @@ -1844,76 +1634,45 @@ void scheduler_tick(int user_ticks, int cpustat->system += sys_ticks; /* Task might have expired already, but not scheduled off yet */ - if (p->array != rq->active) { + if (p->array != &rq->array) { set_tsk_need_resched(p); goto out; } spin_lock(&rq->lock); - /* - * The task was running during this tick - update the - * time slice counter. Note: we do not update a thread's - * priority until it either goes to sleep or uses up its - * timeslice. This makes it possible for interactive tasks - * to use up their timeslices at their highest priority levels. - */ - if (unlikely(rt_task(p))) { - /* - * RR tasks need a special form of timeslice management. - * FIFO tasks have no timeslices. - */ - if ((p->policy == SCHED_RR) && !--p->time_slice) { - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - set_tsk_need_resched(p); - - /* put it at the end of the queue: */ - dequeue_task(p, rq->active); - enqueue_task(p, rq->active); - } + + // SCHED_FIFO tasks never run out of timeslice. + if (unlikely(p->policy == SCHED_FIFO)) + goto out_unlock; + // Tasks lose a deadline rank each time they use up a full slice(). + if (!--p->slice) { + set_tsk_need_resched(p); + dequeue_task(p, rq); + if (p->deadline && p->mm) + p->deadline--; + p->slice = slice(p); + p->prio = effective_prio(p); + p->time_slice = first_time_slice(p); + enqueue_task(p, rq); + p->first_time_slice = 0; goto out_unlock; } + /* + * Tasks that run out of time_slice but still have slice left get + * requeued with a lower priority && RR_INTERVAL time_slice. + */ if (!--p->time_slice) { - dequeue_task(p, rq->active); set_tsk_need_resched(p); + dequeue_task(p, rq); p->prio = effective_prio(p); - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - - if (!rq->expired_timestamp) - rq->expired_timestamp = jiffies; - if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { - enqueue_task(p, rq->expired); - if (p->static_prio < rq->best_expired_prio) - rq->best_expired_prio = p->static_prio; - } else - enqueue_task(p, rq->active); - } else { - /* - * Prevent a too long timeslice allowing a task to monopolize - * the CPU. We do this by splitting up the timeslice into - * smaller pieces. - * - * Note: this does not mean the task's timeslices expire or - * get lost in any way, they just might be preempted by - * another task of equal priority. (one with higher - * priority would have preempted this task already.) We - * requeue this task to the end of the list on this priority - * level, which is in essence a round-robin of tasks with - * equal priority. - * - * This only applies to tasks in the interactive - * delta range with at least TIMESLICE_GRANULARITY to requeue. - */ - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - - p->time_slice) % TIMESLICE_GRANULARITY(p)) && - (p->time_slice >= TIMESLICE_GRANULARITY(p)) && - (p->array == rq->active)) { - - dequeue_task(p, rq->active); - set_tsk_need_resched(p); - p->prio = effective_prio(p); - enqueue_task(p, rq->active); - } + p->time_slice = RR_INTERVAL; + enqueue_task(p, rq); + goto out_unlock; + } + // All tasks within a priority level round robin at RR_INTERVAL. + if (!(p->slice % RR_INTERVAL)) { + set_tsk_need_resched(p); + dequeue_task(p, rq); + enqueue_task(p, rq); } out_unlock: spin_unlock(&rq->lock); @@ -1977,8 +1736,8 @@ static inline int dependent_sleeper(int * task from using an unfair proportion of the * physical cpu's resources. -ck */ - if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > - task_timeslice(p) || rt_task(smt_curr)) && + if (((smt_curr->slice * (100 - sd->per_cpu_gain) / 100) > + slice(p) || rt_task(smt_curr)) && p->mm && smt_curr->mm && !rt_task(p)) ret |= 1; @@ -1987,8 +1746,8 @@ static inline int dependent_sleeper(int * or wake it up if it has been put to sleep for priority * reasons. */ - if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > - task_timeslice(smt_curr) || rt_task(p)) && + if ((((p->slice * (100 - sd->per_cpu_gain) / 100) > + slice(smt_curr) || rt_task(p)) && smt_curr->mm && p->mm && !rt_task(smt_curr)) || (smt_curr == smt_rq->idle && smt_rq->nr_running)) resched_task(smt_curr); @@ -2016,7 +1775,7 @@ asmlinkage void schedule(void) long *switch_count; task_t *prev, *next; runqueue_t *rq; - prio_array_t *array; + prio_array_t* array; struct list_head *queue; unsigned long long now; unsigned long run_time; @@ -2041,18 +1800,7 @@ need_resched: release_kernel_lock(prev); now = sched_clock(); - if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG)) - run_time = now - prev->timestamp; - else - run_time = NS_MAX_SLEEP_AVG; - - /* - * Tasks with interactive credits get charged less run_time - * at high sleep_avg to delay them losing their interactive - * status - */ - if (HIGH_CREDIT(prev)) - run_time /= (CURRENT_BONUS(prev) ? : 1); + run_time = now - prev->timestamp; spin_lock_irq(&rq->lock); @@ -2077,58 +1825,34 @@ need_resched: #endif if (!rq->nr_running) { next = rq->idle; - rq->expired_timestamp = 0; wake_sleeping_dependent(cpu, rq); goto switch_tasks; } } - array = rq->active; - if (unlikely(!array->nr_active)) { - /* - * Switch the active and expired arrays. - */ - rq->active = rq->expired; - rq->expired = array; - array = rq->active; - rq->expired_timestamp = 0; - rq->best_expired_prio = MAX_PRIO; - } - + array = &rq->array; idx = sched_find_first_bit(array->bitmap); queue = array->queue + idx; next = list_entry(queue->next, task_t, run_list); - if (dependent_sleeper(cpu, rq, next)) { + if (dependent_sleeper(cpu, rq, next)) next = rq->idle; - goto switch_tasks; - } - - if (!rt_task(next) && next->activated > 0) { - unsigned long long delta = now - next->timestamp; - - if (next->activated == 1) - delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; - - array = next->array; - dequeue_task(next, array); - recalc_task_prio(next, next->timestamp + delta); - enqueue_task(next, array); - } - next->activated = 0; switch_tasks: + if (!NS_TO_JIFFIES(run_time)) + // Keeps track of tasks that run less than one tick + prev->runtime += run_time; + else { + if (run_time > prev->runtime) + prev->runtime = 0; + else + prev->runtime -= run_time; + } + prev->timestamp = now; + prefetch(next); clear_tsk_need_resched(prev); RCU_qsctr(task_cpu(prev))++; - prev->sleep_avg -= run_time; - if ((long)prev->sleep_avg <= 0) { - prev->sleep_avg = 0; - if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev))) - prev->interactive_credit--; - } - prev->timestamp = now; - if (likely(prev != next)) { next->timestamp = now; rq->nr_switches++; @@ -2394,7 +2118,7 @@ void scheduling_functions_end_here(void) void set_user_nice(task_t *p, long nice) { unsigned long flags; - prio_array_t *array; + prio_array_t* array; runqueue_t *rq; int old_prio, new_prio, delta; @@ -2417,7 +2141,7 @@ void set_user_nice(task_t *p, long nice) } array = p->array; if (array) - dequeue_task(p, array); + dequeue_task(p, rq); old_prio = p->prio; new_prio = NICE_TO_PRIO(nice); @@ -2426,7 +2150,7 @@ void set_user_nice(task_t *p, long nice) p->prio += delta; if (array) { - enqueue_task(p, array); + enqueue_task(p, rq); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -2555,7 +2279,7 @@ static int setscheduler(pid_t pid, int p struct sched_param lp; int retval = -EINVAL; int oldprio; - prio_array_t *array; + prio_array_t* array; unsigned long flags; runqueue_t *rq; task_t *p; @@ -2822,29 +2546,26 @@ out_unlock: /** * sys_sched_yield - yield the current processor to other threads. * - * this function yields the current CPU by moving the calling thread - * to the expired array. If there are no other threads running on this + * this function yields the current CPU by transiently requeuing + * at lower priority. If there are no other threads running on this * CPU then this function will return. */ asmlinkage long sys_sched_yield(void) { runqueue_t *rq = this_rq_lock(); - prio_array_t *array = current->array; - prio_array_t *target = rq->expired; - - /* - * We implement yielding by moving the task into the expired - * queue. - * - * (special rule: RT tasks will just roundrobin in the active - * array.) - */ - if (unlikely(rt_task(current))) - target = rq->active; - - dequeue_task(current, array); - enqueue_task(current, target); - + + dequeue_task(current, rq); + if (likely(!rt_task(current))) { + current->prio++; + if (current->prio >= MAX_PRIO) + current->prio = MAX_PRIO - 1; + if (current->deadline < deadline(current)) + current->deadline++; + } + current->slice = slice(current); + current->time_slice = current->slice; + enqueue_task(current, rq); + /* * Since we are going to call schedule() anyway, there's * no need to preempt: @@ -2982,7 +2703,7 @@ long sys_sched_rr_get_interval(pid_t pid goto out_unlock; jiffies_to_timespec(p->policy & SCHED_FIFO ? - 0 : task_timeslice(p), &t); + 0 : slice(p), &t); read_unlock(&tasklist_lock); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; out_nounlock: @@ -3103,6 +2824,7 @@ void __init init_idle(task_t *idle, int idle->array = NULL; idle->prio = MAX_PRIO; idle->state = TASK_RUNNING; + idle->deadline = 0; set_task_cpu(idle, cpu); double_rq_unlock(idle_rq, rq); set_tsk_need_resched(idle); @@ -3591,10 +3313,10 @@ void __init sched_init_smp(void) void __init sched_init(void) { runqueue_t *rq; - int i, j, k; + int i, k; for (i = 0; i < NR_CPUS; i++) { - prio_array_t *array; + prio_array_t* array; #ifdef CONFIG_SMP struct sched_domain *domain; domain = cpu_sched_domain(i); @@ -3602,23 +3324,19 @@ void __init sched_init(void) #endif rq = cpu_rq(i); - rq->active = rq->arrays; - rq->expired = rq->arrays + 1; - rq->best_expired_prio = MAX_PRIO; spin_lock_init(&rq->lock); INIT_LIST_HEAD(&rq->migration_queue); atomic_set(&rq->nr_iowait, 0); - for (j = 0; j < 2; j++) { - array = rq->arrays + j; - for (k = 0; k < MAX_PRIO; k++) { - INIT_LIST_HEAD(array->queue + k); - __clear_bit(k, array->bitmap); - } - // delimiter for bitsearch - __set_bit(MAX_PRIO, array->bitmap); + array = &rq->array; + + for (k = 0; k <= MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); } /* * We have to do a little magic to get the first