--- include/linux/sched.h | 2 kernel/sched.c | 129 +++++++++++++++++++++++++------------------------- 2 files changed, 67 insertions(+), 64 deletions(-) Index: linux-2.6.16-ck2/kernel/sched.c =================================================================== --- linux-2.6.16-ck2.orig/kernel/sched.c 2006-03-30 20:39:34.000000000 +1000 +++ linux-2.6.16-ck2/kernel/sched.c 2006-03-31 22:50:07.000000000 +1000 @@ -16,9 +16,9 @@ * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin - * 2006-03-16 New staircase scheduling policy by Con Kolivas with help + * 2006-03-31 New staircase scheduling policy by Con Kolivas with help * from William Lee Irwin III, Zwane Mwaikambo & Peter Williams. - * Staircase v14.2 + * Staircase v14.2_test3 */ #include @@ -64,7 +64,7 @@ #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) - +#define MIN_USER_PRIO (MAX_PRIO - 2) /* * 'User priority' is the nice value converted to something we * can work with better when scaling various scheduler parameters, @@ -77,9 +77,9 @@ /* * Some helpers for converting nanosecond timing to jiffy resolution */ -#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) #define NSJIFFY (1000000000 / HZ) /* One jiffy in ns */ +#define NS_TO_JIFFIES(TIME) ((TIME) / NSJIFFY) +#define JIFFIES_TO_NS(TIME) ((TIME) * NSJIFFY) #define TASK_PREEMPTS_CURR(p, rq) ((p)->prio < (rq)->curr->prio) int sched_compute __read_mostly = 0; @@ -508,7 +508,7 @@ static unsigned long ns_diff(const unsig const unsigned long long v2) { unsigned long long vdiff; - if (likely(v1 > v2)) { + if (likely(v1 >= v2)) { vdiff = v1 - v2; #if BITS_PER_LONG < 64 if (vdiff > (1 << 31)) @@ -550,9 +550,16 @@ static void fastcall enqueue_task(task_t * Put task to the end of the run list without the overhead of dequeue * followed by enqueue. */ -static inline void requeue_task(task_t *p, runqueue_t *rq) +static void requeue_task(task_t *p, runqueue_t *rq, const int prio) { - list_move_tail(&p->run_list, rq->queue + p->prio); + list_move_tail(&p->run_list, rq->queue + prio); + if (p->prio != prio) { + if (list_empty(rq->queue + p->prio)) + __clear_bit(p->prio, rq->bitmap); + p->prio = prio; + __set_bit(prio, rq->bitmap); + } + p->ns_debit = 0; } static inline void enqueue_task_head(task_t *p, runqueue_t *rq) @@ -681,20 +688,18 @@ static unsigned int fastcall slice(const static void fastcall inc_bonus(task_t *p, const unsigned long totalrun, const unsigned long sleep) { - unsigned int best_bonus; + unsigned int best_bonus = sleep / (totalrun + 1); - best_bonus = sleep / (totalrun + 1); if (p->bonus >= best_bonus) return; - - p->bonus++; best_bonus = bonus(p); - if (p->bonus > best_bonus) - p->bonus = best_bonus; + if (p->bonus < best_bonus) + p->bonus++; } static void dec_bonus(task_t *p) { + p->totalrun = 0; if (p->bonus) p->bonus--; } @@ -740,7 +745,7 @@ static int effective_prio(task_t *p) */ p->time_slice = p->slice % RR_INTERVAL() ? : RR_INTERVAL(); - return MAX_PRIO - 2; + return MIN_USER_PRIO; } return MAX_PRIO - 1; } @@ -756,8 +761,8 @@ static int effective_prio(task_t *p) rr = rr_interval(p); prio += used_slice / rr; - if (prio > MAX_PRIO - 2) - prio = MAX_PRIO - 2; + if (prio > MIN_USER_PRIO) + prio = MIN_USER_PRIO; return prio; } @@ -765,13 +770,14 @@ static inline void continue_slice(task_t { unsigned long total_run = NS_TO_JIFFIES(p->totalrun); - if (total_run >= p->slice) { - p->totalrun -= JIFFIES_TO_NS(p->slice); + if (total_run >= p->slice || p->prio == MIN_USER_PRIO) dec_bonus(p); - } else { - unsigned int remainder; + else { + unsigned long remainder; p->slice -= total_run; + if (p->slice <= p->time_slice) + dec_bonus(p); remainder = p->slice % rr_interval(p); if (remainder) p->time_slice = remainder; @@ -785,34 +791,35 @@ static inline void continue_slice(task_t */ static inline void recalc_task_prio(task_t *p, const unsigned long long now) { + /* Double the systime to account for missed sub-jiffy time */ + unsigned long ns_systime = JIFFIES_TO_NS(p->systime) * 2; unsigned long sleep_time = ns_diff(now, p->timestamp); /* - * Add the total for this last scheduled run (p->runtime) to the - * running total so far used (p->totalrun). - */ - p->totalrun += p->runtime; + * Add the total for this last scheduled run (p->runtime) and system + * time (p->systime) done on behalf of p to the running total so far + * used (p->totalrun). + */ + p->totalrun += p->runtime + ns_systime; + + /* systime is unintentionally seen as sleep, subtract it */ + if (likely(ns_systime < sleep_time)) + sleep_time -= ns_systime; + else + sleep_time = 0; /* * If we sleep longer than our running total and have not set the * PF_NONSLEEP flag we gain a bonus. */ - if (sleep_time >= p->totalrun && !(p->flags & PF_NONSLEEP) && - !sched_compute) { - inc_bonus(p, p->totalrun, sleep_time); - p->totalrun = 0; - return; + if (sleep_time >= p->totalrun && !(p->flags & PF_NONSLEEP)) { + inc_bonus(p, p->totalrun, sleep_time); + p->totalrun = 0; + return; } - /* - * If we have not set the PF_NONSLEEP flag we elevate priority by the - * amount of time we slept. - */ - if (p->flags & PF_NONSLEEP) - p->flags &= ~PF_NONSLEEP; - else - p->totalrun -= sleep_time; - + /* We elevate priority by the amount of time we slept. */ + p->totalrun -= sleep_time; continue_slice(p); } @@ -840,6 +847,7 @@ static void activate_task(task_t *p, run if (!rt_task(p)) { recalc_task_prio(p, now); p->flags &= ~PF_NONSLEEP; + p->systime = 0; p->prio = effective_prio(p); } p->timestamp = now; @@ -1221,11 +1229,15 @@ static inline int wake_idle(const int cp */ static void fastcall preempt(const task_t *p, runqueue_t *rq) { - if (p->prio >= rq->curr->prio) + task_t *curr = rq->curr; + + if (p->prio >= curr->prio) return; - if (!sched_compute || rq->cache_ticks >= CACHE_DELAY || - !p->mm || rt_task(p)) - resched_task(rq->curr); + if (!sched_compute || rq->cache_ticks >= CACHE_DELAY || !p->mm || + rt_task(p) || curr == rq->idle) { + resched_task(curr); + return; + } rq->preempted = 1; } @@ -1449,21 +1461,20 @@ void fastcall wake_up_new_task(task_t *p this_cpu = smp_processor_id(); cpu = task_cpu(p); - /* - * Forked process gets no bonus to prevent fork bombs. - */ + /* Forked process gets no bonus to prevent fork bombs. */ p->bonus = 0; + current->flags |= PF_NONSLEEP; if (likely(cpu == this_cpu)) { - current->flags |= PF_NONSLEEP; activate_task(p, rq, 1); - if (!(clone_flags & CLONE_VM)) + if (!(clone_flags & CLONE_VM)) { /* * The VM isn't cloned, so we're in a good position to * do child-runs-first in anticipation of an exec. This * usually avoids a lot of COW overhead. */ set_need_resched(); + } /* * We skip the following code due to cpu == this_cpu * @@ -1489,7 +1500,6 @@ void fastcall wake_up_new_task(task_t *p */ task_rq_unlock(rq, &flags); this_rq = task_rq_lock(current, &flags); - current->flags |= PF_NONSLEEP; } task_rq_unlock(this_rq, &flags); } @@ -2522,6 +2532,7 @@ void account_system_time(struct task_str else cpustat->idle = cputime64_add(cpustat->idle, tmp); + p->systime++; /* Account for system time used */ acct_update_integrals(p); } @@ -2550,10 +2561,8 @@ void account_steal_time(struct task_stru static void time_slice_expired(task_t *p, runqueue_t *rq) { set_tsk_need_resched(p); - dequeue_task(p, rq); - p->prio = effective_prio(p); p->time_slice = rr_interval(p); - enqueue_task(p, rq); + requeue_task(p, rq, effective_prio(p)); } /* @@ -2639,7 +2648,6 @@ void scheduler_tick(void) dec_bonus(p); p->slice = slice(p); time_slice_expired(p, rq); - p->totalrun = 0; goto out_unlock; } /* @@ -2758,7 +2766,7 @@ static int dependent_sleeper(const int t goto out_unlock; p = list_entry(this_rq->queue[sched_find_first_bit(this_rq->bitmap)].next, - task_t, run_list); + task_t, run_list); for_each_cpu_mask(i, sibling_map) { runqueue_t *smt_rq = cpu_rq(i); @@ -3471,8 +3479,8 @@ void set_user_nice(task_t *p, const long * lowered its priority, then reschedule its CPU: */ if (delta < 0 || ((delta > 0 || idleprio_task(p)) && - task_running(rq, p))) - resched_task(rq->curr); + task_running(rq, p))) + resched_task(rq->curr); } out_unlock: task_rq_unlock(rq, &flags); @@ -3973,14 +3981,9 @@ asmlinkage long sys_sched_yield(void) current->slice = slice(current); current->time_slice = rr_interval(current); if (likely(!rt_task(current) && !idleprio_task(current))) - newprio = MAX_PRIO - 2; + newprio = MIN_USER_PRIO; - if (newprio != current->prio) { - dequeue_task(current, rq); - current->prio = newprio; - enqueue_task(current, rq); - } else - requeue_task(current, rq); + requeue_task(current, rq, newprio); /* * Since we are going to call schedule() anyway, there's Index: linux-2.6.16-ck2/include/linux/sched.h =================================================================== --- linux-2.6.16-ck2.orig/include/linux/sched.h 2006-03-31 22:49:35.000000000 +1000 +++ linux-2.6.16-ck2/include/linux/sched.h 2006-03-31 22:49:52.000000000 +1000 @@ -739,7 +739,7 @@ struct task_struct { unsigned short ioprio; unsigned long long timestamp; - unsigned long runtime, totalrun, ns_debit; + unsigned long runtime, totalrun, ns_debit, systime; unsigned int bonus; unsigned int slice, time_slice; unsigned long long sched_time; /* sched_clock time spent running */