diff -Naurp linux-2.6.0-test2/arch/i386/kernel/smpboot.c linux-2.6.0-test2-O13/arch/i386/kernel/smpboot.c --- linux-2.6.0-test2/arch/i386/kernel/smpboot.c 2003-05-30 22:39:08.000000000 +1000 +++ linux-2.6.0-test2-O13/arch/i386/kernel/smpboot.c 2003-08-05 01:55:08.000000000 +1000 @@ -915,13 +915,13 @@ static void smp_tune_scheduling (void) cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth; } - cache_decay_ticks = (long)cacheflush_time/cpu_khz * HZ / 1000; + cache_decay_ticks = (long)cacheflush_time/cpu_khz + 1; printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n", (long)cacheflush_time/(cpu_khz/1000), ((long)cacheflush_time*100/(cpu_khz/1000)) % 100); printk("task migration cache decay timeout: %ld msecs.\n", - (cache_decay_ticks + 1) * 1000 / HZ); + cache_decay_ticks); } /* diff -Naurp linux-2.6.0-test2/arch/i386/kernel/timers/timer_tsc.c linux-2.6.0-test2-O13/arch/i386/kernel/timers/timer_tsc.c --- linux-2.6.0-test2/arch/i386/kernel/timers/timer_tsc.c 2003-07-11 23:36:40.000000000 +1000 +++ linux-2.6.0-test2-O13/arch/i386/kernel/timers/timer_tsc.c 2003-08-05 01:55:08.000000000 +1000 @@ -116,6 +116,24 @@ static unsigned long long monotonic_cloc return base + cycles_2_ns(this_offset - last_offset); } +/* + * Scheduler clock - returns current time in nanosec units. + */ +unsigned long long sched_clock(void) +{ + unsigned long long this_offset; + + if (unlikely(!cpu_has_tsc)) + return (unsigned long long)jiffies * (1000000000 / HZ); + + /* Read the Time Stamp Counter */ + rdtscll(this_offset); + + /* return the value in ns */ + return cycles_2_ns(this_offset); +} + + static void mark_offset_tsc(void) { unsigned long lost,delay; diff -Naurp linux-2.6.0-test2/arch/ia64/kernel/time.c linux-2.6.0-test2-O13/arch/ia64/kernel/time.c --- linux-2.6.0-test2/arch/ia64/kernel/time.c 2003-07-28 20:46:36.000000000 +1000 +++ linux-2.6.0-test2-O13/arch/ia64/kernel/time.c 2003-08-05 01:55:08.000000000 +1000 @@ -61,6 +61,14 @@ do_profile (unsigned long ip) atomic_inc((atomic_t *) &prof_buffer[ip]); } +unsigned long long +sched_clock (void) +{ + unsigned long offset = ia64_get_itc(); + + return (offset * local_cpu_data->nsec_per_cyc) >> IA64_NSEC_PER_CYC_SHIFT; +} + static void itc_reset (void) { diff -Naurp linux-2.6.0-test2/fs/proc/array.c linux-2.6.0-test2-O13/fs/proc/array.c --- linux-2.6.0-test2/fs/proc/array.c 2003-05-05 09:53:32.000000000 +1000 +++ linux-2.6.0-test2-O13/fs/proc/array.c 2003-08-05 01:55:08.000000000 +1000 @@ -154,13 +154,16 @@ static inline char * task_state(struct t read_lock(&tasklist_lock); buffer += sprintf(buffer, "State:\t%s\n" + "SleepAVG:\t%lu%%\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" "TracerPid:\t%d\n" "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", - get_task_state(p), p->tgid, + get_task_state(p), + (p->sleep_avg/1024)*100/(1000000000/1024), + p->tgid, p->pid, p->pid ? p->real_parent->pid : 0, p->pid && p->ptrace ? p->parent->pid : 0, p->uid, p->euid, p->suid, p->fsuid, diff -Naurp linux-2.6.0-test2/include/linux/sched.h linux-2.6.0-test2-O13/include/linux/sched.h --- linux-2.6.0-test2/include/linux/sched.h 2003-08-01 12:39:03.000000000 +1000 +++ linux-2.6.0-test2-O13/include/linux/sched.h 2003-08-05 01:55:10.000000000 +1000 @@ -339,7 +339,9 @@ struct task_struct { prio_array_t *array; unsigned long sleep_avg; - unsigned long last_run; + long interactive_credit; + unsigned long long timestamp; + int activated; unsigned long policy; unsigned long cpus_allowed; @@ -498,6 +500,8 @@ static inline int set_cpus_allowed(task_ } #endif +extern unsigned long long sched_clock(void); + #ifdef CONFIG_NUMA extern void sched_balance_exec(void); extern void node_nr_running_init(void); diff -Naurp linux-2.6.0-test2/kernel/fork.c linux-2.6.0-test2-O13/kernel/fork.c --- linux-2.6.0-test2/kernel/fork.c 2003-07-28 20:46:40.000000000 +1000 +++ linux-2.6.0-test2-O13/kernel/fork.c 2003-08-05 01:55:08.000000000 +1000 @@ -896,7 +896,7 @@ struct task_struct *copy_process(unsigne */ p->first_time_slice = 1; current->time_slice >>= 1; - p->last_run = jiffies; + p->timestamp = sched_clock(); if (!current->time_slice) { /* * This case is rare, it happens when the parent has only diff -Naurp linux-2.6.0-test2/kernel/sched.c linux-2.6.0-test2-O13/kernel/sched.c --- linux-2.6.0-test2/kernel/sched.c 2003-08-01 12:39:03.000000000 +1000 +++ linux-2.6.0-test2-O13/kernel/sched.c 2003-08-05 01:55:22.000000000 +1000 @@ -58,6 +58,14 @@ #define USER_PRIO(p) ((p)-MAX_RT_PRIO) #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) +#define AVG_TIMESLICE (MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) *\ + (MAX_PRIO-1-NICE_TO_PRIO(0))/(MAX_USER_PRIO - 1))) + +/* + * Some helpers for converting nanosecond timing to jiffy resolution + */ +#define NS_TO_JIFFIES(TIME) (TIME / (1000000000 / HZ)) +#define JIFFIES_TO_NS(TIME) (TIME * (1000000000 / HZ)) /* * These are the 'tuning knobs' of the scheduler: @@ -68,13 +76,17 @@ */ #define MIN_TIMESLICE ( 10 * HZ / 1000) #define MAX_TIMESLICE (200 * HZ / 1000) -#define CHILD_PENALTY 50 +#define TIMESLICE_GRANULARITY (HZ/40 ?: 1) +#define ON_RUNQUEUE_WEIGHT 30 +#define CHILD_PENALTY 95 #define PARENT_PENALTY 100 #define EXIT_WEIGHT 3 #define PRIO_BONUS_RATIO 25 +#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) #define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (10*HZ) -#define STARVATION_LIMIT (10*HZ) +#define MAX_SLEEP_AVG (AVG_TIMESLICE * MAX_BONUS) +#define STARVATION_LIMIT (MAX_SLEEP_AVG) +#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) #define NODE_THRESHOLD 125 /* @@ -115,6 +127,14 @@ #define TASK_INTERACTIVE(p) \ ((p)->prio <= (p)->static_prio - DELTA(p)) +#define JUST_INTERACTIVE_SLEEP(p) \ + (MAX_SLEEP_AVG - (DELTA(p) * AVG_TIMESLICE)) + +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio || \ + ((p)->prio == (rq)->curr->prio && \ + (p)->time_slice > (rq)->curr->time_slice * 2)) + /* * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ] * to time slice values. @@ -319,8 +339,9 @@ static int effective_prio(task_t *p) if (rt_task(p)) return p->prio; - bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 - - MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2; + bonus = MAX_USER_PRIO * PRIO_BONUS_RATIO * + NS_TO_JIFFIES(p->sleep_avg) / MAX_SLEEP_AVG / 100; + bonus -= MAX_USER_PRIO * PRIO_BONUS_RATIO / 100 / 2; prio = p->static_prio - bonus; if (prio < MAX_RT_PRIO) @@ -339,6 +360,73 @@ static inline void __activate_task(task_ nr_running_inc(rq); } +static void recalc_task_prio(task_t *p, unsigned long long now) +{ + unsigned long long __sleep_time = now - p->timestamp; + unsigned long sleep_time; + + if (!p->sleep_avg) + p->interactive_credit--; + + if (__sleep_time > NS_MAX_SLEEP_AVG) + sleep_time = NS_MAX_SLEEP_AVG; + else + sleep_time = (unsigned long)__sleep_time; + + if (likely(sleep_time > 0)) { + + /* + * User tasks that sleep a long time are categorised as + * idle and will get just interactive status to stay active & + * prevent them suddenly becoming cpu hogs and starving + * other processes. + */ + if (p->mm && sleep_time > + JIFFIES_TO_NS(JUST_INTERACTIVE_SLEEP(p))) + p->sleep_avg = + JIFFIES_TO_NS(JUST_INTERACTIVE_SLEEP(p)); + else { + /* + * The lower the sleep avg a task has the more + * rapidly it will rise with sleep time. Tasks + * without interactive_credit are limited to + * one timeslice worth of sleep avg bonus. + */ + sleep_time *= (MAX_BONUS + 1 - + (NS_TO_JIFFIES(p->sleep_avg) * + MAX_BONUS / MAX_SLEEP_AVG)); + + if (p->interactive_credit < 0 && + sleep_time > JIFFIES_TO_NS(task_timeslice(p))) + sleep_time = + JIFFIES_TO_NS(task_timeslice(p)); + + /* + * This code gives a bonus to interactive tasks. + * + * The boost works by updating the 'average sleep time' + * value here, based on ->timestamp. The more time a task + * spends sleeping, the higher the average gets - and the + * higher the priority boost gets as well. + */ + p->sleep_avg += sleep_time; + + /* + * 'Overflow' bonus ticks go to the waker as well, so the + * ticks are not lost. This has the effect of further + * boosting tasks that are related to maximum-interactive + * tasks. + */ + if (p->sleep_avg > NS_MAX_SLEEP_AVG){ + p->sleep_avg = NS_MAX_SLEEP_AVG; + p->interactive_credit++; + } + } + } + + p->prio = effective_prio(p); +} + /* * activate_task - move a task to the runqueue and do priority recalculation * @@ -347,34 +435,34 @@ static inline void __activate_task(task_ */ static inline void activate_task(task_t *p, runqueue_t *rq) { - long sleep_time = jiffies - p->last_run - 1; + unsigned long long now = sched_clock(); - if (sleep_time > 0) { - int sleep_avg; + recalc_task_prio(p, now); + /* + * This checks to make sure it's not an uninterruptible task + * that is now waking up. + */ + if (!p->activated){ /* - * This code gives a bonus to interactive tasks. - * - * The boost works by updating the 'average sleep time' - * value here, based on ->last_run. The more time a task - * spends sleeping, the higher the average gets - and the - * higher the priority boost gets as well. + * Tasks which were woken up by interrupts (ie. hw events) + * are most likely of interactive nature. So we give them + * the credit of extending their sleep time to the period + * of time they spend on the runqueue, waiting for execution + * on a CPU, first time around: */ - sleep_avg = p->sleep_avg + sleep_time; - + if (in_interrupt()) + p->activated = 2; + else /* - * 'Overflow' bonus ticks go to the waker as well, so the - * ticks are not lost. This has the effect of further - * boosting tasks that are related to maximum-interactive - * tasks. + * Normal first-time wakeups get a credit too for on-runqueue + * time, but it will be weighted down: */ - if (sleep_avg > MAX_SLEEP_AVG) - sleep_avg = MAX_SLEEP_AVG; - if (p->sleep_avg != sleep_avg) { - p->sleep_avg = sleep_avg; - p->prio = effective_prio(p); - } + p->activated = 1; } + + p->timestamp = now; + __activate_task(p, rq); } @@ -495,13 +583,20 @@ repeat_lock_task: task_rq_unlock(rq, &flags); goto repeat_lock_task; } - if (old_state == TASK_UNINTERRUPTIBLE) + if (old_state == TASK_UNINTERRUPTIBLE){ + /* + * Tasks on involuntary sleep don't earn + * sleep_avg + */ rq->nr_uninterruptible--; + p->timestamp = sched_clock(); + p->activated = -1; + } if (sync) __activate_task(p, rq); else { activate_task(p, rq); - if (p->prio < rq->curr->prio) + if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); } success = 1; @@ -541,7 +636,7 @@ int wake_up_state(task_t *p, unsigned in */ void wake_up_forked_process(task_t * p) { - unsigned long flags; + unsigned long flags, sleep_avg; runqueue_t *rq = task_rq_lock(current, &flags); p->state = TASK_RUNNING; @@ -550,8 +645,17 @@ void wake_up_forked_process(task_t * p) * and children as well, to keep max-interactive tasks * from forking tasks that are max-interactive. */ - current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100; - p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100; + sleep_avg = NS_TO_JIFFIES(current->sleep_avg) * MAX_BONUS / + MAX_SLEEP_AVG * PARENT_PENALTY / 100 * + MAX_SLEEP_AVG / MAX_BONUS; + current->sleep_avg = JIFFIES_TO_NS(sleep_avg); + + sleep_avg = NS_TO_JIFFIES(p->sleep_avg) * MAX_BONUS / MAX_SLEEP_AVG * + CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS; + p->sleep_avg = JIFFIES_TO_NS(sleep_avg); + + p->interactive_credit = 0; + p->prio = effective_prio(p); set_task_cpu(p, smp_processor_id()); @@ -592,8 +696,9 @@ void sched_exit(task_t * p) * the sleep_avg of the parent as well. */ if (p->sleep_avg < p->parent->sleep_avg) - p->parent->sleep_avg = (p->parent->sleep_avg * EXIT_WEIGHT + - p->sleep_avg) / (EXIT_WEIGHT + 1); + p->parent->sleep_avg = p->parent->sleep_avg / + (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / + (EXIT_WEIGHT + 1); } /** @@ -978,13 +1083,8 @@ static inline void pull_task(runqueue_t * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. */ - if (p->prio < this_rq->curr->prio) + if (TASK_PREEMPTS_CURR(p, this_rq)) set_need_resched(); - else { - if (p->prio == this_rq->curr->prio && - p->time_slice > this_rq->curr->time_slice) - set_need_resched(); - } } /* @@ -1001,12 +1101,14 @@ static void load_balance(runqueue_t *thi runqueue_t *busiest; prio_array_t *array; struct list_head *head, *curr; + unsigned long long now; task_t *tmp; busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask); if (!busiest) goto out; + now = sched_clock(); /* * We first consider expired tasks. Those will likely not be * executed in the near future, and they are most likely to @@ -1047,7 +1149,7 @@ skip_queue: */ #define CAN_MIGRATE_TASK(p,rq,this_cpu) \ - ((!idle || (jiffies - (p)->last_run > cache_decay_ticks)) && \ + ((!idle || (((now - (p)->timestamp)>>10) > cache_decay_ticks)) &&\ !task_running(rq, p) && \ ((p)->cpus_allowed & (1UL << (this_cpu)))) @@ -1209,14 +1311,11 @@ void scheduler_tick(int user_ticks, int spin_lock(&rq->lock); /* * The task was running during this tick - update the - * time slice counter and the sleep average. Note: we - * do not update a thread's priority until it either - * goes to sleep or uses up its timeslice. This makes - * it possible for interactive tasks to use up their - * timeslices at their highest priority levels. + * time slice counter. Note: we do not update a thread's + * priority until it either goes to sleep or uses up its + * timeslice. This makes it possible for interactive tasks + * to use up their timeslices at their highest priority levels. */ - if (p->sleep_avg) - p->sleep_avg--; if (unlikely(rt_task(p))) { /* * RR tasks need a special form of timeslice management. @@ -1240,12 +1339,34 @@ void scheduler_tick(int user_ticks, int p->time_slice = task_timeslice(p); p->first_time_slice = 0; + if (!rq->expired_timestamp) + rq->expired_timestamp = jiffies; if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { - if (!rq->expired_timestamp) - rq->expired_timestamp = jiffies; enqueue_task(p, rq->expired); } else enqueue_task(p, rq->active); + } else { + /* + * Prevent a too long timeslice allowing a task to monopolize + * the CPU. We do this by splitting up the timeslice into + * smaller pieces. + * + * Note: this does not mean the task's timeslices expire or + * get lost in any way, they just might be preempted by + * another task of equal priority. (one with higher + * priority would have preempted this task already.) We + * requeue this task to the end of the list on this priority + * level, which is in essence a round-robin of tasks with + * equal priority. + */ + if (p->mm && !((task_timeslice(p) - p->time_slice) % + TIMESLICE_GRANULARITY) && (p->time_slice > MIN_TIMESLICE) && + (p->array == rq->active)) { + + dequeue_task(p, rq->active); + set_tsk_need_resched(p); + enqueue_task(p, rq->active); + } } out_unlock: spin_unlock(&rq->lock); @@ -1264,6 +1385,8 @@ asmlinkage void schedule(void) runqueue_t *rq; prio_array_t *array; struct list_head *queue; + unsigned long long now; + unsigned long run_time; int idx; /* @@ -1284,7 +1407,22 @@ need_resched: rq = this_rq(); release_kernel_lock(prev); - prev->last_run = jiffies; + now = sched_clock(); + if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG)) + run_time = now - prev->timestamp; + else + run_time = NS_MAX_SLEEP_AVG; + + /* + * Tasks with interactive credits get charged less run_time + * as their sleep_avg decreases to slow them losing their + * priority bonus + */ + if (prev->interactive_credit > 0) + run_time /= (MAX_BONUS + 1 - + (NS_TO_JIFFIES(prev->sleep_avg) * MAX_BONUS / + MAX_SLEEP_AVG)); + spin_lock_irq(&rq->lock); /* @@ -1332,12 +1470,30 @@ pick_next_task: queue = array->queue + idx; next = list_entry(queue->next, task_t, run_list); + if (next->activated > 0) { + unsigned long long delta = now - next->timestamp; + + if (next->activated == 1) + delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; + + next->activated = 0; + array = next->array; + dequeue_task(next, array); + recalc_task_prio(next, next->timestamp + delta); + enqueue_task(next, array); + } switch_tasks: prefetch(next); clear_tsk_need_resched(prev); RCU_qsctr(task_cpu(prev))++; + prev->sleep_avg -= run_time; + if ((long)prev->sleep_avg < 0) + prev->sleep_avg = 0; + prev->timestamp = now; + if (likely(prev != next)) { + next->timestamp = now; rq->nr_switches++; rq->curr = next; @@ -1577,6 +1733,7 @@ void set_user_nice(task_t *p, long nice) unsigned long flags; prio_array_t *array; runqueue_t *rq; + int old_prio, new_prio, delta; if (TASK_NICE(p) == nice || nice < -20 || nice > 19) return; @@ -1585,6 +1742,12 @@ void set_user_nice(task_t *p, long nice) * the task might be in the middle of scheduling on another CPU. */ rq = task_rq_lock(p, &flags); + /* + * The RT priorities are set via setscheduler(), but we still + * allow the 'normal' nice value to be set - but as expected + * it wont have any effect on scheduling until the task is + * not SCHED_NORMAL: + */ if (rt_task(p)) { p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; @@ -1592,16 +1755,20 @@ void set_user_nice(task_t *p, long nice) array = p->array; if (array) dequeue_task(p, array); + + old_prio = p->prio; + new_prio = NICE_TO_PRIO(nice); + delta = new_prio - old_prio; p->static_prio = NICE_TO_PRIO(nice); - p->prio = NICE_TO_PRIO(nice); + p->prio += delta; + if (array) { enqueue_task(p, array); /* - * If the task is running and lowered its priority, - * or increased its priority then reschedule its CPU: + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: */ - if ((NICE_TO_PRIO(nice) < p->static_prio) || - task_running(rq, p)) + if (delta < 0 || (delta > 0 && task_running(rq, p))) resched_task(rq->curr); } out_unlock: @@ -2358,6 +2525,12 @@ static void move_task_away(struct task_s local_irq_restore(flags); } +typedef struct { + int cpu; + struct completion startup_done; + task_t *task; +} migration_startup_t; + /* * migration_thread - this is a highprio system thread that performs * thread migration by bumping thread off CPU then 'pushing' onto @@ -2367,20 +2540,21 @@ static int migration_thread(void * data) { /* Marking "param" __user is ok, since we do a set_fs(KERNEL_DS); */ struct sched_param __user param = { .sched_priority = MAX_RT_PRIO-1 }; - int cpu = (long) data; + migration_startup_t *startup = data; + int cpu = startup->cpu; runqueue_t *rq; int ret; + startup->task = current; + complete(&startup->startup_done); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + + BUG_ON(smp_processor_id() != cpu); + daemonize("migration/%d", cpu); set_fs(KERNEL_DS); - /* - * Either we are running on the right CPU, or there's a a - * migration thread on this CPU, guaranteed (we're started - * serially). - */ - set_cpus_allowed(current, 1UL << cpu); - ret = setscheduler(0, SCHED_FIFO, ¶m); rq = this_rq(); @@ -2416,13 +2590,30 @@ static int migration_call(struct notifie unsigned long action, void *hcpu) { + long cpu = (long) hcpu; + migration_startup_t startup; + switch (action) { case CPU_ONLINE: - printk("Starting migration thread for cpu %li\n", - (long)hcpu); - kernel_thread(migration_thread, hcpu, CLONE_KERNEL); - while (!cpu_rq((long)hcpu)->migration_thread) + + printk("Starting migration thread for cpu %li\n", cpu); + + startup.cpu = cpu; + startup.task = NULL; + init_completion(&startup.startup_done); + + kernel_thread(migration_thread, &startup, CLONE_KERNEL); + wait_for_completion(&startup.startup_done); + wait_task_inactive(startup.task); + + startup.task->thread_info->cpu = cpu; + startup.task->cpus_allowed = cpumask_of_cpu(cpu); + + wake_up_process(startup.task); + + while (!cpu_rq(cpu)->migration_thread) yield(); + break; } return NOTIFY_OK;