diff -Naurp linux-2.6.0-test2/arch/i386/kernel/smpboot.c linux-2.6.0-test2-A3/arch/i386/kernel/smpboot.c --- linux-2.6.0-test2/arch/i386/kernel/smpboot.c 2003-07-14 13:34:03.000000000 +1000 +++ linux-2.6.0-test2-A3/arch/i386/kernel/smpboot.c 2003-08-03 11:35:42.000000000 +1000 @@ -915,13 +915,13 @@ static void smp_tune_scheduling (void) cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth; } - cache_decay_ticks = (long)cacheflush_time/cpu_khz * HZ / 1000; + cache_decay_ticks = (long)cacheflush_time/cpu_khz + 1; printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n", (long)cacheflush_time/(cpu_khz/1000), ((long)cacheflush_time*100/(cpu_khz/1000)) % 100); printk("task migration cache decay timeout: %ld msecs.\n", - (cache_decay_ticks + 1) * 1000 / HZ); + cache_decay_ticks); } /* diff -Naurp linux-2.6.0-test2/arch/i386/kernel/timers/timer_tsc.c linux-2.6.0-test2-A3/arch/i386/kernel/timers/timer_tsc.c --- linux-2.6.0-test2/arch/i386/kernel/timers/timer_tsc.c 2003-07-14 13:35:57.000000000 +1000 +++ linux-2.6.0-test2-A3/arch/i386/kernel/timers/timer_tsc.c 2003-08-03 11:35:42.000000000 +1000 @@ -116,6 +116,24 @@ static unsigned long long monotonic_cloc return base + cycles_2_ns(this_offset - last_offset); } +/* + * Scheduler clock - returns current time in nanosec units. 
+ */ +unsigned long long sched_clock(void) +{ + unsigned long long this_offset; + + if (unlikely(!cpu_has_tsc)) + return (unsigned long long)jiffies * (1000000000 / HZ); + + /* Read the Time Stamp Counter */ + rdtscll(this_offset); + + /* return the value in ns */ + return cycles_2_ns(this_offset); +} + + static void mark_offset_tsc(void) { unsigned long lost,delay; diff -Naurp linux-2.6.0-test2/arch/ia64/kernel/time.c linux-2.6.0-test2-A3/arch/ia64/kernel/time.c --- linux-2.6.0-test2/arch/ia64/kernel/time.c 2003-07-28 10:39:00.000000000 +1000 +++ linux-2.6.0-test2-A3/arch/ia64/kernel/time.c 2003-08-03 11:35:42.000000000 +1000 @@ -61,6 +61,14 @@ do_profile (unsigned long ip) atomic_inc((atomic_t *) &prof_buffer[ip]); } +unsigned long long +sched_clock (void) +{ + unsigned long offset = ia64_get_itc(); + + return (offset * local_cpu_data->nsec_per_cyc) >> IA64_NSEC_PER_CYC_SHIFT; +} + static void itc_reset (void) { diff -Naurp linux-2.6.0-test2/fs/proc/array.c linux-2.6.0-test2-A3/fs/proc/array.c --- linux-2.6.0-test2/fs/proc/array.c 2003-07-14 13:35:12.000000000 +1000 +++ linux-2.6.0-test2-A3/fs/proc/array.c 2003-08-03 13:10:00.000000000 +1000 @@ -154,13 +154,16 @@ static inline char * task_state(struct t read_lock(&tasklist_lock); buffer += sprintf(buffer, "State:\t%s\n" + "SleepAVG:\t%lu%%\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" "TracerPid:\t%d\n" "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", - get_task_state(p), p->tgid, + get_task_state(p), + (p->sleep_avg/1024)*100/(1000000000/1024), + p->tgid, p->pid, p->pid ? p->real_parent->pid : 0, p->pid && p->ptrace ? 
p->parent->pid : 0, p->uid, p->euid, p->suid, p->fsuid, diff -Naurp linux-2.6.0-test2/include/linux/sched.h linux-2.6.0-test2-A3/include/linux/sched.h --- linux-2.6.0-test2/include/linux/sched.h 2003-07-28 10:39:10.000000000 +1000 +++ linux-2.6.0-test2-A3/include/linux/sched.h 2003-08-03 11:35:42.000000000 +1000 @@ -339,7 +339,8 @@ struct task_struct { prio_array_t *array; unsigned long sleep_avg; - unsigned long last_run; + unsigned long long timestamp; + int activated; unsigned long policy; unsigned long cpus_allowed; @@ -498,6 +499,8 @@ static inline int set_cpus_allowed(task_ } #endif +extern unsigned long long sched_clock(void); + #ifdef CONFIG_NUMA extern void sched_balance_exec(void); extern void node_nr_running_init(void); diff -Naurp linux-2.6.0-test2/kernel/fork.c linux-2.6.0-test2-A3/kernel/fork.c --- linux-2.6.0-test2/kernel/fork.c 2003-07-28 10:39:10.000000000 +1000 +++ linux-2.6.0-test2-A3/kernel/fork.c 2003-08-03 11:35:42.000000000 +1000 @@ -896,7 +896,7 @@ struct task_struct *copy_process(unsigne */ p->first_time_slice = 1; current->time_slice >>= 1; - p->last_run = jiffies; + p->timestamp = sched_clock(); if (!current->time_slice) { /* * This case is rare, it happens when the parent has only diff -Naurp linux-2.6.0-test2/kernel/sched.c linux-2.6.0-test2-A3/kernel/sched.c --- linux-2.6.0-test2/kernel/sched.c 2003-07-28 10:39:10.000000000 +1000 +++ linux-2.6.0-test2-A3/kernel/sched.c 2003-08-03 11:37:33.000000000 +1000 @@ -68,13 +68,15 @@ */ #define MIN_TIMESLICE ( 10 * HZ / 1000) #define MAX_TIMESLICE (200 * HZ / 1000) -#define CHILD_PENALTY 50 +#define TIMESLICE_GRANULARITY (HZ/40 ?: 1) +#define ON_RUNQUEUE_WEIGHT 30 +#define CHILD_PENALTY 95 #define PARENT_PENALTY 100 #define EXIT_WEIGHT 3 #define PRIO_BONUS_RATIO 25 #define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (10*HZ) -#define STARVATION_LIMIT (10*HZ) +#define MAX_SLEEP_AVG (1*1000000000) +#define STARVATION_LIMIT HZ #define NODE_THRESHOLD 125 /* @@ -115,6 +117,11 @@ #define 
TASK_INTERACTIVE(p) \ ((p)->prio <= (p)->static_prio - DELTA(p)) +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio || \ + ((p)->prio == (rq)->curr->prio && \ + (p)->time_slice > (rq)->curr->time_slice * 2)) + /* * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ] * to time slice values. @@ -319,8 +326,8 @@ static int effective_prio(task_t *p) if (rt_task(p)) return p->prio; - bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 - - MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2; + bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*(p->sleep_avg/1024)/(MAX_SLEEP_AVG/1024)/100; + bonus -= MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2; prio = p->static_prio - bonus; if (prio < MAX_RT_PRIO) @@ -339,24 +346,24 @@ static inline void __activate_task(task_ nr_running_inc(rq); } -/* - * activate_task - move a task to the runqueue and do priority recalculation - * - * Update all the scheduling statistics stuff. (sleep average - * calculation, priority modifiers, etc.) - */ -static inline void activate_task(task_t *p, runqueue_t *rq) +static void recalc_task_prio(task_t *p, unsigned long long now) { - long sleep_time = jiffies - p->last_run - 1; + unsigned long long __sleep_time = now - p->timestamp; + unsigned long sleep_time; + + if (__sleep_time > MAX_SLEEP_AVG) + sleep_time = MAX_SLEEP_AVG; + else + sleep_time = (unsigned long)__sleep_time; if (sleep_time > 0) { - int sleep_avg; + unsigned long long sleep_avg; /* * This code gives a bonus to interactive tasks. * * The boost works by updating the 'average sleep time' - * value here, based on ->last_run. The more time a task + * value here, based on ->timestamp. The more time a task * spends sleeping, the higher the average gets - and the * higher the priority boost gets as well. */ @@ -375,6 +382,37 @@ static inline void activate_task(task_t p->prio = effective_prio(p); } } +} + +/* + * activate_task - move a task to the runqueue and do priority recalculation + * + * Update all the scheduling statistics stuff. 
(sleep average + * calculation, priority modifiers, etc.) + */ +static inline void activate_task(task_t *p, runqueue_t *rq) +{ + unsigned long long now = sched_clock(); + + recalc_task_prio(p, now); + + /* + * Tasks which were woken up by interrupts (ie. hw events) + * are most likely of interactive nature. So we give them + * the credit of extending their sleep time to the period + * of time they spend on the runqueue, waiting for execution + * on a CPU, first time around: + */ + if (in_interrupt()) + p->activated = 2; + else + /* + * Normal first-time wakeups get a credit too for on-runqueue time, + * but it will be weighted down: + */ + p->activated = 1; + p->timestamp = now; + __activate_task(p, rq); } @@ -501,7 +539,7 @@ repeat_lock_task: __activate_task(p, rq); else { activate_task(p, rq); - if (p->prio < rq->curr->prio) + if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); } success = 1; @@ -550,8 +588,8 @@ void wake_up_forked_process(task_t * p) * and children as well, to keep max-interactive tasks * from forking tasks that are max-interactive. */ - current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100; - p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100; + current->sleep_avg = current->sleep_avg / 100 * PARENT_PENALTY; + p->sleep_avg = p->sleep_avg / 100 * CHILD_PENALTY; p->prio = effective_prio(p); set_task_cpu(p, smp_processor_id()); @@ -592,8 +630,7 @@ void sched_exit(task_t * p) * the sleep_avg of the parent as well. */ if (p->sleep_avg < p->parent->sleep_avg) - p->parent->sleep_avg = (p->parent->sleep_avg * EXIT_WEIGHT + - p->sleep_avg) / (EXIT_WEIGHT + 1); + p->parent->sleep_avg = p->parent->sleep_avg / (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / (EXIT_WEIGHT + 1); } /** @@ -978,13 +1015,8 @@ static inline void pull_task(runqueue_t * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. 
*/ - if (p->prio < this_rq->curr->prio) + if (TASK_PREEMPTS_CURR(p, this_rq)) set_need_resched(); - else { - if (p->prio == this_rq->curr->prio && - p->time_slice > this_rq->curr->time_slice) - set_need_resched(); - } } /* @@ -1001,12 +1033,14 @@ static void load_balance(runqueue_t *thi runqueue_t *busiest; prio_array_t *array; struct list_head *head, *curr; + unsigned long long now; task_t *tmp; busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask); if (!busiest) goto out; + now = sched_clock(); /* * We first consider expired tasks. Those will likely not be * executed in the near future, and they are most likely to @@ -1047,7 +1081,7 @@ skip_queue: */ #define CAN_MIGRATE_TASK(p,rq,this_cpu) \ - ((!idle || (jiffies - (p)->last_run > cache_decay_ticks)) && \ + ((!idle || (((now - (p)->timestamp)>>10) > cache_decay_ticks)) &&\ !task_running(rq, p) && \ ((p)->cpus_allowed & (1UL << (this_cpu)))) @@ -1164,8 +1198,7 @@ EXPORT_PER_CPU_SYMBOL(kstat); */ #define EXPIRED_STARVING(rq) \ (STARVATION_LIMIT && ((rq)->expired_timestamp && \ - (jiffies - (rq)->expired_timestamp >= \ - STARVATION_LIMIT * ((rq)->nr_running) + 1))) + (jiffies - (rq)->expired_timestamp >= STARVATION_LIMIT))) /* * This function gets called by the timer code, with HZ frequency. @@ -1209,14 +1242,11 @@ void scheduler_tick(int user_ticks, int spin_lock(&rq->lock); /* * The task was running during this tick - update the - * time slice counter and the sleep average. Note: we - * do not update a thread's priority until it either - * goes to sleep or uses up its timeslice. This makes - * it possible for interactive tasks to use up their - * timeslices at their highest priority levels. + * time slice counter. Note: we do not update a thread's + * priority until it either goes to sleep or uses up its + * timeslice. This makes it possible for interactive tasks + * to use up their timeslices at their highest priority levels. 
*/ - if (p->sleep_avg) - p->sleep_avg--; if (unlikely(rt_task(p))) { /* * RR tasks need a special form of timeslice management. @@ -1240,12 +1270,33 @@ void scheduler_tick(int user_ticks, int p->time_slice = task_timeslice(p); p->first_time_slice = 0; + if (!rq->expired_timestamp) + rq->expired_timestamp = jiffies; if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { - if (!rq->expired_timestamp) - rq->expired_timestamp = jiffies; enqueue_task(p, rq->expired); } else enqueue_task(p, rq->active); + } else { + /* + * Prevent a too long timeslice allowing a task to monopolize + * the CPU. We do this by splitting up the timeslice into + * smaller pieces. + * + * Note: this does not mean the task's timeslices expire or + * get lost in any way, they just might be preempted by + * another task of equal priority. (one with higher + * priority would have preempted this task already.) We + * requeue this task to the end of the list on this priority + * level, which is in essence a round-robin of tasks with + * equal priority. 
+ */ + if (!((task_timeslice(p) - p->time_slice) % TIMESLICE_GRANULARITY) && + (p->array == rq->active)) { + dequeue_task(p, rq->active); + set_tsk_need_resched(p); + p->prio = effective_prio(p); + enqueue_task(p, rq->active); + } } out_unlock: spin_unlock(&rq->lock); @@ -1264,6 +1315,8 @@ asmlinkage void schedule(void) runqueue_t *rq; prio_array_t *array; struct list_head *queue; + unsigned long long now; + unsigned long run_time; int idx; /* @@ -1284,7 +1337,11 @@ need_resched: rq = this_rq(); release_kernel_lock(prev); - prev->last_run = jiffies; + now = sched_clock(); + if (likely(now - prev->timestamp < MAX_SLEEP_AVG)) + run_time = now - prev->timestamp; + else + run_time = MAX_SLEEP_AVG; spin_lock_irq(&rq->lock); /* @@ -1332,12 +1389,30 @@ pick_next_task: queue = array->queue + idx; next = list_entry(queue->next, task_t, run_list); + if (next->activated) { + unsigned long long delta = now - next->timestamp; + + if (next->activated == 1) + delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; + + next->activated = 0; + array = next->array; + dequeue_task(next, array); + recalc_task_prio(next, next->timestamp + delta); + enqueue_task(next, array); + } switch_tasks: prefetch(next); clear_tsk_need_resched(prev); RCU_qsctr(task_cpu(prev))++; + prev->sleep_avg -= run_time; + if ((long)prev->sleep_avg < 0) + prev->sleep_avg = 0; + prev->timestamp = now; + if (likely(prev != next)) { + next->timestamp = now; rq->nr_switches++; rq->curr = next; @@ -1577,6 +1652,7 @@ void set_user_nice(task_t *p, long nice) unsigned long flags; prio_array_t *array; runqueue_t *rq; + int old_prio, new_prio, delta; if (TASK_NICE(p) == nice || nice < -20 || nice > 19) return; @@ -1585,6 +1661,12 @@ void set_user_nice(task_t *p, long nice) * the task might be in the middle of scheduling on another CPU. 
*/ rq = task_rq_lock(p, &flags); + /* + * The RT priorities are set via setscheduler(), but we still + * allow the 'normal' nice value to be set - but as expected + * it won't have any effect on scheduling while the task is + * not SCHED_NORMAL: + */ if (rt_task(p)) { p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; @@ -1592,16 +1674,20 @@ void set_user_nice(task_t *p, long nice) array = p->array; if (array) dequeue_task(p, array); + + old_prio = p->prio; + new_prio = NICE_TO_PRIO(nice); + delta = new_prio - old_prio; p->static_prio = NICE_TO_PRIO(nice); - p->prio = NICE_TO_PRIO(nice); + p->prio += delta; + if (array) { enqueue_task(p, array); /* - * If the task is running and lowered its priority, - * or increased its priority then reschedule its CPU: + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: */ - if ((NICE_TO_PRIO(nice) < p->static_prio) || - task_running(rq, p)) + if (delta < 0 || (delta > 0 && task_running(rq, p))) resched_task(rq->curr); } out_unlock: @@ -2358,6 +2444,12 @@ static void move_task_away(struct task_s local_irq_restore(flags); } +typedef struct { + int cpu; + struct completion startup_done; + task_t *task; +} migration_startup_t; + /* * migration_thread - this is a highprio system thread that performs * thread migration by bumping thread off CPU then 'pushing' onto @@ -2367,20 +2459,21 @@ static int migration_thread(void * data) { /* Marking "param" __user is ok, since we do a set_fs(KERNEL_DS); */ struct sched_param __user param = { .sched_priority = MAX_RT_PRIO-1 }; - int cpu = (long) data; + migration_startup_t *startup = data; + int cpu = startup->cpu; runqueue_t *rq; int ret; + startup->task = current; + complete(&startup->startup_done); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + + BUG_ON(smp_processor_id() != cpu); + daemonize("migration/%d", cpu); set_fs(KERNEL_DS); - /* - * Either we are running on the right CPU, or there's a a - * migration thread on 
this CPU, guaranteed (we're started - * serially). - */ - set_cpus_allowed(current, 1UL << cpu); - ret = setscheduler(0, SCHED_FIFO, &param); rq = this_rq(); @@ -2416,13 +2509,30 @@ static int migration_call(struct notifie unsigned long action, void *hcpu) { + long cpu = (long) hcpu; + migration_startup_t startup; + switch (action) { case CPU_ONLINE: - printk("Starting migration thread for cpu %li\n", - (long)hcpu); - kernel_thread(migration_thread, hcpu, CLONE_KERNEL); - while (!cpu_rq((long)hcpu)->migration_thread) + + printk("Starting migration thread for cpu %li\n", cpu); + + startup.cpu = cpu; + startup.task = NULL; + init_completion(&startup.startup_done); + + kernel_thread(migration_thread, &startup, CLONE_KERNEL); + wait_for_completion(&startup.startup_done); + wait_task_inactive(startup.task); + + startup.task->thread_info->cpu = cpu; + startup.task->cpus_allowed = cpumask_of_cpu(cpu); + + wake_up_process(startup.task); + + while (!cpu_rq(cpu)->migration_thread) yield(); + break; } return NOTIFY_OK;