Index: linux-2.6.11-ck7/kernel/sched.c
===================================================================
--- linux-2.6.11-ck7.orig/kernel/sched.c	2005-05-01 10:36:06.000000000 +1000
+++ linux-2.6.11-ck7/kernel/sched.c	2005-05-01 10:36:06.000000000 +1000
@@ -16,9 +16,9 @@
  *		by Davide Libenzi, preemptible kernel bits by Robert Love.
  *  2003-09-03	Interactivity tuning by Con Kolivas.
  *  2004-04-02	Scheduler domains code by Nick Piggin
- *  2005-03-09	New staircase scheduling policy by Con Kolivas with help
+ *  2005-04-20	New staircase scheduling policy by Con Kolivas with help
  *		from William Lee Irwin III, Zwane Mwaikambo & Peter Williams.
- *		Staircase v10.6
+ *		Staircase v11
  */
 
 #include <linux/mm.h>
@@ -122,6 +122,7 @@ struct runqueue {
 	 * it on another CPU. Always updated under the runqueue lock:
 	 */
 	unsigned long nr_uninterruptible;
+	unsigned long systime_centile;
 
 	unsigned long long timestamp_last_tick;
 	unsigned int cache_ticks, preempted;
@@ -333,7 +334,7 @@ struct file_operations proc_schedstat_op
 /*
  * rq_lock - lock a given runqueue and disable interrupts.
  */
-static runqueue_t *this_rq_lock(void)
+static inline runqueue_t *this_rq_lock(void)
 	__acquires(rq->lock)
 {
 	runqueue_t *rq;
@@ -473,16 +474,16 @@ static inline void sched_info_switch(tas
 static inline unsigned long ns_diff(unsigned long long v1, unsigned long long v2)
 {
 	unsigned long long vdiff;
-	if (unlikely(v1 < v2))
+	if (likely(v1 > v2)) {
+		vdiff = v1 - v2;
+		if (vdiff > (1ULL << 31))	/* int 1 << 31 would overflow */
+			vdiff = 1ULL << 31;
+	} else
 		/*
-		 * Rarely the clock goes backwards. There should always be
-		 * a positive difference so return 1.
+		 * Rarely the clock appears to go backwards (or not advance).
+		 * There should always be a positive difference so return 1.
 		 */
 		vdiff = 1;
-	else
-		vdiff = v1 - v2;
-	if (vdiff > (1 << 31))
-		vdiff = 1 << 31;
 	return (unsigned long)vdiff;
 }
 
@@ -512,12 +513,12 @@ static inline void enqueue_task(struct t
  * Put task to the end of the run list without the overhead of dequeue
  * followed by enqueue.
  */
-static void requeue_task(struct task_struct *p, runqueue_t *rq)
+static inline void requeue_task(struct task_struct *p, runqueue_t *rq)
 {
 	list_move_tail(&p->run_list, rq->queue + p->prio);
 }
 
-static void enqueue_task_head(struct task_struct *p, runqueue_t *rq)
+static inline void enqueue_task_head(struct task_struct *p, runqueue_t *rq)
 {
 	list_add(&p->run_list, rq->queue + p->prio);
 	__set_bit(p->prio, rq->bitmap);
@@ -545,7 +546,7 @@ static inline void __activate_idle_task(
  * burst - extra intervals an interactive task can run for at best priority
  * instead of descending priorities.
  */
-static unsigned int burst(task_t *p)
+static inline unsigned int burst(task_t *p)
 {
 	unsigned int burst = p->burst;
 
@@ -575,7 +576,7 @@ static void dec_burst(task_t *p)
 		p->burst--;
 }
 
-static unsigned int rr_interval(task_t * p)
+static inline unsigned int rr_interval(task_t * p)
 {
 	unsigned int rr_interval = RR_INTERVAL();
 	int nice = TASK_NICE(p);
@@ -674,13 +675,16 @@ static void continue_slice(task_t *p)
  * slice instead of starting a new one at high priority.
  */
 static inline void recalc_task_prio(task_t *p, unsigned long long now,
-	unsigned long rq_load)
+	unsigned long rq_systime, unsigned long rq_running)
 {
-	unsigned long sleep_time;
-	if (rq_load > 31)
-		rq_load = 31;
-	sleep_time = ns_diff(now, p->timestamp) /
-		(1 << rq_load);
+	unsigned long sleep_time = ns_diff(now, p->timestamp);
+
+	/*
+	 * Priority is elevated back to best by the amount of sleep_time.
+	 * sleep_time is halved, then scaled down by in-kernel system time
+	 * and by the number of tasks currently running.
+	 */
+	sleep_time = sleep_time * (100 - rq_systime) / 200 / (rq_running + 1);
 
 	p->totalrun += p->runtime;
 	if (NS_TO_JIFFIES(p->totalrun) >= p->slice &&
@@ -735,7 +739,7 @@ static void activate_task(task_t *p, run
 #endif
 	p->slice = slice(p);
 	p->time_slice = rr_interval(p);
-	recalc_task_prio(p, now, rq->nr_running);
+	recalc_task_prio(p, now, rq->systime_centile / 100, rq->nr_running);
 	p->flags &= ~PF_UISLEEP;
 	p->prio = effective_prio(p);
 	p->timestamp = now;
@@ -924,7 +928,7 @@ static inline unsigned long target_load(
  * Returns the CPU we should wake onto.
  */
 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
-static int wake_idle(int cpu, task_t *p)
+static inline int wake_idle(int cpu, task_t *p)
 {
 	cpumask_t tmp;
 	struct sched_domain *sd;
@@ -965,12 +969,8 @@ static int cache_delay = 10 * HZ / 1000;
  */
 static void preempt(task_t *p, runqueue_t *rq)
 {
-	if (p->prio > rq->curr->prio)
+	if (p->prio >= rq->curr->prio)
 		return;
-	if (p->prio == rq->curr->prio &&
-		((p->totalrun || p->slice != slice(p)) ||
-		rt_task(rq->curr)))
-			return;
 	if (!sched_compute || rq->cache_ticks >= cache_delay ||
 		!p->mm || rt_task(p))
 			resched_task(rq->curr);
@@ -1477,7 +1477,7 @@ static int find_idlest_cpu(struct task_s
  * allow dest_cpu, which will force the cpu onto dest_cpu.  Then
  * the cpu_allowed mask is restored.
  */
-static void sched_migrate_task(task_t *p, int dest_cpu)
+static inline void sched_migrate_task(task_t *p, int dest_cpu)
 {
 	migration_req_t req;
 	runqueue_t *rq;
@@ -1543,7 +1543,7 @@ out:
  * pull_task - move a task from a remote runqueue to the local runqueue.
  * Both runqueues must be locked.
  */
-static void pull_task(runqueue_t *src_rq, task_t *p,
+static inline void pull_task(runqueue_t *src_rq, task_t *p,
 		runqueue_t *this_rq, int this_cpu)
 {
 	dequeue_task(p, src_rq);
@@ -1563,7 +1563,7 @@ static void pull_task(runqueue_t *src_rq
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
-static int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
+static inline int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
 		     struct sched_domain *sd, enum idle_type idle)
 {
 	/*
@@ -1821,7 +1821,7 @@ static runqueue_t *find_busiest_queue(st
  *
  * Called with this_rq unlocked.
  */
-static int load_balance(int this_cpu, runqueue_t *this_rq,
+static inline int load_balance(int this_cpu, runqueue_t *this_rq,
 			struct sched_domain *sd, enum idle_type idle)
 {
 	struct sched_group *group;
@@ -1927,7 +1927,7 @@ out_balanced:
  * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
  * this_rq is locked.
  */
-static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
+static inline int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 				struct sched_domain *sd)
 {
 	struct sched_group *group;
@@ -1967,7 +1967,7 @@ out:
  * idle_balance is called by schedule() if this_cpu is about to become
  * idle. Attempts to pull tasks from other CPUs.
  */
-static void idle_balance(int this_cpu, runqueue_t *this_rq)
+static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
 {
 	struct sched_domain *sd;
 
@@ -1989,7 +1989,7 @@ static void idle_balance(int this_cpu, r
  *
  * Called with busiest_rq locked.
  */
-static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
+static inline void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
 {
 	struct sched_domain *sd;
 	struct sched_group *cpu_group;
@@ -2111,7 +2111,7 @@ static inline void idle_balance(int cpu,
 }
 #endif
 
-static int wake_priority_sleeper(runqueue_t *rq)
+static inline int wake_priority_sleeper(runqueue_t *rq)
 {
 	int ret = 0;
 #ifdef CONFIG_SCHED_SMT
@@ -2256,6 +2256,9 @@ void account_system_time(struct task_str
 		cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
 	else
 		cpustat->idle = cputime64_add(cpustat->idle, tmp);
+
+	/* For calculating rolling percentage of sys time per runqueue */
+	rq->systime_centile += cputime * 100;
 }
 
 /*
@@ -2301,6 +2304,9 @@ void scheduler_tick(void)
 
 	rq->timestamp_last_tick = sched_clock();
 
+	/* Rolling percentage systime per runqueue */
+	rq->systime_centile = rq->systime_centile * 99 / 100;
+
 	if (p == rq->idle) {
 		if (wake_priority_sleeper(rq))
 			goto out;
@@ -3449,7 +3455,7 @@ out_unlock:
 	return retval;
 }
 
-static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
+static inline int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
 			     cpumask_t *new_mask)
 {
 	if (len < sizeof(cpumask_t)) {
@@ -3798,7 +3804,7 @@ static inline struct task_struct *younge
 	return list_entry(p->sibling.next,struct task_struct,sibling);
 }
 
-static void show_task(task_t * p)
+static inline void show_task(task_t * p)
 {
 	task_t *relative;
 	unsigned state;
