Index: linux-2.6.9-rc1-mm1/fs/proc/array.c
===================================================================
--- linux-2.6.9-rc1-mm1.orig/fs/proc/array.c	2004-08-26 22:27:37.000000000 +1000
+++ linux-2.6.9-rc1-mm1/fs/proc/array.c	2004-08-26 22:46:26.000000000 +1000
@@ -159,8 +159,7 @@ static inline char * task_state(struct t
 	read_lock(&tasklist_lock);
 	buffer += sprintf(buffer,
 		"State:\t%s\n"
-		"sleep_time:\t%lu\n"
-		"total_time:\t%lu\n"
+		"Burst:\t%d\n"
 		"Tgid:\t%d\n"
 		"Pid:\t%d\n"
 		"PPid:\t%d\n"
@@ -168,7 +167,7 @@ static inline char * task_state(struct t
 		"Uid:\t%d\t%d\t%d\t%d\n"
 		"Gid:\t%d\t%d\t%d\t%d\n",
 		get_task_state(p),
-		p->sleep_time, p->total_time,
+		p->burst,
 	       	p->tgid,
 		p->pid, p->pid ? p->real_parent->pid : 0,
 		p->pid && p->ptrace ? p->parent->pid : 0,
Index: linux-2.6.9-rc1-mm1/include/linux/init_task.h
===================================================================
--- linux-2.6.9-rc1-mm1.orig/include/linux/init_task.h	2004-08-26 22:27:38.000000000 +1000
+++ linux-2.6.9-rc1-mm1/include/linux/init_task.h	2004-08-26 22:31:29.000000000 +1000
@@ -73,13 +73,14 @@ extern struct group_info init_groups;
 	.usage		= ATOMIC_INIT(2),				\
 	.flags		= 0,						\
 	.lock_depth	= -1,						\
-	.prio		= MAX_PRIO-29,					\
-	.static_prio	= MAX_PRIO-29,					\
+	.prio		= MAX_PRIO-20,					\
+	.static_prio	= MAX_PRIO-20,					\
 	.policy		= SCHED_NORMAL,					\
 	.cpus_allowed	= CPU_MASK_ALL,					\
 	.mm		= NULL,						\
 	.active_mm	= &init_mm,					\
 	.run_list	= LIST_HEAD_INIT(tsk.run_list),			\
+	.time_slice	= HZ,						\
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
 	.ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children),		\
 	.ptrace_list	= LIST_HEAD_INIT(tsk.ptrace_list),		\
Index: linux-2.6.9-rc1-mm1/include/linux/sched.h
===================================================================
--- linux-2.6.9-rc1-mm1.orig/include/linux/sched.h	2004-08-26 22:27:38.000000000 +1000
+++ linux-2.6.9-rc1-mm1/include/linux/sched.h	2004-08-26 22:46:26.000000000 +1000
@@ -164,6 +164,7 @@ extern void show_stack(struct task_struc
 
 void io_schedule(void);
 long io_schedule_timeout(long timeout);
+extern int sched_interactive, sched_compute;
 
 extern void cpu_init (void);
 extern void trap_init(void);
@@ -330,7 +331,7 @@ struct signal_struct {
 #define MAX_USER_RT_PRIO	100
 #define MAX_RT_PRIO		MAX_USER_RT_PRIO
 
-#define MAX_PRIO		(MAX_RT_PRIO + 59)
+#define MAX_PRIO		(MAX_RT_PRIO + 40)
 
 #define rt_task(p)		(unlikely((p)->prio < MAX_RT_PRIO))
 
@@ -361,7 +362,6 @@ extern struct user_struct *find_user(uid
 extern struct user_struct root_user;
 #define INIT_USER (&root_user)
 
-typedef struct prio_array prio_array_t;
 struct backing_dev_info;
 struct reclaim_state;
 
@@ -451,17 +451,13 @@ struct task_struct {
 
 	int prio, static_prio;
 	struct list_head run_list;
-	prio_array_t *array;
-
-	/* Scheduler variables follow. kernel/sched.c */
-	unsigned long array_sequence;
 	unsigned long long timestamp;
-	int used_slice;
-
-	unsigned long total_time, sleep_time;
+	unsigned long runtime, totalrun;
+	unsigned int burst;
 
 	unsigned long policy;
 	cpumask_t cpus_allowed;
+	unsigned int slice, time_slice;
 
 #ifdef CONFIG_SCHEDSTATS
 	struct sched_info sched_info;
@@ -641,6 +637,9 @@ do { if (atomic_dec_and_test(&(tsk)->usa
 #define PF_SWAPOFF	0x00080000	/* I am in swapoff */
 #define PF_LESS_THROTTLE 0x00100000	/* Throttle me less: I clean memory */
 #define PF_SYNCWRITE	0x00200000	/* I am doing a sync write */
+#define PF_FORKED	0x00400000	/* I have just forked */
+#define PF_YIELDED	0x00800000	/* I have just yielded */
+#define PF_UISLEEP	0x01000000	/* Uninterruptible sleep */
 
 #ifdef CONFIG_SMP
 extern int set_cpus_allowed(task_t *p, cpumask_t new_mask);
@@ -724,7 +723,6 @@ extern void FASTCALL(wake_up_new_task(st
  static inline void kick_process(struct task_struct *tsk) { }
 #endif
 extern void FASTCALL(sched_fork(task_t * p));
-extern void FASTCALL(sched_exit(task_t * p));
 
 extern int in_group_p(gid_t);
 extern int in_egroup_p(gid_t);
Index: linux-2.6.9-rc1-mm1/include/linux/sysctl.h
===================================================================
--- linux-2.6.9-rc1-mm1.orig/include/linux/sysctl.h	2004-08-26 22:27:38.000000000 +1000
+++ linux-2.6.9-rc1-mm1/include/linux/sysctl.h	2004-08-26 22:46:26.000000000 +1000
@@ -134,7 +134,8 @@ enum
 	KERN_SPARC_SCONS_PWROFF=64, /* int: serial console power-off halt */
 	KERN_HZ_TIMER=65,	/* int: hz timer on or off */
 	KERN_UNKNOWN_NMI_PANIC=66, /* int: unknown nmi panic flag */
-	KERN_SCHED_TIMESLICE=67, /* int: base timeslice for scheduler */
+	KERN_INTERACTIVE=67,	/* interactive tasks can have cpu bursts */
+	KERN_COMPUTE=68,	/* adjust timeslices for a compute server */
 };
 
 
Index: linux-2.6.9-rc1-mm1/init/main.c
===================================================================
--- linux-2.6.9-rc1-mm1.orig/init/main.c	2004-08-26 22:38:56.000000000 +1000
+++ linux-2.6.9-rc1-mm1/init/main.c	2004-08-26 22:46:26.000000000 +1000
@@ -684,6 +684,7 @@ static inline void fixup_cpu_present_map
 static int init(void * unused)
 {
 	lock_kernel();
+	current->prio = MAX_PRIO - 1;
 	/*
 	 * Tell the world that we're going to be the grim
 	 * reaper of innocent orphaned children.
Index: linux-2.6.9-rc1-mm1/kernel/exit.c
===================================================================
--- linux-2.6.9-rc1-mm1.orig/kernel/exit.c	2004-08-26 22:38:56.000000000 +1000
+++ linux-2.6.9-rc1-mm1/kernel/exit.c	2004-08-26 22:46:26.000000000 +1000
@@ -93,7 +93,6 @@ repeat: 
 	}
 
 	perfctr_release_task(p);
-	sched_exit(p);
 	write_unlock_irq(&tasklist_lock);
 	spin_unlock(&p->proc_lock);
 	proc_pid_flush(proc_dentry);
Index: linux-2.6.9-rc1-mm1/kernel/sched.c
===================================================================
--- linux-2.6.9-rc1-mm1.orig/kernel/sched.c	2004-08-26 22:27:38.000000000 +1000
+++ linux-2.6.9-rc1-mm1/kernel/sched.c	2004-08-26 22:46:26.000000000 +1000
@@ -16,6 +16,8 @@
  *		by Davide Libenzi, preemptible kernel bits by Robert Love.
  *  2003-09-03	Interactivity tuning by Con Kolivas.
  *  2004-04-02	Scheduler domains code by Nick Piggin
+ *  2004-07-07	New staircase scheduling policy by Con Kolivas with help
+ *		from William Lee Irwin III, Zwane Mwaikambo & Peter Williams.
  */
 
 #include <linux/mm.h>
@@ -54,68 +56,32 @@
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
  * and back.
  */
-#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 30)
-#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 30)
+#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
+#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
 #define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)
 
 /*
  * 'User priority' is the nice value converted to something we
  * can work with better when scaling various scheduler parameters,
- * it's a [ 0 ... 58 ] range.
+ * it's a [ 0 ... 39 ] range.
  */
 #define USER_PRIO(p)		((p)-MAX_RT_PRIO)
+#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
 #define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
 
-#define US_TO_JIFFIES(x)	((x) * HZ / 1000000)
-#define JIFFIES_TO_US(x)	((x) * 1000000 / HZ)
-
 /*
- * MIN_TIMESLICE is the timeslice that a minimum priority process gets if there
- * is a maximum priority process runnable. MAX_TIMESLICE is derived from the
- * formula in task_timeslice. It cannot be changed here. It is the timesilce
- * that the maximum priority process will get. Larger timeslices are attainable
- * by low priority processes however.
+ * Some helpers for converting nanosecond timing to jiffy resolution
  */
-int sched_base_timeslice = 64;
-int sched_min_base = 1;
-int sched_max_base = 10000;
-
-#define RT_TIMESLICE		(50 * 1000 / HZ)     /* 50ms */
-#define BASE_TIMESLICE		(sched_base_timeslice)
-#define MIN_TIMESLICE		1
-
-/* Maximum amount of history that will be used to calculate priority */
-#define MAX_SLEEP_SHIFT		19
-#define MAX_SLEEP		(1UL << MAX_SLEEP_SHIFT) /* roughly 0.52s */
+#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
 
+int sched_compute = 0;
 /*
- * Maximum effect that 1 block of activity (run/sleep/etc) can have. This is
- * will moderate dicard freak events (eg. SIGSTOP)
+ *This is the time all tasks within the same priority round robin.
+ *compute setting is reserved for dedicated computational scheduling
+ *and has ten times larger intervals.
  */
-#define MAX_SLEEP_AFFECT	(MAX_SLEEP/4)
-
-/*
- * The amount of history can be decreased (on fork for example). This puts a
- * lower bound on it.
- */
-#define MIN_HISTORY		(MAX_SLEEP/8)
-
-#define FORKED_TS_MAX		(US_TO_JIFFIES(MIN_HISTORY) ?: 1)
-
-/*
- * SLEEP_FACTOR is a fixed point factor used to scale history tracking things.
- * In particular: total_time, sleep_time, sleep_avg.
- */
-#define SLEEP_FACTOR		1024
-
-/*
- * The scheduler classifies a process as performing one of the following
- * activities
- */
-#define STIME_SLEEP		1	/* Sleeping */
-#define STIME_RUN		2	/* Using CPU */
-
-#define TASK_PREEMPTS_CURR(p, rq)	( (p)->prio < (rq)->curr->prio )
+#define _RR_INTERVAL		((10 * HZ / 1000) ? : 1)
+#define RR_INTERVAL()		(_RR_INTERVAL * (1 + 9 * sched_compute))
 
 #define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time)
 
@@ -133,17 +99,8 @@ struct sched_domain;
  * These are the runqueue data structures:
  */
 
-#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
-
 typedef struct runqueue runqueue_t;
 
-struct prio_array {
-	int min_prio;
-	unsigned int nr_active;
-	unsigned long bitmap[BITMAP_SIZE];
-	struct list_head queue[MAX_PRIO];
-};
-
 /*
  * This is the main, per-CPU runqueue data structure.
  *
@@ -162,17 +119,17 @@ struct runqueue {
 #ifdef CONFIG_SMP
 	unsigned long cpu_load;
 #endif
-	unsigned long array_sequence;
-	unsigned long nr_uninterruptible;
 	unsigned long long nr_switches;
+	unsigned long nr_uninterruptible;
+	unsigned long long timestamp_last_tick;
+	unsigned int cache_ticks, preempted;
 	task_t *curr, *idle;
 	struct mm_struct *prev_mm;
+	unsigned long bitmap[BITS_TO_LONGS(MAX_PRIO+1)];
+	struct list_head queue[MAX_PRIO + 1];
 	atomic_t nr_iowait;
-	prio_array_t *active, *expired, arrays[2];
 
 #ifdef CONFIG_SMP
-	unsigned long long timestamp_last_tick;
-
 	struct sched_domain *sd;
 
 	/* For active balancing */
@@ -326,7 +283,7 @@ struct sched_domain {
 	.max_interval		= 4,			\
 	.busy_factor		= 64,			\
 	.imbalance_pct		= 125,			\
-	.cache_hot_time		= (5*1000/2),		\
+	.cache_hot_time		= (5*1000000/2),	\
 	.cache_nice_tries	= 1,			\
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_BALANCE_NEWIDLE	\
@@ -348,7 +305,7 @@ struct sched_domain {
 	.max_interval		= 32,			\
 	.busy_factor		= 32,			\
 	.imbalance_pct		= 125,			\
-	.cache_hot_time		= (10*1000),		\
+	.cache_hot_time		= (10*1000000),		\
 	.cache_nice_tries	= 1,			\
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_BALANCE_EXEC	\
@@ -502,6 +459,20 @@ struct file_operations proc_schedstat_op
 # define schedstat_add(rq, field, amt)	do { } while (0);
 #endif
 
+/*
+ * rq_lock - lock a given runqueue and disable interrupts.
+ */
+static runqueue_t *this_rq_lock(void)
+{
+	runqueue_t *rq;
+
+	local_irq_disable();
+	rq = this_rq();
+	spin_lock(&rq->lock);
+
+	return rq;
+}
+
 static inline void rq_unlock(runqueue_t *rq)
 {
 	spin_unlock_irq(&rq->lock);
@@ -613,34 +584,26 @@ static inline void sched_info_switch(tas
 #define sched_info_switch(t, next)	do { } while (0)
 #endif /* CONFIG_SCHEDSTATS */
 
+static inline int task_queued(task_t *task)
+{
+	return !list_empty(&task->run_list);
+}
+
 /*
- * Adding/removing a task to/from a priority array:
+ * Adding/removing a task to/from a runqueue:
  */
-static void dequeue_task(struct task_struct *p, prio_array_t *array)
+static void dequeue_task(struct task_struct *p, runqueue_t *rq)
 {
-	array->nr_active--;
-	list_del(&p->run_list);
-	if (list_empty(array->queue + p->prio))
-		__clear_bit(p->prio, array->bitmap);
+	list_del_init(&p->run_list);
+	if (list_empty(rq->queue + p->prio))
+		__clear_bit(p->prio, rq->bitmap);
 }
 
-static void enqueue_task(struct task_struct *p, prio_array_t *array)
+static void enqueue_task(struct task_struct *p, runqueue_t *rq)
 {
-	struct list_head *entry = array->queue + p->prio;
 	sched_info_queued(p);
-
-	if (!rt_task(p)) {
-		/*
-		 * Cycle tasks on the same priority level. This reduces their
-		 * timeslice fluctuations due to higher priority tasks expiring.
-		 */
-		if (!list_empty(entry))
-			entry = entry->next;
-	}
-	list_add_tail(&p->run_list, entry);
-	__set_bit(p->prio, array->bitmap);
-	array->nr_active++;
-	p->array = array;
+	list_add_tail(&p->run_list, rq->queue + p->prio);
+	__set_bit(p->prio, rq->bitmap);
 }
 
 /*
@@ -648,153 +611,158 @@ static void enqueue_task(struct task_str
  * remote queue so we want these tasks to show up at the head of the
  * local queue:
  */
-static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
+static inline void enqueue_task_head(struct task_struct *p, runqueue_t *rq)
 {
-	list_add(&p->run_list, array->queue + p->prio);
-	__set_bit(p->prio, array->bitmap);
-	array->nr_active++;
-	p->array = array;
+	list_add(&p->run_list, rq->queue + p->prio);
+	__set_bit(p->prio, rq->bitmap);
 }
 
-static inline unsigned long long clock_us(void)
+/*
+ * __activate_task - move a task to the runqueue.
+ */
+static inline void __activate_task(task_t *p, runqueue_t *rq)
 {
-	return sched_clock() >> 10;
+	enqueue_task(p, rq);
+	rq->nr_running++;
 }
 
 /*
- * add_task_time updates a task @p after @time of doing the specified @type
- * of activity. See STIME_*. This is used for priority calculation.
+ * __activate_idle_task - move idle task to the _front_ of runqueue.
  */
-static inline void add_task_time(task_t *p, unsigned long long time, unsigned long type)
+static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 {
-	unsigned long ratio;
-	unsigned long long tmp;
-	unsigned long t;
-
-	if (type == STIME_SLEEP) {
-		if (time > MAX_SLEEP_AFFECT*4)
-			time = MAX_SLEEP_AFFECT*4;
-		t = ((unsigned long)time + 3) / 4;
-	} else {
-		unsigned long div = 60 - USER_PRIO(p->static_prio);
-		t = (unsigned long)time * 30;
-		t = t / div;
-		t = t * 30;
-		t = t / div;
-	}
-
-	ratio = MAX_SLEEP - t;
-	tmp = (unsigned long long)ratio*p->total_time + MAX_SLEEP/2;
-	tmp >>= MAX_SLEEP_SHIFT;
-	p->total_time = (unsigned long)tmp;
+	enqueue_task_head(p, rq);
+	rq->nr_running++;
+}
 
-	tmp = (unsigned long long)ratio*p->sleep_time + MAX_SLEEP/2;
-	tmp >>= MAX_SLEEP_SHIFT;
-	p->sleep_time = (unsigned long)tmp;
+/*
+ * burst - extra intervals an interactive task can run for at best priority
+ * instead of descending priorities.
+ */
+static unsigned int burst(task_t *p)
+{
+	if (likely(!rt_task(p))) {
+		unsigned int task_user_prio = TASK_USER_PRIO(p);
+		return 39 - task_user_prio;
+	} else
+		return p->burst;
+}
 
-	p->total_time += t;
-	if (type == STIME_SLEEP)
-		p->sleep_time += t;
+static void inc_burst(task_t *p)
+{
+	unsigned int best_burst;
+	best_burst = burst(p);
+	if (p->burst < best_burst)
+		p->burst++;
 }
 
-static unsigned long task_sleep_avg(task_t *p)
+static void dec_burst(task_t *p)
 {
-	return (SLEEP_FACTOR * p->sleep_time) / (p->total_time + 1);
+	if (p->burst)
+		p->burst--;
 }
 
 /*
- * The higher a thread's priority, the bigger timeslices
- * it gets during one round of execution. But even the lowest
- * priority thread gets MIN_TIMESLICE worth of execution time.
- *
- * Timeslices are scaled, so if only low priority processes are running,
- * they will all get long timeslices.
+ * slice - the duration a task runs before getting requeued at it's best
+ * priority and has it's burst decremented.
  */
-static int task_timeslice(task_t *p, runqueue_t *rq)
+static unsigned int slice(task_t *p)
 {
-	int idx, base, delta;
-	int timeslice;
-
-	if (rt_task(p))
-		return RT_TIMESLICE;
-
-	idx = min(p->prio, rq->expired->min_prio);
-	delta = p->prio - idx;
-	base = BASE_TIMESLICE * (MAX_USER_PRIO + 1) / (delta + 2);
-
-	base = base * 40 / (70 - USER_PRIO(idx));
-	base = base * 40 / (70 - USER_PRIO(idx));
+	unsigned int slice = RR_INTERVAL();
+	if (likely(!rt_task(p)))
+		slice += burst(p) * RR_INTERVAL();
+	return slice;
+}
 
-	timeslice = base * 1000 / HZ;
-	timeslice >>= 5;
-	if (timeslice < MIN_TIMESLICE)
-		timeslice = MIN_TIMESLICE;
+/*
+ * sched_interactive - sysctl which allows interactive tasks to have bursts
+ */
+int sched_interactive = 1;
 
-	return timeslice;
+static int rr_interval(task_t * p)
+{
+	int rr_interval = RR_INTERVAL();
+	if (unlikely(!rr_interval))
+		rr_interval = 1;
+	return rr_interval;
 }
 
 /*
- * task_priority: calculates a task's priority based on previous running
- * history (see add_task_time). The priority is just a simple linear function
- * based on sleep_avg and static_prio.
+ * effective_prio - dynamic priority dependent on burst.
+ * The priority normally decreases by one each RR_INTERVAL.
+ * As the burst increases the priority stays at the top "stair" or
+ * priority for longer.
  */
-static int task_priority(task_t *p)
+static int effective_prio(task_t *p)
 {
-	unsigned long sleep_avg;
-	int bonus, prio;
-
+	int prio, rr;
+	unsigned int full_slice, used_slice, first_slice;
+	unsigned int best_burst;
 	if (rt_task(p))
 		return p->prio;
 
-	sleep_avg = task_sleep_avg(p);
-
-	prio = USER_PRIO(p->static_prio) + 10;
-	bonus = (((MAX_USER_PRIO + 1) / 3) * sleep_avg + (SLEEP_FACTOR / 2))
-					/ SLEEP_FACTOR;
-	prio = MAX_RT_PRIO + prio - bonus;
-
-	if (prio < MAX_RT_PRIO)
-		return MAX_RT_PRIO;
-	if (prio > MAX_PRIO-1)
-		return MAX_PRIO-1;
-
+	best_burst = burst(p);
+	full_slice = slice(p);
+	rr = rr_interval(p);
+	used_slice = full_slice - p->slice;
+	if (p->burst > best_burst)
+		p->burst = best_burst;
+	first_slice = rr;
+	if (sched_interactive && !sched_compute && p->mm)
+		first_slice *= (p->burst + 1);
+	prio = MAX_PRIO - 1 - best_burst;
+
+	if (used_slice < first_slice)
+		return prio;
+	prio += 1 + (used_slice - first_slice) / rr;
+	if (prio > MAX_PRIO - 1)
+		prio = MAX_PRIO - 1;
 	return prio;
 }
 
 /*
- * __activate_task - move a task to the runqueue.
- */
-static inline void __activate_task(task_t *p, runqueue_t *rq, prio_array_t *array)
-{
-	enqueue_task(p, array);
-	rq->nr_running++;
-	if (!rt_task(p)) {
-		if (p->prio < array->min_prio)
-			array->min_prio = p->prio;
+ * recalc_task_prio - this checks for tasks that run ultra short timeslices
+ * or have just forked a thread/process and make them continue their old
+ * slice instead of starting a new one at high priority.
+ */
+static void recalc_task_prio(task_t *p, unsigned long long now)
+{
+	unsigned long sleep_time = now - p->timestamp;
+	unsigned int rr = rr_interval(p);
+	unsigned int minrun = rr * (p->burst + 1) / (burst(p) + 1) ? : 1;
+	if (p->flags & PF_FORKED || 
+		(NS_TO_JIFFIES(p->runtime + sleep_time) < minrun || 
+		((!sched_interactive || sched_compute) && 
+		NS_TO_JIFFIES(p->runtime + sleep_time) < rr))) {
+			unsigned long ns_totalrun = p->totalrun + p->runtime;
+			unsigned long total_run = NS_TO_JIFFIES(ns_totalrun);
+			p->flags &= ~PF_FORKED;
+			if (p->slice - total_run < 1) {
+				p->totalrun = 0;
+				dec_burst(p);
+			} else {
+				unsigned int intervals = total_run / rr;
+				p->totalrun = ns_totalrun;
+				p->slice -= intervals * rr;
+				if (p->slice <= rr) {
+					p->totalrun = 0;
+					dec_burst(p);
+				}
+			}
+	} else {
+		if (!(p->flags & PF_UISLEEP))
+			inc_burst(p);
+		p->runtime = 0;
+		p->totalrun = 0;
 	}
 }
 
 /*
- * __activate_idle_task - move idle task to the _front_ of runqueue.
- */
-static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
-{
-	enqueue_task_head(p, rq->active);
-	rq->nr_running++;
-}
-
-/*
  * activate_task - move a task to the runqueue and do priority recalculation
- *
- * Update all the scheduling statistics stuff. (sleep average
- * calculation, priority modifiers, etc.)
  */
 static void activate_task(task_t *p, runqueue_t *rq, int local)
 {
-	unsigned long long now, sleep;
-	prio_array_t *array;
-
-	now = clock_us();
+	unsigned long long now = sched_clock();
 #ifdef CONFIG_SMP
 	if (!local) {
 		/* Compensate for drifting sched_clock */
@@ -803,36 +771,26 @@ static void activate_task(task_t *p, run
 			+ rq->timestamp_last_tick;
 	}
 #endif
-	/*
-	 * If we have slept through an active/expired array switch, restart
-	 * our timeslice too.
-	 */
-
-	sleep = now - p->timestamp;
+	p->slice = slice(p);
+	recalc_task_prio(p, now);
+	p->flags &= ~PF_UISLEEP;
+	p->prio = effective_prio(p);
+	p->time_slice = rr_interval(p);
 	p->timestamp = now;
-	add_task_time(p, sleep, STIME_SLEEP);
-	p->prio = task_priority(p);
-
-	array = rq->active;
-	if (unlikely(p->used_slice == -1)) {
-		/* This only applys to newly woken children */
-		array = rq->expired;
-		p->used_slice = 0;
-	} else if (rq->array_sequence != p->array_sequence)
-		p->used_slice = 0;
-
-	__activate_task(p, rq, array);
+	__activate_task(p, rq);
 }
 
 /*
  * deactivate_task - remove a task from the runqueue.
  */
-static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
+static void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
-	p->array_sequence = rq->array_sequence;
 	rq->nr_running--;
-	dequeue_task(p, p->array);
-	p->array = NULL;
+	if (p->state == TASK_UNINTERRUPTIBLE) {
+		p->flags |= PF_UISLEEP;
+		rq->nr_uninterruptible++;
+	}
+	dequeue_task(p, rq);
 }
 
 /*
@@ -847,8 +805,7 @@ static void resched_task(task_t *p)
 {
 	int need_resched, nrpolling;
 
-	BUG_ON(!spin_is_locked(&task_rq(p)->lock));
-
+	preempt_disable();
 	/* minimise the chance of sending an interrupt to poll_idle() */
 	nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
 	need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
@@ -856,6 +813,7 @@ static void resched_task(task_t *p)
 
 	if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
 		smp_send_reschedule(task_cpu(p));
+	preempt_enable();
 }
 #else
 static inline void resched_task(task_t *p)
@@ -905,7 +863,7 @@ static int migrate_task(task_t *p, int d
 	 * If the task is not on a runqueue (and not running), then
 	 * it is sufficient to simply update the task's cpu field.
 	 */
-	if (!p->array && !task_running(rq, p)) {
+	if (!task_queued(p) && !task_running(rq, p)) {
 		set_task_cpu(p, dest_cpu);
 		return 0;
 	}
@@ -936,7 +894,7 @@ void wait_task_inactive(task_t * p)
 repeat:
 	rq = task_rq_lock(p, &flags);
 	/* Must be off runqueue entirely, not preempted. */
-	if (unlikely(p->array)) {
+	if (unlikely(task_queued(p))) {
 		/* If it's preempted, we yield.  It could be a while. */
 		preempted = !task_running(rq, p);
 		task_rq_unlock(rq, &flags);
@@ -1034,6 +992,26 @@ static inline int wake_idle(int cpu, tas
 }
 #endif
 
+/*
+ * cache_delay is the time preemption is delayed in sched_compute mode
+ * and is set to 5*cache_decay_ticks
+ */
+static int cache_delay = 10 * HZ / 1000;
+
+static int task_preempts_curr(struct task_struct *p, runqueue_t *rq)
+{
+	if (p->prio > rq->curr->prio)
+		return 0;
+	if (p->prio == rq->curr->prio && (p->totalrun ||
+		rt_task(rq->curr)))
+			return 0;
+	if (!sched_compute || rq->cache_ticks >= cache_delay ||
+		!p->mm || rt_task(p))
+			return 1;
+	rq->preempted = 1;
+		return 0;
+}
+
 /***
  * try_to_wake_up - wake up a thread
  * @p: the to-be-woken-up thread
@@ -1066,7 +1044,7 @@ static int try_to_wake_up(task_t * p, un
 	if (!(old_state & state))
 		goto out;
 
-	if (p->array)
+	if (task_queued(p))
 		goto out_running;
 
 	cpu = task_cpu(p);
@@ -1145,7 +1123,7 @@ out_set_cpu:
 		old_state = p->state;
 		if (!(old_state & state))
 			goto out;
-		if (p->array)
+		if (task_queued(p))
 			goto out_running;
 
 		this_cpu = smp_processor_id();
@@ -1156,12 +1134,20 @@ out_activate:
 #endif /* CONFIG_SMP */
 	if (old_state == TASK_UNINTERRUPTIBLE)
 		rq->nr_uninterruptible--;
+
+	/*
+	 * Sync wakeups (i.e. those types of wakeups where the waker
+	 * has indicated that it will leave the CPU in short order)
+	 * don't trigger a preemption, if the woken up task will run on
+	 * this cpu. (in this case the 'I will reschedule' promise of
+	 * the waker guarantees that the freshly woken up task is going
+	 * to be considered on this CPU.)
+	 */
 	activate_task(p, rq, cpu == this_cpu);
 	if (!sync || cpu != this_cpu) {
-		if (TASK_PREEMPTS_CURR(p, rq))
+		if (task_preempts_curr(p, rq))
 			resched_task(rq->curr);
 	}
-
 	success = 1;
 
 out_running:
@@ -1175,7 +1161,7 @@ out:
 int fastcall wake_up_process(task_t * p)
 {
 	return try_to_wake_up(p, TASK_STOPPED |
-				TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
+		       		 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
 }
 
 EXPORT_SYMBOL(wake_up_process);
@@ -1196,9 +1182,6 @@ static int find_idlest_cpu(struct task_s
  */
 void fastcall sched_fork(task_t *p)
 {
-	unsigned long sleep_avg;
-	runqueue_t *rq;
-
 	/*
 	 * We mark the process as running here, but have not actually
 	 * inserted it onto the runqueue yet. This guarantees that
@@ -1207,7 +1190,6 @@ void fastcall sched_fork(task_t *p)
 	 */
 	p->state = TASK_RUNNING;
 	INIT_LIST_HEAD(&p->run_list);
-	p->array = NULL;
 	spin_lock_init(&p->switch_lock);
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -1221,42 +1203,6 @@ void fastcall sched_fork(task_t *p)
 	 */
 	p->thread_info->preempt_count = 1;
 #endif
-
-	preempt_disable();
-	rq = this_rq();
-
-	/* XXX */
-	if (unlikely(p->comm[0] == 'X' && p->comm[1] == 'F')) {
-		static int warned = 0;
-		if (!warned) {
-			printk(KERN_INFO "Renicing %s for you\n", p->comm);
-			warned = 1;
-		}
-		p->static_prio = NICE_TO_PRIO(-10);
-	}
-
-	/* Get MIN_HISTORY of history with the same sleep_avg as parent. */
-	sleep_avg = task_sleep_avg(current);
-	p->total_time = MIN_HISTORY;
-	p->sleep_time = p->total_time * sleep_avg / SLEEP_FACTOR;
-
-	/* Parent loses 1/4 of sleep time for forking */
-	current->sleep_time = 3*current->sleep_time/4;
-
-	p->used_slice = 0;
-	local_irq_disable();
-	if (unlikely(current->used_slice == -1 || current == rq->idle))
-		p->used_slice = -1;
-	else {
-		int ts = task_timeslice(current, rq);
-		current->used_slice += (ts + 3) / 4;
-		if (current->used_slice >= ts) {
-			current->used_slice = -1;
-			set_need_resched();
-		}
-	}
-	local_irq_enable();
-	preempt_enable();
 }
 
 /*
@@ -1270,55 +1216,32 @@ void fastcall wake_up_new_task(task_t * 
 {
 	unsigned long flags;
 	int this_cpu, cpu;
-	runqueue_t *rq;
-	prio_array_t *array;
-
-	BUG_ON(p->state != TASK_RUNNING);
-
-	p->prio = task_priority(p);
-	p->timestamp = clock_us();
+	runqueue_t *rq, *this_rq;
 
 	rq = task_rq_lock(p, &flags);
-	this_cpu = smp_processor_id();
 	cpu = task_cpu(p);
+	this_cpu = smp_processor_id();
 
-	schedstat_inc(rq, wunt_cnt);
+	BUG_ON(p->state != TASK_RUNNING);
 
-	array = rq->active;
-	if (unlikely(p->used_slice == -1)) {
-		p->used_slice = 0;
-		array = rq->expired;
-	} else {
-		int total = task_timeslice(p, rq);
-		int ts = max((total + 3) / 4, MIN_TIMESLICE);
-		ts = min(ts, (int)FORKED_TS_MAX);
-		p->used_slice = total - ts;
-	}
+	schedstat_inc(rq, wunt_cnt);
+	/*
+	 * Forked process gets no burst to prevent fork bombs.
+	 */
+	p->burst = 0;
+	current->flags |= PF_FORKED;
 
 	if (likely(cpu == this_cpu)) {
-		if (!(clone_flags & CLONE_VM) && likely(array == rq->active)) {
-			/*
-			 * The VM isn't cloned, so we're in a good position to
-			 * do child-runs-first in anticipation of an exec. This
-			 * usually avoids a lot of COW overhead.
-			 */
-			if (p->prio >= current->prio) {
-				p->prio = current->prio;
-				list_add_tail(&p->run_list, &current->run_list);
-				p->array = current->array;
-				p->array->nr_active++;
-				rq->nr_running++;
-			} else
-				__activate_task(p, rq, array);
-
-			set_need_resched();
-		} else {
-			/* Run child last */
-			__activate_task(p, rq, array);
-		}
-#ifdef CONFIG_SMP
+		__activate_task(p, rq);
+		/*
+		 * We skip the following code due to cpu == this_cpu
+	 	 *
+		 *   task_rq_unlock(rq, &flags);
+		 *   this_rq = task_rq_lock(current, &flags);
+		 */
+		this_rq = rq;
 	} else {
-		runqueue_t *this_rq = this_rq();
+		this_rq = cpu_rq(this_cpu);
 
 		/*
 		 * Not the local CPU - must adjust timestamp. This should
@@ -1326,18 +1249,15 @@ void fastcall wake_up_new_task(task_t * 
 		 */
 		p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
 					+ rq->timestamp_last_tick;
-		__activate_task(p, rq, array);
-		if (TASK_PREEMPTS_CURR(p, rq))
+		__activate_task(p, rq);
+		if (task_preempts_curr(p, rq))
 			resched_task(rq->curr);
 
 		schedstat_inc(rq, wunt_moved);
-#endif
+		task_rq_unlock(rq, &flags);
+		this_rq = task_rq_lock(current, &flags);
 	}
-	task_rq_unlock(rq, &flags);
-}
-
-void fastcall sched_exit(task_t * p)
-{
+	task_rq_unlock(this_rq, &flags);
 }
 
 /**
@@ -1638,25 +1558,21 @@ out:
  * Both runqueues must be locked.
  */
 static inline
-void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
-	       runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
+void pull_task(runqueue_t *src_rq, task_t *p,
+		runqueue_t *this_rq, int this_cpu)
 {
-	dequeue_task(p, src_array);
+	dequeue_task(p, src_rq);
 	src_rq->nr_running--;
 	set_task_cpu(p, this_cpu);
 	this_rq->nr_running++;
-	enqueue_task(p, this_array);
-	if (!rt_task(p)) {
-		if (p->prio < this_array->min_prio)
-			this_array->min_prio = p->prio;
-	}
+	enqueue_task(p, this_rq);
 	p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
 				+ this_rq->timestamp_last_tick;
 	/*
 	 * Note that idle threads have a prio of MAX_PRIO, for this test
 	 * to be always true for them.
 	 */
-	if (TASK_PREEMPTS_CURR(p, this_rq))
+	if (task_preempts_curr(p, this_rq))
 		resched_task(this_rq->curr);
 }
 
@@ -1699,7 +1615,6 @@ static int move_tasks(runqueue_t *this_r
 		      unsigned long max_nr_move, struct sched_domain *sd,
 		      enum idle_type idle)
 {
-	prio_array_t *array, *dst_array;
 	struct list_head *head, *curr;
 	int idx, pulled = 0;
 	task_t *tmp;
@@ -1707,38 +1622,17 @@ static int move_tasks(runqueue_t *this_r
 	if (max_nr_move <= 0 || busiest->nr_running <= 1)
 		goto out;
 
-	/*
-	 * We first consider expired tasks. Those will likely not be
-	 * executed in the near future, and they are most likely to
-	 * be cache-cold, thus switching CPUs has the least effect
-	 * on them.
-	 */
-	if (busiest->expired->nr_active) {
-		array = busiest->expired;
-		dst_array = this_rq->expired;
-	} else {
-		array = busiest->active;
-		dst_array = this_rq->active;
-	}
-
-new_array:
 	/* Start searching at priority 0: */
 	idx = 0;
 skip_bitmap:
 	if (!idx)
-		idx = sched_find_first_bit(array->bitmap);
+		idx = sched_find_first_bit(busiest->bitmap);
 	else
-		idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
-	if (idx >= MAX_PRIO) {
-		if (array == busiest->expired && busiest->active->nr_active) {
-			array = busiest->active;
-			dst_array = this_rq->active;
-			goto new_array;
-		}
+		idx = find_next_bit(busiest->bitmap, MAX_PRIO, idx);
+	if (idx >= MAX_PRIO)
 		goto out;
-	}
 
-	head = array->queue + idx;
+	head = busiest->queue + idx;
 	curr = head->prev;
 skip_queue:
 	tmp = list_entry(curr, task_t, run_list);
@@ -1760,7 +1654,7 @@ skip_queue:
 	schedstat_inc(this_rq, pt_gained[idle]);
 	schedstat_inc(busiest, pt_lost[idle]);
 
-	pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
+	pull_task(busiest, tmp, this_rq, this_cpu);
 	pulled++;
 
 	/* We only want to steal up to the prescribed number of tasks. */
@@ -1953,6 +1847,7 @@ static int load_balance(int this_cpu, ru
 	unsigned long imbalance;
 	int nr_moved;
 
+	spin_lock(&this_rq->lock);
 	schedstat_inc(sd, lb_cnt[idle]);
 
 	group = find_busiest_group(sd, this_cpu, &imbalance, idle);
@@ -1987,11 +1882,12 @@ static int load_balance(int this_cpu, ru
 		 * still unbalanced. nr_moved simply stays zero, so it is
 		 * correctly treated as an imbalance.
 		 */
-		double_rq_lock(this_rq, busiest);
+		double_lock_balance(this_rq, busiest);
 		nr_moved = move_tasks(this_rq, this_cpu, busiest,
 						imbalance, sd, idle);
-		double_rq_unlock(this_rq, busiest);
+		spin_unlock(&busiest->lock);
 	}
+	spin_unlock(&this_rq->lock);
 
 	if (!nr_moved) {
 		schedstat_inc(sd, lb_failed[idle]);
@@ -2025,6 +1921,8 @@ static int load_balance(int this_cpu, ru
 	return nr_moved;
 
 out_balanced:
+	spin_unlock(&this_rq->lock);
+
 	/* tune up the balancing interval */
 	if (sd->balance_interval < sd->max_interval)
 		sd->balance_interval *= 2;
@@ -2258,16 +2156,12 @@ EXPORT_PER_CPU_SYMBOL(kstat);
  */
 void scheduler_tick(int user_ticks, int sys_ticks)
 {
-	enum idle_type cpu_status;
 	int cpu = smp_processor_id();
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 	runqueue_t *rq = this_rq();
 	task_t *p = current;
-	int ts;
 
-#ifdef CONFIG_SMP
-	rq->timestamp_last_tick = clock_us();
-#endif
+	rq->timestamp_last_tick = sched_clock();
 
 	if (rcu_pending(cpu))
 		rcu_check_callbacks(cpu, user_ticks);
@@ -2281,7 +2175,6 @@ void scheduler_tick(int user_ticks, int 
 		sys_ticks = 0;
 	}
 
-	cpu_status = NOT_IDLE;
 	if (p == rq->idle) {
 		if (atomic_read(&rq->nr_iowait) > 0)
 			cpustat->iowait += sys_ticks;
@@ -2289,63 +2182,73 @@ void scheduler_tick(int user_ticks, int 
 			cpustat->idle += sys_ticks;
 		if (wake_priority_sleeper(rq))
 			goto out;
-		cpu_status = IDLE;
-		goto out;
+		rebalance_tick(cpu, rq, IDLE);
+		return;
 	}
 	if (TASK_NICE(p) > 0)
 		cpustat->nice += user_ticks;
 	else
 		cpustat->user += user_ticks;
 	cpustat->system += sys_ticks;
-
-	/* Task might have expired already, but not scheduled off yet */
-	if (unlikely(p->used_slice == -1))
-		goto out;
-
+	/*
+	 * SCHED_FIFO tasks never run out of timeslice.
+	 */
 	if (unlikely(p->policy == SCHED_FIFO))
 		goto out;
 
-	/* p was running during this tick. Update its time slice counter. */
-	p->used_slice++;
-	ts = task_timeslice(p, rq);
-	if (unlikely(p->used_slice >= ts)) {
-		p->used_slice = -1;
+	spin_lock(&rq->lock);
+	rq->cache_ticks++;
+	/*
+	 * Tasks lose burst each time they use up a full slice().
+	 */
+	if (!--p->slice) {
 		set_tsk_need_resched(p);
+		dequeue_task(p, rq);
+		dec_burst(p);
+		p->slice = slice(p);
+		p->prio = effective_prio(p);
+		p->time_slice = rr_interval(p);
+		enqueue_task(p, rq);
+		goto out_unlock;
 	}
-
+	/*
+	 * Tasks that run out of time_slice but still have slice left get
+	 * requeued with a lower priority && rr_interval time_slice.
+	 */
+	if (!--p->time_slice) {
+		set_tsk_need_resched(p);
+		dequeue_task(p, rq);
+		p->prio = effective_prio(p);
+		p->time_slice = rr_interval(p);
+		enqueue_task(p, rq);
+		goto out_unlock;
+	}
+	if (rq->preempted && rq->cache_ticks >= cache_delay)
+		set_tsk_need_resched(p);
+out_unlock:
+	spin_unlock(&rq->lock);
 out:
-	rebalance_tick(cpu, rq, cpu_status);
+	rebalance_tick(cpu, rq, NOT_IDLE);
 }
 
 #ifdef CONFIG_SCHED_SMT
-static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq)
 {
-	struct sched_domain *sd = this_rq->sd;
-	cpumask_t sibling_map;
 	int i;
+	struct sched_domain *sd = rq->sd;
+	cpumask_t sibling_map;
 
 	if (!(sd->flags & SD_SHARE_CPUPOWER))
 		return;
 
-	/*
-	 * Unlock the current runqueue because we have to lock in
-	 * CPU order to avoid deadlocks. Caller knows that we might
-	 * unlock. We keep IRQs disabled.
-	 */
-	spin_unlock(&this_rq->lock);
-
 	cpus_and(sibling_map, sd->span, cpu_online_map);
+	for_each_cpu_mask(i, sibling_map) {
+		runqueue_t *smt_rq;
 
-	for_each_cpu_mask(i, sibling_map)
-		spin_lock(&cpu_rq(i)->lock);
-	/*
-	 * We clear this CPU from the mask. This both simplifies the
-	 * inner loop and keps this_rq locked when we exit:
-	 */
-	cpu_clear(this_cpu, sibling_map);
+		if (i == cpu)
+			continue;
 
-	for_each_cpu_mask(i, sibling_map) {
-		runqueue_t *smt_rq = cpu_rq(i);
+		smt_rq = cpu_rq(i);
 
 		/*
 		 * If an SMT sibling task is sleeping due to priority
@@ -2354,53 +2257,27 @@ static inline void wake_sleeping_depende
 		if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
 			resched_task(smt_rq->idle);
 	}
-
-	for_each_cpu_mask(i, sibling_map)
-		spin_unlock(&cpu_rq(i)->lock);
-	/*
-	 * We exit with this_cpu's rq still held and IRQs
-	 * still disabled:
-	 */
 }
 
-static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p)
 {
-	struct sched_domain *sd = this_rq->sd;
+	struct sched_domain *sd = rq->sd;
 	cpumask_t sibling_map;
-	prio_array_t *array;
 	int ret = 0, i;
-	task_t *p;
 
 	if (!(sd->flags & SD_SHARE_CPUPOWER))
 		return 0;
 
-	/*
-	 * The same locking rules and details apply as for
-	 * wake_sleeping_dependent():
-	 */
-	spin_unlock(&this_rq->lock);
 	cpus_and(sibling_map, sd->span, cpu_online_map);
-	for_each_cpu_mask(i, sibling_map)
-		spin_lock(&cpu_rq(i)->lock);
-	cpu_clear(this_cpu, sibling_map);
-
-	/*
-	 * Establish next task to be run - it might have gone away because
-	 * we released the runqueue lock above:
-	 */
-	if (!this_rq->nr_running)
-		goto out_unlock;
-	array = this_rq->active;
-	if (!array->nr_active)
-		array = this_rq->expired;
-	BUG_ON(!array->nr_active);
+	for_each_cpu_mask(i, sibling_map) {
+		runqueue_t *smt_rq;
+		task_t *smt_curr;
 
-	p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
-		task_t, run_list);
+		if (i == cpu)
+			continue;
 
-	for_each_cpu_mask(i, sibling_map) {
-		runqueue_t *smt_rq = cpu_rq(i);
-		task_t *smt_curr = smt_rq->curr;
+		smt_rq = cpu_rq(i);
+		smt_curr = smt_rq->curr;
 
 		/*
 		 * If a user task with lower static priority than the
@@ -2410,9 +2287,8 @@ static inline int dependent_sleeper(int 
 		 * task from using an unfair proportion of the
 		 * physical cpu's resources. -ck
 		 */
-		if (((task_timeslice(smt_curr, smt_rq)
-					* (100 - sd->per_cpu_gain) / 100) >
-			task_timeslice(p, this_rq) || rt_task(smt_curr)) &&
+		if (((smt_curr->slice * (100 - sd->per_cpu_gain) / 100) >
+			slice(p) || rt_task(smt_curr)) &&
 			p->mm && smt_curr->mm && !rt_task(p))
 				ret = 1;
 
@@ -2421,23 +2297,20 @@ static inline int dependent_sleeper(int 
 		 * or wake it up if it has been put to sleep for priority
 		 * reasons.
 		 */
-		if ((((task_timeslice(p, this_rq) * (100-sd->per_cpu_gain)/100)
-			> task_timeslice(smt_curr, smt_rq) || rt_task(p)) &&
+		if ((((p->slice * (100 - sd->per_cpu_gain) / 100) >
+			slice(smt_curr) || rt_task(p)) &&
 			smt_curr->mm && p->mm && !rt_task(smt_curr)) ||
 			(smt_curr == smt_rq->idle && smt_rq->nr_running))
 				resched_task(smt_curr);
 	}
-out_unlock:
-	for_each_cpu_mask(i, sibling_map)
-		spin_unlock(&cpu_rq(i)->lock);
 	return ret;
 }
 #else
-static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq)
 {
 }
 
-static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p)
 {
 	return 0;
 }
@@ -2451,10 +2324,8 @@ asmlinkage void __sched schedule(void)
 	long *switch_count;
 	task_t *prev, *next;
 	runqueue_t *rq;
-	prio_array_t *array;
 	struct list_head *queue;
 	unsigned long long now;
-	unsigned long run_time;
 	int cpu, idx;
 
 	/*
@@ -2462,10 +2333,11 @@ asmlinkage void __sched schedule(void)
 	 * schedule() atomically, we ignore that path for now.
 	 * Otherwise, whine if we are scheduling when we should not be.
 	 */
-	if (unlikely(in_atomic()) &&
-			likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) {
-		printk(KERN_ERR "bad: scheduling while atomic!\n");
-		dump_stack();
+	if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) {
+		if (unlikely(in_atomic())) {
+			printk(KERN_ERR "bad: scheduling while atomic!\n");
+			dump_stack();
+		}
 	}
 
 need_resched:
@@ -2473,21 +2345,10 @@ need_resched:
 	prev = current;
 	rq = this_rq();
 
-	/*
-	 * The idle thread is not allowed to schedule!
-	 * Remove this check after it has been exercised a bit.
-	 */
-	if (unlikely(current == rq->idle) && current->state != TASK_RUNNING) {
-		printk(KERN_ERR "bad: scheduling from the idle thread!\n");
-		dump_stack();
-	}
-
 	release_kernel_lock(prev);
 	schedstat_inc(rq, sched_cnt);
-	now = clock_us();
-	run_time = now - prev->timestamp;
-	prev->timestamp = now;
-	add_task_time(prev, run_time, STIME_RUN);
+	now = sched_clock();
+	prev->runtime = now - prev->timestamp;
 
 	spin_lock_irq(&rq->lock);
 
@@ -2501,91 +2362,47 @@ need_resched:
 		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
 				unlikely(signal_pending(prev))))
 			prev->state = TASK_RUNNING;
-		else {
+		else
 			deactivate_task(prev, rq);
-			if (prev->state == TASK_UNINTERRUPTIBLE)
-				rq->nr_uninterruptible++;
-			goto no_check_expired;
-		}
 	}
 
-	if (unlikely(prev->used_slice == -1)) {
-		if (rt_task(prev)) {
-			if (prev->policy == SCHED_RR) {
-				dequeue_task(prev, prev->array);
-				enqueue_task(prev, rq->active);
-			}
-		} else {
-			dequeue_task(prev, prev->array);
-			prev->prio = task_priority(prev);
-			enqueue_task(prev, rq->expired);
-			if (prev->prio < rq->expired->min_prio)
-				rq->expired->min_prio = prev->prio;
-		}
-		prev->used_slice = 0;
-	}
-no_check_expired:
-
 	cpu = smp_processor_id();
 	if (unlikely(!rq->nr_running)) {
-go_idle:
-		rq->array_sequence++;
 		idle_balance(cpu, rq);
 		if (!rq->nr_running) {
 			next = rq->idle;
-			rq->arrays[0].min_prio = MAX_PRIO;
-			rq->arrays[1].min_prio = MAX_PRIO;
-
-			wake_sleeping_dependent(cpu, rq);
-			/*
-			 * wake_sleeping_dependent() might have released
-			 * the runqueue, so break out if we got new
-			 * tasks meanwhile:
-			 */
-			if (!rq->nr_running)
-				goto switch_tasks;
-		}
-	} else {
-		if (dependent_sleeper(cpu, rq)) {
-			schedstat_inc(rq, sched_goidle);
-			next = rq->idle;
 			goto switch_tasks;
 		}
-		/*
-		 * dependent_sleeper() releases and reacquires the runqueue
-		 * lock, hence go into the idle loop if the rq went
-		 * empty meanwhile:
-		 */
-		if (unlikely(!rq->nr_running))
-			goto go_idle;
 	}
+	schedstat_inc(rq, sched_noswitch);
 
-	array = rq->active;
-	if (unlikely(!array->nr_active)) {
-		/*
-		 * Switch the active and expired arrays.
-		 */
-		schedstat_inc(rq, sched_switch);
-		rq->array_sequence++;
-		rq->active = rq->expired;
-		rq->expired = array;
-		rq->expired->min_prio = MAX_PRIO;
-		array = rq->active;
-	} else
-		schedstat_inc(rq, sched_noswitch);
-
-	idx = sched_find_first_bit(array->bitmap);
-	queue = array->queue + idx;
+	idx = sched_find_first_bit(rq->bitmap);
+	queue = rq->queue + idx;
 	next = list_entry(queue->next, task_t, run_list);
 
+	if (dependent_sleeper(cpu, rq, next)) {
+		schedstat_inc(rq, sched_goidle);
+		next = rq->idle;
+	}
+
 switch_tasks:
 	prefetch(next);
 	clear_tsk_need_resched(prev);
 	rcu_qsctr_inc(task_cpu(prev));
 
+	prev->timestamp = now;
+	if (next->flags & PF_YIELDED) {
+		next->flags &= ~PF_YIELDED;
+		dequeue_task(next, rq);
+		next->prio = effective_prio(next);
+		enqueue_task_head(next, rq);
+	}
+
 	sched_info_switch(prev, next);
 	if (likely(prev != next)) {
 		next->timestamp = now;
+		rq->preempted = 0;
+		rq->cache_ticks = 0;
 		rq->nr_switches++;
 		rq->curr = next;
 		++*switch_count;
@@ -2847,9 +2664,8 @@ EXPORT_SYMBOL(sleep_on_timeout);
 void set_user_nice(task_t *p, long nice)
 {
 	unsigned long flags;
-	prio_array_t *array;
 	runqueue_t *rq;
-	int old_prio, new_prio, delta;
+	int queued, old_prio, new_prio, delta;
 
 	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
 		return;
@@ -2868,9 +2684,8 @@ void set_user_nice(task_t *p, long nice)
 		p->static_prio = NICE_TO_PRIO(nice);
 		goto out_unlock;
 	}
-	array = p->array;
-	if (array)
-		dequeue_task(p, array);
+	if ((queued = task_queued(p)))
+		dequeue_task(p, rq);
 
 	old_prio = p->prio;
 	new_prio = NICE_TO_PRIO(nice);
@@ -2878,8 +2693,8 @@ void set_user_nice(task_t *p, long nice)
 	p->static_prio = NICE_TO_PRIO(nice);
 	p->prio += delta;
 
-	if (array) {
-		enqueue_task(p, array);
+	if (queued) {
+		enqueue_task(p, rq);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -2991,7 +2806,7 @@ static inline task_t *find_process_by_pi
 /* Actually do priority change: must hold rq lock. */
 static void __setscheduler(struct task_struct *p, int policy, int prio)
 {
-	BUG_ON(p->array);
+	BUG_ON(task_queued(p));
 	p->policy = policy;
 	p->rt_priority = prio;
 	if (policy != SCHED_NORMAL)
@@ -3007,8 +2822,7 @@ static int setscheduler(pid_t pid, int p
 {
 	struct sched_param lp;
 	int retval = -EINVAL;
-	int oldprio;
-	prio_array_t *array;
+	int queued, oldprio;
 	unsigned long flags;
 	runqueue_t *rq;
 	task_t *p;
@@ -3069,14 +2883,13 @@ static int setscheduler(pid_t pid, int p
 	if (retval)
 		goto out_unlock;
 
-	array = p->array;
-	if (array)
-		deactivate_task(p, rq);
+	if ((queued = task_queued(p)))
+		deactivate_task(p, task_rq(p));
 	retval = 0;
 	oldprio = p->prio;
 	__setscheduler(p, policy, lp.sched_priority);
-	if (array) {
-		__activate_task(p, rq, array);
+	if (queued) {
+		__activate_task(p, task_rq(p));
 		/*
 		 * Reschedule if we are currently running on this runqueue and
 		 * our priority decreased, or if we are not currently running on
@@ -3085,7 +2898,7 @@ static int setscheduler(pid_t pid, int p
 		if (task_running(rq, p)) {
 			if (p->prio > oldprio)
 				resched_task(rq->curr);
-		} else if (TASK_PREEMPTS_CURR(p, rq))
+		} else if (task_preempts_curr(p, rq))
 			resched_task(rq->curr);
 	}
 
@@ -3294,38 +3107,29 @@ out_unlock:
 
 /**
  * sys_sched_yield - yield the current processor to other threads.
- *
- * this function yields the current CPU by moving the calling thread
- * to the expired array. If there are no other threads running on this
- * CPU then this function will return.
  */
 asmlinkage long sys_sched_yield(void)
 {
-#ifdef CONFIG_SCHEDSTATS
-	runqueue_t *rq;
-#endif
-
-	local_irq_disable();
-#ifdef CONFIG_SCHEDSTATS
-	rq = this_rq();
+	runqueue_t *rq = this_rq_lock();
 
 	schedstat_inc(rq, yld_cnt);
-	spin_lock(&rq->lock);
-	if (current->array->nr_active == 1) {
-		schedstat_inc(rq, yld_act_empty);
-		if (!rq->expired->nr_active)
-			schedstat_inc(rq, yld_both_empty);
-	} else if (!rq->expired->nr_active)
-		schedstat_inc(rq, yld_exp_empty);
+
+	dequeue_task(current, rq);
+	current->slice = slice(current);
+	current->time_slice = RR_INTERVAL();
+	if (likely(!rt_task(current))) {
+		current->flags |= PF_YIELDED;
+		current->prio = MAX_PRIO - 1;
+	}
+	current->burst = 0;
+	enqueue_task(current, rq);
+
 	/*
 	 * Since we are going to call schedule() anyway, there's
 	 * no need to preempt or enable interrupts:
 	 */
 	_raw_spin_unlock(&rq->lock);
 	preempt_enable_no_resched();
-#endif
-	current->used_slice = -1;
-	local_irq_enable();
 
 	schedule();
 
@@ -3442,8 +3246,6 @@ long sys_sched_rr_get_interval(pid_t pid
 	int retval = -EINVAL;
 	struct timespec t;
 	task_t *p;
-	unsigned long flags;
-	runqueue_t *rq;
 
 	if (pid < 0)
 		goto out_nounlock;
@@ -3458,9 +3260,8 @@ long sys_sched_rr_get_interval(pid_t pid
 	if (retval)
 		goto out_unlock;
 
-	rq = task_rq_lock(p, &flags);
-	jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : task_timeslice(p, rq), &t);
-	task_rq_unlock(rq, &flags);
+	jiffies_to_timespec(p->policy & SCHED_FIFO ?
+				0 : slice(p), &t);
 	read_unlock(&tasklist_lock);
 	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
 out_nounlock:
@@ -3573,10 +3374,9 @@ void __devinit init_idle(task_t *idle, i
 	runqueue_t *rq = cpu_rq(cpu);
 	unsigned long flags;
 
-	idle->array = NULL;
 	idle->prio = MAX_PRIO;
 	idle->state = TASK_RUNNING;
-	idle->used_slice = 0;
+	idle->burst = 0;
 	set_task_cpu(idle, cpu);
 
 	spin_lock_irqsave(&rq->lock, flags);
@@ -3690,7 +3490,7 @@ static void __migrate_task(struct task_s
 		goto out;
 
 	set_task_cpu(p, dest_cpu);
-	if (p->array) {
+	if (task_queued(p)) {
 		/*
 		 * Sync timestamp with rq_dest's before activating.
 		 * The same thing could be achieved by doing this step
@@ -3701,7 +3501,7 @@ static void __migrate_task(struct task_s
 				+ rq_dest->timestamp_last_tick;
 		deactivate_task(p, rq_src);
 		activate_task(p, rq_dest, 0);
-		if (TASK_PREEMPTS_CURR(p, rq_dest))
+		if (task_preempts_curr(p, rq_dest))
 			resched_task(rq_dest->curr);
 	}
 
@@ -4418,7 +4218,7 @@ int in_sched_functions(unsigned long add
 void __init sched_init(void)
 {
 	runqueue_t *rq;
-	int i, j, k;
+	int i, j;
 
 #ifdef CONFIG_SMP
 	/* Set up an initial dummy domain for early boot */
@@ -4436,15 +4236,16 @@ void __init sched_init(void)
 	sched_group_init.cpumask = CPU_MASK_ALL;
 	sched_group_init.next = &sched_group_init;
 	sched_group_init.cpu_power = SCHED_LOAD_SCALE;
+
+	cache_delay = cache_decay_ticks * 5;
 #endif
 
 	for (i = 0; i < NR_CPUS; i++) {
-		prio_array_t *array;
-
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
-		rq->active = rq->arrays;
-		rq->expired = rq->arrays + 1;
+
+		rq->cache_ticks = 0;
+		rq->preempted = 0;
 
 #ifdef CONFIG_SMP
 		rq->sd = &sched_domain_init;
@@ -4455,17 +4256,13 @@ void __init sched_init(void)
 		INIT_LIST_HEAD(&rq->migration_queue);
 #endif
 		atomic_set(&rq->nr_iowait, 0);
-
-		for (j = 0; j < 2; j++) {
-			array = rq->arrays + j;
-			array->min_prio = MAX_PRIO;
-			for (k = 0; k < MAX_PRIO; k++) {
-				INIT_LIST_HEAD(array->queue + k);
-				__clear_bit(k, array->bitmap);
-			}
-			/* delimiter for bitsearch */
-			__set_bit(MAX_PRIO, array->bitmap);
-		}
+		for (j = 0; j <= MAX_PRIO; j++)
+			INIT_LIST_HEAD(&rq->queue[j]);
+		memset(rq->bitmap, 0, BITS_TO_LONGS(MAX_PRIO+1)*sizeof(long));
+		/*
+		 * delimiter for bitsearch
+		 */
+		__set_bit(MAX_PRIO, rq->bitmap);
 	}
 
 	/*
Index: linux-2.6.9-rc1-mm1/kernel/sysctl.c
===================================================================
--- linux-2.6.9-rc1-mm1.orig/kernel/sysctl.c	2004-08-26 22:27:38.000000000 +1000
+++ linux-2.6.9-rc1-mm1/kernel/sysctl.c	2004-08-26 22:46:26.000000000 +1000
@@ -65,9 +65,6 @@ extern int sysctl_lower_zone_protection;
 extern int min_free_kbytes;
 extern int printk_ratelimit_jiffies;
 extern int printk_ratelimit_burst;
-extern int sched_base_timeslice;
-extern int sched_min_base;
-extern int sched_max_base;
 
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(__i386__)
 int unknown_nmi_panic;
@@ -623,17 +620,21 @@ static ctl_table kern_table[] = {
 	},
 #endif
 	{
-		.ctl_name	= KERN_SCHED_TIMESLICE,
-		.procname	= "base_timeslice",
-		.data		= &sched_base_timeslice,
-		.maxlen		= sizeof (sched_base_timeslice),
+		.ctl_name	= KERN_INTERACTIVE,
+		.procname	= "interactive",
+		.data		= &sched_interactive,
+		.maxlen		= sizeof (int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &sched_min_base,
-		.extra2		= &sched_max_base,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_COMPUTE,
+		.procname	= "compute",
+		.data		= &sched_compute,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
 	},
-
 	{ .ctl_name = 0 }
 };
 
@@ -925,7 +926,6 @@ static ctl_table fs_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
-
 	{ .ctl_name = 0 }
 };
 
Index: linux-2.6.9-rc1-mm1/mm/oom_kill.c
===================================================================
--- linux-2.6.9-rc1-mm1.orig/mm/oom_kill.c	2004-08-26 22:27:38.000000000 +1000
+++ linux-2.6.9-rc1-mm1/mm/oom_kill.c	2004-08-26 22:31:29.000000000 +1000
@@ -144,10 +144,11 @@ static void __oom_kill_task(task_t *p)
 	printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", p->pid, p->comm);
 
 	/*
-	 * We give our sacrificial lamb access to all the memory it needs.
-	 * That way it should be able to exit() and clear out its resources
-	 * quickly...
+	 * We give our sacrificial lamb high priority and access to
+	 * all the memory it needs. That way it should be able to
+	 * exit() and clear out its resources quickly...
 	 */
+	p->time_slice = HZ;
 	p->flags |= PF_MEMALLOC | PF_MEMDIE;
 
 	/* This process has hardware access, be more careful. */
More recent patches modify files in 296rc1mm1-nosmtfix-nonicksched-s8.1.
