Implement the "staircase" hybrid foreground-background single priority
array cpu scheduler policy.

Signed-off-by: Con Kolivas <kernel@kolivas.org>
---
 fs/proc/array.c       |    4 
 include/linux/sched.h |   20 
 kernel/exit.c         |    1 
 kernel/sched.c        | 1107 ++++++++++++++++++--------------------------------
 4 files changed, 418 insertions(+), 714 deletions(-)

Index: linux-2.6.21-rc7-ck1/fs/proc/array.c
===================================================================
--- linux-2.6.21-rc7-ck1.orig/fs/proc/array.c	2007-04-16 20:35:14.000000000 +1000
+++ linux-2.6.21-rc7-ck1/fs/proc/array.c	2007-04-16 20:35:18.000000000 +1000
@@ -165,7 +165,7 @@ static inline char * task_state(struct t
 	rcu_read_lock();
 	buffer += sprintf(buffer,
 		"State:\t%s\n"
-		"SleepAVG:\t%lu%%\n"
+		"Bonus:\t%d\n"
 		"Tgid:\t%d\n"
 		"Pid:\t%d\n"
 		"PPid:\t%d\n"
@@ -173,7 +173,7 @@ static inline char * task_state(struct t
 		"Uid:\t%d\t%d\t%d\t%d\n"
 		"Gid:\t%d\t%d\t%d\t%d\n",
 		get_task_state(p),
-		(p->sleep_avg/1024)*100/(1020000000/1024),
+		p->bonus,
 	       	p->tgid, p->pid,
 	       	pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
 		pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0,
Index: linux-2.6.21-rc7-ck1/kernel/exit.c
===================================================================
--- linux-2.6.21-rc7-ck1.orig/kernel/exit.c	2007-04-16 20:35:15.000000000 +1000
+++ linux-2.6.21-rc7-ck1/kernel/exit.c	2007-04-16 20:35:18.000000000 +1000
@@ -170,7 +170,6 @@ repeat:
 		zap_leader = (leader->exit_signal == -1);
 	}
 
-	sched_exit(p);
 	write_unlock_irq(&tasklist_lock);
 	proc_flush_task(p);
 	release_thread(p);
Index: linux-2.6.21-rc7-ck1/include/linux/sched.h
===================================================================
--- linux-2.6.21-rc7-ck1.orig/include/linux/sched.h	2007-04-16 20:35:15.000000000 +1000
+++ linux-2.6.21-rc7-ck1/include/linux/sched.h	2007-04-16 20:35:18.000000000 +1000
@@ -524,6 +524,7 @@ struct signal_struct {
 #define MAX_RT_PRIO		MAX_USER_RT_PRIO
 
 #define MAX_PRIO		(MAX_RT_PRIO + 40)
+#define MIN_USER_PRIO		(MAX_PRIO - 1)
 
 #define rt_prio(prio)		unlikely((prio) < MAX_RT_PRIO)
 #define rt_task(p)		rt_prio((p)->prio)
@@ -788,15 +789,6 @@ struct mempolicy;
 struct pipe_inode_info;
 struct uts_namespace;
 
-enum sleep_type {
-	SLEEP_NORMAL,
-	SLEEP_NONINTERACTIVE,
-	SLEEP_INTERACTIVE,
-	SLEEP_INTERRUPTED,
-};
-
-struct prio_array;
-
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	struct thread_info *thread_info;
@@ -814,20 +806,19 @@ struct task_struct {
 	int load_weight;	/* for niceness load balancing purposes */
 	int prio, static_prio, normal_prio;
 	struct list_head run_list;
-	struct prio_array *array;
 
 	unsigned short ioprio;
 #ifdef CONFIG_BLK_DEV_IO_TRACE
 	unsigned int btrace_seq;
 #endif
-	unsigned long sleep_avg;
 	unsigned long long timestamp, last_ran;
+	unsigned long runtime, totalrun, systime;
+	unsigned int bonus;
+	unsigned int slice, time_slice;
 	unsigned long long sched_time; /* sched_clock time spent running */
-	enum sleep_type sleep_type;
 
 	unsigned long policy;
 	cpumask_t cpus_allowed;
-	unsigned int time_slice, first_time_slice;
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	struct sched_info sched_info;
@@ -1158,6 +1149,8 @@ static inline void put_task_struct(struc
 #define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset */
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
+#define PF_NONSLEEP	0x40000000	/* Waiting on in kernel activity */
+#define PF_FORKED	0x80000000	/* Task just forked another process */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
@@ -1292,7 +1285,6 @@ extern void FASTCALL(wake_up_new_task(st
  static inline void kick_process(struct task_struct *tsk) { }
 #endif
 extern void FASTCALL(sched_fork(struct task_struct * p, int clone_flags));
-extern void FASTCALL(sched_exit(struct task_struct * p));
 
 extern int in_group_p(gid_t);
 extern int in_egroup_p(gid_t);
Index: linux-2.6.21-rc7-ck1/kernel/sched.c
===================================================================
--- linux-2.6.21-rc7-ck1.orig/kernel/sched.c	2007-04-16 20:35:15.000000000 +1000
+++ linux-2.6.21-rc7-ck1/kernel/sched.c	2007-04-16 20:35:18.000000000 +1000
@@ -16,6 +16,10 @@
  *		by Davide Libenzi, preemptible kernel bits by Robert Love.
  *  2003-09-03	Interactivity tuning by Con Kolivas.
  *  2004-04-02	Scheduler domains code by Nick Piggin
+ *  2007-04-16	Staircase scheduling policy by Con Kolivas with help
+ *		from William Lee Irwin III, Zwane Mwaikambo, Peter Williams
+ *		and Andreas Mohr.
+ *		Staircase v17.1
  */
 
 #include <linux/mm.h>
@@ -87,123 +91,19 @@ unsigned long long __attribute__((weak))
 /*
  * Some helpers for converting nanosecond timing to jiffy resolution
  */
-#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
-#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
-
-/*
- * These are the 'tuning knobs' of the scheduler:
- *
- * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
- * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
- * Timeslices get refilled after they expire.
- */
-#define MIN_TIMESLICE		max(5 * HZ / 1000, 1)
-#define DEF_TIMESLICE		(100 * HZ / 1000)
-#define ON_RUNQUEUE_WEIGHT	 30
-#define CHILD_PENALTY		 95
-#define PARENT_PENALTY		100
-#define EXIT_WEIGHT		  3
-#define PRIO_BONUS_RATIO	 25
-#define MAX_BONUS		(MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
-#define INTERACTIVE_DELTA	  2
-#define MAX_SLEEP_AVG		(DEF_TIMESLICE * MAX_BONUS)
-#define STARVATION_LIMIT	(MAX_SLEEP_AVG)
-#define NS_MAX_SLEEP_AVG	(JIFFIES_TO_NS(MAX_SLEEP_AVG))
-
-/*
- * If a task is 'interactive' then we reinsert it in the active
- * array after it has expired its current timeslice. (it will not
- * continue to run immediately, it will still roundrobin with
- * other interactive tasks.)
- *
- * This part scales the interactivity limit depending on niceness.
- *
- * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
- * Here are a few examples of different nice levels:
- *
- *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
- *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
- *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
- *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
- *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
- *
- * (the X axis represents the possible -5 ... 0 ... +5 dynamic
- *  priority range a task can explore, a value of '1' means the
- *  task is rated interactive.)
- *
- * Ie. nice +19 tasks can never get 'interactive' enough to be
- * reinserted into the active array. And only heavily CPU-hog nice -20
- * tasks will be expired. Default nice 0 tasks are somewhere between,
- * it takes some effort for them to get interactive, but it's not
- * too hard.
- */
-
-#define CURRENT_BONUS(p) \
-	(NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
-		MAX_SLEEP_AVG)
-
-#define GRANULARITY	(10 * HZ / 1000 ? : 1)
-
-#ifdef CONFIG_SMP
-#define TIMESLICE_GRANULARITY(p)	(GRANULARITY * \
-		(1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
-			num_online_cpus())
-#else
-#define TIMESLICE_GRANULARITY(p)	(GRANULARITY * \
-		(1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
-#endif
-
-#define SCALE(v1,v1_max,v2_max) \
-	(v1) * (v2_max) / (v1_max)
-
-#define DELTA(p) \
-	(SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
-		INTERACTIVE_DELTA)
-
-#define TASK_INTERACTIVE(p) \
-	((p)->prio <= (p)->static_prio - DELTA(p))
-
-#define INTERACTIVE_SLEEP(p) \
-	(JIFFIES_TO_NS(MAX_SLEEP_AVG * \
-		(MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
+#define NSJIFFY			(1000000000 / HZ)	/* One jiffy in ns */
+#define NS_TO_JIFFIES(TIME)	((TIME) / NSJIFFY)
+#define JIFFIES_TO_NS(TIME)	((TIME) * NSJIFFY)
 
 #define TASK_PREEMPTS_CURR(p, rq) \
 	((p)->prio < (rq)->curr->prio)
 
-#define SCALE_PRIO(x, prio) \
-	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
-
-static unsigned int static_prio_timeslice(int static_prio)
-{
-	if (static_prio < NICE_TO_PRIO(0))
-		return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
-	else
-		return SCALE_PRIO(DEF_TIMESLICE, static_prio);
-}
-
 /*
- * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
- * to time slice values: [800ms ... 100ms ... 5ms]
- *
- * The higher a thread's priority, the bigger timeslices
- * it gets during one round of execution. But even the lowest
- * priority thread gets MIN_TIMESLICE worth of execution time.
+ * This is the time all tasks within the same priority round robin.
+ * Set to a minimum of 6ms.
  */
-
-static inline unsigned int task_timeslice(struct task_struct *p)
-{
-	return static_prio_timeslice(p->static_prio);
-}
-
-/*
- * These are the runqueue data structures:
- */
-
-struct prio_array {
-	unsigned int nr_active;
-	DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
-	struct list_head queue[MAX_PRIO];
-};
+#define RR_INTERVAL		((6 * HZ / 1001) + 1)
+#define DEF_TIMESLICE		(RR_INTERVAL * 19)
 
 /*
  * This is the main, per-CPU runqueue data structure.
@@ -234,14 +134,13 @@ struct rq {
 	 */
 	unsigned long nr_uninterruptible;
 
-	unsigned long expired_timestamp;
 	/* Cached timestamp set by update_cpu_clock() */
 	unsigned long long most_recent_timestamp;
 	struct task_struct *curr, *idle;
 	unsigned long next_balance;
 	struct mm_struct *prev_mm;
-	struct prio_array *active, *expired, arrays[2];
-	int best_expired_prio;
+	unsigned long bitmap[BITS_TO_LONGS(MAX_PRIO + 1)];
+	struct list_head queue[MAX_PRIO];
 	atomic_t nr_iowait;
 
 #ifdef CONFIG_SMP
@@ -578,13 +477,7 @@ static inline struct rq *this_rq_lock(vo
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 /*
- * Called when a process is dequeued from the active array and given
- * the cpu.  We should note that with the exception of interactive
- * tasks, the expired queue will become the active queue after the active
- * queue is empty, without explicitly dequeuing and requeuing tasks in the
- * expired queue.  (Interactive tasks may be requeued directly to the
- * active queue, thus delaying tasks in the expired queue from running;
- * see scheduler_tick()).
+ * Called when a process is dequeued and given the cpu.
  *
  * This function is only called from sched_info_arrive(), rather than
  * dequeue_task(). Even though a task may be queued and dequeued multiple
@@ -617,13 +510,11 @@ static void sched_info_arrive(struct tas
 }
 
 /*
- * Called when a process is queued into either the active or expired
- * array.  The time is noted and later used to determine how long we
- * had to wait for us to reach the cpu.  Since the expired queue will
- * become the active queue after active queue is empty, without dequeuing
- * and requeuing any tasks, we are interested in queuing to either. It
- * is unusual but not impossible for tasks to be dequeued and immediately
- * requeued in the same or another array: this can happen in sched_yield(),
+ * Called when a process is queued.
+ * The time is noted and later used to determine how long we had to wait for
+ * us to reach the cpu.
+ * It is unusual but not impossible for tasks to be dequeued and immediately
+ * requeued: this can happen in sched_yield(),
  * set_user_nice(), and even load_balance() as it moves tasks from runqueue
  * to runqueue.
  *
@@ -682,73 +573,75 @@ sched_info_switch(struct task_struct *pr
 #define sched_info_switch(t, next)	do { } while (0)
 #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
 
-/*
- * Adding/removing a task to/from a priority array:
- */
-static void dequeue_task(struct task_struct *p, struct prio_array *array)
+#if BITS_PER_LONG < 64
+static inline void longlimit(unsigned long long *longlong)
+{
+	if (*longlong > (1 << 31))
+		*longlong = 1 << 31;
+}
+#else
+static inline void longlimit(unsigned long long *__unused)
+{
+}
+#endif
+
+/* Get nanosecond clock difference without overflowing unsigned long. */
+static unsigned long ns_diff(unsigned long long v1, unsigned long long v2)
 {
-	array->nr_active--;
-	list_del(&p->run_list);
-	if (list_empty(array->queue + p->prio))
-		__clear_bit(p->prio, array->bitmap);
+	unsigned long long vdiff = 0;
+
+	/* Rarely the clock appears to go backwards. Ignore that */
+	if (likely(v1 > v2)) {
+		vdiff = v1 - v2;
+		longlimit(&vdiff);
+	}
+	return (unsigned long)vdiff;
 }
 
-static void enqueue_task(struct task_struct *p, struct prio_array *array)
+static int task_queued(struct task_struct *task)
 {
-	sched_info_queued(p);
-	list_add_tail(&p->run_list, array->queue + p->prio);
-	__set_bit(p->prio, array->bitmap);
-	array->nr_active++;
-	p->array = array;
+	return !list_empty(&task->run_list);
 }
 
 /*
- * Put task to the end of the run list without the overhead of dequeue
- * followed by enqueue.
+ * Adding/removing a task to/from a runqueue:
  */
-static void requeue_task(struct task_struct *p, struct prio_array *array)
+static void dequeue_task(struct task_struct *p, struct rq *rq)
 {
-	list_move_tail(&p->run_list, array->queue + p->prio);
+	list_del_init(&p->run_list);
+	if (list_empty(rq->queue + p->prio))
+		__clear_bit(p->prio, rq->bitmap);
 }
 
-static inline void
-enqueue_task_head(struct task_struct *p, struct prio_array *array)
+static void enqueue_task(struct task_struct *p, struct rq *rq)
 {
-	list_add(&p->run_list, array->queue + p->prio);
-	__set_bit(p->prio, array->bitmap);
-	array->nr_active++;
-	p->array = array;
+	list_add_tail(&p->run_list, rq->queue + p->prio);
+	__set_bit(p->prio, rq->bitmap);
 }
 
 /*
- * __normal_prio - return the priority that is based on the static
- * priority but is modified by bonuses/penalties.
- *
- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
- * into the -5 ... 0 ... +5 bonus/penalty range.
- *
- * We use 25% of the full 0...39 priority range so that:
- *
- * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
- *
- * Both properties are important to certain workloads.
+ * Put task to the end of the run list without the overhead of dequeue
+ * followed by enqueue.
  */
-
-static inline int __normal_prio(struct task_struct *p)
+static void requeue_task(struct task_struct *p, struct rq *rq, int prio)
 {
-	int bonus, prio;
-
-	bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
+	list_move_tail(&p->run_list, rq->queue + prio);
+	if (p->prio != prio) {
+		if (list_empty(rq->queue + p->prio))
+			__clear_bit(p->prio, rq->bitmap);
+		p->prio = prio;
+		__set_bit(prio, rq->bitmap);
+	}
+}
 
-	prio = p->static_prio - bonus;
-	if (prio < MAX_RT_PRIO)
-		prio = MAX_RT_PRIO;
-	if (prio > MAX_PRIO-1)
-		prio = MAX_PRIO-1;
-	return prio;
+static void enqueue_task_head(struct task_struct *p, struct rq *rq)
+{
+	list_add(&p->run_list, rq->queue + p->prio);
+	__set_bit(p->prio, rq->bitmap);
 }
 
+static unsigned int slice(const struct task_struct *p);
+
 /*
  * To aid in avoiding the subversion of "niceness" due to uneven distribution
  * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -766,10 +659,9 @@ static inline int __normal_prio(struct t
 #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
 #define LOAD_WEIGHT(lp) \
 	(((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
-#define PRIO_TO_LOAD_WEIGHT(prio) \
-	LOAD_WEIGHT(static_prio_timeslice(prio))
-#define RTPRIO_TO_LOAD_WEIGHT(rp) \
-	(PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
+#define TASK_LOAD_WEIGHT(p)	LOAD_WEIGHT(slice(p))
+#define RTPRIO_TO_LOAD_WEIGHT(rp)	\
+	(LOAD_WEIGHT((RR_INTERVAL + 20 + (rp))))
 
 static void set_load_weight(struct task_struct *p)
 {
@@ -786,7 +678,7 @@ static void set_load_weight(struct task_
 #endif
 			p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
 	} else
-		p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
+		p->load_weight = TASK_LOAD_WEIGHT(p);
 }
 
 static inline void
@@ -814,13 +706,189 @@ static inline void dec_nr_running(struct
 }
 
 /*
+ * __activate_task - move a task to the runqueue.
+ */
+static void __activate_task(struct task_struct *p, struct rq *rq)
+{
+	enqueue_task(p, rq);
+	inc_nr_running(p, rq);
+}
+
+/*
+ * __activate_idle_task - move idle task to the _front_ of runqueue.
+ */
+static void __activate_idle_task(struct task_struct *p, struct rq *rq)
+{
+	enqueue_task_head(p, rq);
+	inc_nr_running(p, rq);
+}
+
+/*
+ * Bonus - How much higher than its base priority an interactive task can run.
+ */
+static unsigned int bonus(const struct task_struct *p)
+{
+	return TASK_USER_PRIO(p);
+}
+
+static unsigned int rr_interval(const struct task_struct *p)
+{
+	int nice = TASK_NICE(p);
+
+	if (nice < 0 && !rt_task(p))
+		return RR_INTERVAL * (20 - nice) / 20;
+	return RR_INTERVAL;
+}
+
+/*
+ * slice - the duration a task runs before getting requeued at its best
+ * priority and has its bonus decremented.
+ */
+static unsigned int slice(const struct task_struct *p)
+{
+	unsigned int slice, rr;
+
+	slice = rr = rr_interval(p);
+	if (likely(!rt_task(p)))
+		slice += (39 - TASK_USER_PRIO(p)) * rr;
+	return slice;
+}
+
+/*
+ * We increase our bonus by sleeping more than the time we ran.
+ * The ratio of sleep to run gives us the cpu% that we last ran and determines
+ * the maximum bonus we can acquire.
+ */
+static void inc_bonus(struct task_struct *p, unsigned long totalrun, unsigned long sleep)
+{
+	unsigned int best_bonus = sleep / (totalrun + 1);
+
+	if (p->bonus >= best_bonus)
+		return;
+	best_bonus = bonus(p);
+	if (p->bonus < best_bonus)
+		p->bonus++;
+}
+
+static void dec_bonus(struct task_struct *p)
+{
+	if (p->bonus)
+		p->bonus--;
+}
+
+static void slice_overrun(struct task_struct *p)
+{
+	unsigned long ns_slice = JIFFIES_TO_NS(p->slice);
+
+	do {
+		p->totalrun -= ns_slice;
+		dec_bonus(p);
+	} while (unlikely(p->totalrun > ns_slice));
+}
+
+static void continue_slice(struct task_struct *p)
+{
+	unsigned long total_run = NS_TO_JIFFIES(p->totalrun);
+
+	if (unlikely(total_run >= p->slice))
+		slice_overrun(p);
+	else {
+		unsigned long remainder;
+
+		p->slice -= total_run;
+		remainder = p->slice % rr_interval(p);
+		if (remainder)
+			p->time_slice = remainder;
+	}
+}
+
+/*
+ * recalc_task_prio - this checks for tasks that have run less than a full
+ * slice and have woken up again soon after, or have just forked a
+ * thread/process and make them continue their old slice instead of starting
+ * a new one at high priority.
+ */
+static void recalc_task_prio(struct task_struct *p, unsigned long long now)
+{
+	unsigned long sleep_time;
+
+	/*
+	 * If this task has managed to run to its lowest priority then
+	 * decrease its bonus and requeue it now at best priority instead
+	 * of possibly flagging around lowest priority. Save up any systime
+	 * that may affect priority on the next reschedule.
+	 */
+	if (p->slice > p->time_slice &&
+	    p->slice - NS_TO_JIFFIES(p->totalrun) < p->time_slice) {
+		dec_bonus(p);
+		p->totalrun = 0;
+		return;
+	}
+
+	/*
+	 * Add the total for this last scheduled run (p->runtime) and system
+	 * time (p->systime) done on behalf of p to the running total so far
+	 * used (p->totalrun).
+	 */
+	p->totalrun += p->runtime + p->systime;
+	sleep_time = ns_diff(now, p->timestamp);
+
+	if (p->systime > sleep_time || p->flags & PF_FORKED)
+		sleep_time = 0;
+	else {
+		sleep_time -= p->systime;
+		/*
+		 * We elevate priority by the amount of time we slept. If we
+		 * sleep longer than our running total and have not set the
+		 * PF_NONSLEEP flag we gain a bonus.
+		 */
+		if (sleep_time >= p->totalrun) {
+			if (!(p->flags & PF_NONSLEEP))
+				inc_bonus(p, p->totalrun, sleep_time);
+			p->totalrun = 0;
+			return;
+		}
+		p->totalrun -= sleep_time;
+	}
+	continue_slice(p);
+}
+
+/*
+ * __normal_prio - dynamic priority dependent on bonus.
+ * The priority normally decreases by one each RR_INTERVAL.
+ * As the bonus increases the initial priority starts at a higher "stair" or
+ * priority for longer.
+ */
+static int __normal_prio(struct task_struct *p)
+{
+	int prio;
+	unsigned int full_slice, used_slice = 0;
+	unsigned int best_bonus, rr;
+
+	full_slice = slice(p);
+	if (full_slice > p->slice)
+		used_slice = full_slice - p->slice;
+
+	best_bonus = bonus(p);
+	prio = MAX_RT_PRIO + best_bonus;
+	if (!batch_task(p))
+		prio -= p->bonus;
+
+	rr = rr_interval(p);
+	prio += used_slice / rr;
+	if (prio > MIN_USER_PRIO)
+		prio = MIN_USER_PRIO;
+	return prio;
+}
+
+/*
  * Calculate the expected normal priority: i.e. priority
  * without taking RT-inheritance into account. Might be
  * boosted by interactivity modifiers. Changes upon fork,
  * setprio syscalls, and whenever the interactivity
  * estimator recalculates.
  */
-static inline int normal_prio(struct task_struct *p)
+static int normal_prio(struct task_struct *p)
 {
 	int prio;
 
@@ -852,111 +920,14 @@ static int effective_prio(struct task_st
 }
 
 /*
- * __activate_task - move a task to the runqueue.
- */
-static void __activate_task(struct task_struct *p, struct rq *rq)
-{
-	struct prio_array *target = rq->active;
-
-	if (batch_task(p))
-		target = rq->expired;
-	enqueue_task(p, target);
-	inc_nr_running(p, rq);
-}
-
-/*
- * __activate_idle_task - move idle task to the _front_ of runqueue.
- */
-static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
-{
-	enqueue_task_head(p, rq->active);
-	inc_nr_running(p, rq);
-}
-
-/*
- * Recalculate p->normal_prio and p->prio after having slept,
- * updating the sleep-average too:
- */
-static int recalc_task_prio(struct task_struct *p, unsigned long long now)
-{
-	/* Caller must always ensure 'now >= p->timestamp' */
-	unsigned long sleep_time = now - p->timestamp;
-
-	if (batch_task(p))
-		sleep_time = 0;
-
-	if (likely(sleep_time > 0)) {
-		/*
-		 * This ceiling is set to the lowest priority that would allow
-		 * a task to be reinserted into the active array on timeslice
-		 * completion.
-		 */
-		unsigned long ceiling = INTERACTIVE_SLEEP(p);
-
-		if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
-			/*
-			 * Prevents user tasks from achieving best priority
-			 * with one single large enough sleep.
-			 */
-			p->sleep_avg = ceiling;
-			/*
-			 * Using INTERACTIVE_SLEEP() as a ceiling places a
-			 * nice(0) task 1ms sleep away from promotion, and
-			 * gives it 700ms to round-robin with no chance of
-			 * being demoted.  This is more than generous, so
-			 * mark this sleep as non-interactive to prevent the
-			 * on-runqueue bonus logic from intervening should
-			 * this task not receive cpu immediately.
-			 */
-			p->sleep_type = SLEEP_NONINTERACTIVE;
-		} else {
-			/*
-			 * Tasks waking from uninterruptible sleep are
-			 * limited in their sleep_avg rise as they
-			 * are likely to be waiting on I/O
-			 */
-			if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
-				if (p->sleep_avg >= ceiling)
-					sleep_time = 0;
-				else if (p->sleep_avg + sleep_time >=
-					 ceiling) {
-						p->sleep_avg = ceiling;
-						sleep_time = 0;
-				}
-			}
-
-			/*
-			 * This code gives a bonus to interactive tasks.
-			 *
-			 * The boost works by updating the 'average sleep time'
-			 * value here, based on ->timestamp. The more time a
-			 * task spends sleeping, the higher the average gets -
-			 * and the higher the priority boost gets as well.
-			 */
-			p->sleep_avg += sleep_time;
-
-		}
-		if (p->sleep_avg > NS_MAX_SLEEP_AVG)
-			p->sleep_avg = NS_MAX_SLEEP_AVG;
-	}
-
-	return effective_prio(p);
-}
-
-/*
  * activate_task - move a task to the runqueue and do priority recalculation
  *
- * Update all the scheduling statistics stuff. (sleep average
- * calculation, priority modifiers, etc.)
  */
 static void activate_task(struct task_struct *p, struct rq *rq, int local)
 {
-	unsigned long long now;
-
-	if (rt_task(p))
-		goto out;
+	unsigned long long now = sched_clock();
+	unsigned long rr = rr_interval(p);
 
-	now = sched_clock();
 #ifdef CONFIG_SMP
 	if (!local) {
 		/* Compensate for drifting sched_clock */
@@ -977,32 +948,15 @@ static void activate_task(struct task_st
 				     (now - p->timestamp) >> 20);
 	}
 
-	p->prio = recalc_task_prio(p, now);
-
-	/*
-	 * This checks to make sure it's not an uninterruptible task
-	 * that is now waking up.
-	 */
-	if (p->sleep_type == SLEEP_NORMAL) {
-		/*
-		 * Tasks which were woken up by interrupts (ie. hw events)
-		 * are most likely of interactive nature. So we give them
-		 * the credit of extending their sleep time to the period
-		 * of time they spend on the runqueue, waiting for execution
-		 * on a CPU, first time around:
-		 */
-		if (in_interrupt())
-			p->sleep_type = SLEEP_INTERRUPTED;
-		else {
-			/*
-			 * Normal first-time wakeups get a credit too for
-			 * on-runqueue time, but it will be weighted down:
-			 */
-			p->sleep_type = SLEEP_INTERACTIVE;
-		}
+	p->slice = slice(p);
+	p->time_slice = p->slice % rr ? : rr;
+	if (!rt_task(p)) {
+		recalc_task_prio(p, now);
+		p->prio = effective_prio(p);
+		p->systime = 0;
+		p->flags &= ~(PF_FORKED | PF_NONSLEEP);
 	}
 	p->timestamp = now;
-out:
 	__activate_task(p, rq);
 }
 
@@ -1012,8 +966,7 @@ out:
 static void deactivate_task(struct task_struct *p, struct rq *rq)
 {
 	dec_nr_running(p, rq);
-	dequeue_task(p, p->array);
-	p->array = NULL;
+	dequeue_task(p, rq);
 }
 
 /*
@@ -1095,7 +1048,7 @@ migrate_task(struct task_struct *p, int 
 	 * If the task is not on a runqueue (and not running), then
 	 * it is sufficient to simply update the task's cpu field.
 	 */
-	if (!p->array && !task_running(rq, p)) {
+	if (!task_queued(p) && !task_running(rq, p)) {
 		set_task_cpu(p, dest_cpu);
 		return 0;
 	}
@@ -1126,7 +1079,7 @@ void wait_task_inactive(struct task_stru
 repeat:
 	rq = task_rq_lock(p, &flags);
 	/* Must be off runqueue entirely, not preempted. */
-	if (unlikely(p->array || task_running(rq, p))) {
+	if (unlikely(task_queued(p) || task_running(rq, p))) {
 		/* If it's preempted, we yield.  It could be a while. */
 		preempted = !task_running(rq, p);
 		task_rq_unlock(rq, &flags);
@@ -1391,6 +1344,16 @@ static inline int wake_idle(int cpu, str
 }
 #endif
 
+/*
+ * Check to see if p preempts rq->curr and resched if it does.
+ */
+static void try_preempt(struct task_struct *p, struct rq *rq)
+{
+	if (TASK_PREEMPTS_CURR(p, rq))
+		resched_task(rq->curr);
+}
+
+
 /***
  * try_to_wake_up - wake up a thread
  * @p: the to-be-woken-up thread
@@ -1422,7 +1385,7 @@ static int try_to_wake_up(struct task_st
 	if (!(old_state & state))
 		goto out;
 
-	if (p->array)
+	if (task_queued(p))
 		goto out_running;
 
 	cpu = task_cpu(p);
@@ -1515,7 +1478,7 @@ out_set_cpu:
 		old_state = p->state;
 		if (!(old_state & state))
 			goto out;
-		if (p->array)
+		if (task_queued(p))
 			goto out_running;
 
 		this_cpu = smp_processor_id();
@@ -1524,26 +1487,10 @@ out_set_cpu:
 
 out_activate:
 #endif /* CONFIG_SMP */
-	if (old_state == TASK_UNINTERRUPTIBLE) {
+	if (old_state == TASK_UNINTERRUPTIBLE)
 		rq->nr_uninterruptible--;
-		/*
-		 * Tasks on involuntary sleep don't earn
-		 * sleep_avg beyond just interactive state.
-		 */
-		p->sleep_type = SLEEP_NONINTERACTIVE;
-	} else
 
 	/*
-	 * Tasks that have marked their sleep as noninteractive get
-	 * woken up with their sleep average not weighted in an
-	 * interactive way.
-	 */
-		if (old_state & TASK_NONINTERACTIVE)
-			p->sleep_type = SLEEP_NONINTERACTIVE;
-
-
-	activate_task(p, rq, cpu == this_cpu);
-	/*
 	 * Sync wakeups (i.e. those types of wakeups where the waker
 	 * has indicated that it will leave the CPU in short order)
 	 * don't trigger a preemption, if the woken up task will run on
@@ -1551,10 +1498,9 @@ out_activate:
 	 * the waker guarantees that the freshly woken up task is going
 	 * to be considered on this CPU.)
 	 */
-	if (!sync || cpu != this_cpu) {
-		if (TASK_PREEMPTS_CURR(p, rq))
-			resched_task(rq->curr);
-	}
+	activate_task(p, rq, cpu == this_cpu);
+	if (!sync || cpu != this_cpu)
+		try_preempt(p, rq);
 	success = 1;
 
 out_running:
@@ -1577,7 +1523,6 @@ int fastcall wake_up_state(struct task_s
 	return try_to_wake_up(p, state, 0);
 }
 
-static void task_running_tick(struct rq *rq, struct task_struct *p);
 /*
  * Perform scheduler related setup for a newly forked process p.
  * p is forked by current.
@@ -1605,7 +1550,6 @@ void fastcall sched_fork(struct task_str
 	p->prio = current->normal_prio;
 
 	INIT_LIST_HEAD(&p->run_list);
-	p->array = NULL;
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	if (unlikely(sched_info_on()))
 		memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -1617,30 +1561,6 @@ void fastcall sched_fork(struct task_str
 	/* Want to start with kernel preemption disabled. */
 	task_thread_info(p)->preempt_count = 1;
 #endif
-	/*
-	 * Share the timeslice between parent and child, thus the
-	 * total amount of pending timeslices in the system doesn't change,
-	 * resulting in more scheduling fairness.
-	 */
-	local_irq_disable();
-	p->time_slice = (current->time_slice + 1) >> 1;
-	/*
-	 * The remainder of the first timeslice might be recovered by
-	 * the parent if the child exits early enough.
-	 */
-	p->first_time_slice = 1;
-	current->time_slice >>= 1;
-	p->timestamp = sched_clock();
-	if (unlikely(!current->time_slice)) {
-		/*
-		 * This case is rare, it happens when the parent has only
-		 * a single jiffy left from its timeslice. Taking the
-		 * runqueue lock is not a problem.
-		 */
-		current->time_slice = 1;
-		task_running_tick(cpu_rq(cpu), current);
-	}
-	local_irq_enable();
 	put_cpu();
 }
 
@@ -1662,38 +1582,20 @@ void fastcall wake_up_new_task(struct ta
 	this_cpu = smp_processor_id();
 	cpu = task_cpu(p);
 
-	/*
-	 * We decrease the sleep average of forking parents
-	 * and children as well, to keep max-interactive tasks
-	 * from forking tasks that are max-interactive. The parent
-	 * (current) is done further down, under its lock.
-	 */
-	p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
-		CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
-
-	p->prio = effective_prio(p);
+	/* Forked process gets no bonus to prevent fork bombs. */
+	p->bonus = 0;
+	current->flags |= PF_FORKED;
 
 	if (likely(cpu == this_cpu)) {
+		activate_task(p, rq, 1);
 		if (!(clone_flags & CLONE_VM)) {
 			/*
 			 * The VM isn't cloned, so we're in a good position to
 			 * do child-runs-first in anticipation of an exec. This
 			 * usually avoids a lot of COW overhead.
 			 */
-			if (unlikely(!current->array))
-				__activate_task(p, rq);
-			else {
-				p->prio = current->prio;
-				p->normal_prio = current->normal_prio;
-				list_add_tail(&p->run_list, &current->run_list);
-				p->array = current->array;
-				p->array->nr_active++;
-				inc_nr_running(p, rq);
-			}
 			set_need_resched();
-		} else
-			/* Run child last */
-			__activate_task(p, rq);
+		}
 		/*
 		 * We skip the following code due to cpu == this_cpu
 	 	 *
@@ -1710,53 +1612,19 @@ void fastcall wake_up_new_task(struct ta
 		 */
 		p->timestamp = (p->timestamp - this_rq->most_recent_timestamp)
 					+ rq->most_recent_timestamp;
-		__activate_task(p, rq);
-		if (TASK_PREEMPTS_CURR(p, rq))
-			resched_task(rq->curr);
+		activate_task(p, rq, 0);
+		try_preempt(p, rq);
 
 		/*
 		 * Parent and child are on different CPUs, now get the
-		 * parent runqueue to update the parent's ->sleep_avg:
+		 * parent runqueue to update the parent's ->flags:
 		 */
 		task_rq_unlock(rq, &flags);
 		this_rq = task_rq_lock(current, &flags);
 	}
-	current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
-		PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
 	task_rq_unlock(this_rq, &flags);
 }
 
-/*
- * Potentially available exiting-child timeslices are
- * retrieved here - this way the parent does not get
- * penalized for creating too many threads.
- *
- * (this cannot be used to 'generate' timeslices
- * artificially, because any timeslice recovered here
- * was given away by the parent in the first place.)
- */
-void fastcall sched_exit(struct task_struct *p)
-{
-	unsigned long flags;
-	struct rq *rq;
-
-	/*
-	 * If the child was a (relative-) CPU hog then decrease
-	 * the sleep_avg of the parent as well.
-	 */
-	rq = task_rq_lock(p->parent, &flags);
-	if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
-		p->parent->time_slice += p->time_slice;
-		if (unlikely(p->parent->time_slice > task_timeslice(p)))
-			p->parent->time_slice = task_timeslice(p);
-	}
-	if (p->sleep_avg < p->parent->sleep_avg)
-		p->parent->sleep_avg = p->parent->sleep_avg /
-		(EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
-		(EXIT_WEIGHT + 1);
-	task_rq_unlock(rq, &flags);
-}
-
 /**
  * prepare_task_switch - prepare to switch tasks
  * @rq: the runqueue preparing to switch
@@ -2085,23 +1953,21 @@ void sched_exec(void)
  * pull_task - move a task from a remote runqueue to the local runqueue.
  * Both runqueues must be locked.
  */
-static void pull_task(struct rq *src_rq, struct prio_array *src_array,
-		      struct task_struct *p, struct rq *this_rq,
-		      struct prio_array *this_array, int this_cpu)
+static void pull_task(struct rq *src_rq, struct task_struct *p,
+		      struct rq *this_rq, int this_cpu)
 {
-	dequeue_task(p, src_array);
+	dequeue_task(p, src_rq);
 	dec_nr_running(p, src_rq);
 	set_task_cpu(p, this_cpu);
 	inc_nr_running(p, this_rq);
-	enqueue_task(p, this_array);
+	enqueue_task(p, this_rq);
 	p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
 				+ this_rq->most_recent_timestamp;
 	/*
 	 * Note that idle threads have a prio of MAX_PRIO, for this test
 	 * to be always true for them.
 	 */
-	if (TASK_PREEMPTS_CURR(p, this_rq))
-		resched_task(this_rq->curr);
+	try_preempt(p, this_rq);
 }
 
 /*
@@ -2144,8 +2010,6 @@ int can_migrate_task(struct task_struct 
 	return 1;
 }
 
-#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
-
 /*
  * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
  * load from busiest to this_rq, as part of a balancing operation within
@@ -2160,7 +2024,6 @@ static int move_tasks(struct rq *this_rq
 {
 	int idx, pulled = 0, pinned = 0, this_best_prio, best_prio,
 	    best_prio_seen, skip_for_load;
-	struct prio_array *array, *dst_array;
 	struct list_head *head, *curr;
 	struct task_struct *tmp;
 	long rem_load_move;
@@ -2170,8 +2033,8 @@ static int move_tasks(struct rq *this_rq
 
 	rem_load_move = max_load_move;
 	pinned = 1;
-	this_best_prio = rq_best_prio(this_rq);
-	best_prio = rq_best_prio(busiest);
+	this_best_prio = this_rq->curr->prio;
+	best_prio = busiest->curr->prio;
 	/*
 	 * Enable handling of the case where there is more than one task
 	 * with the best priority.   If the current running task is one
@@ -2181,38 +2044,17 @@ static int move_tasks(struct rq *this_rq
 	 */
 	best_prio_seen = best_prio == busiest->curr->prio;
 
-	/*
-	 * We first consider expired tasks. Those will likely not be
-	 * executed in the near future, and they are most likely to
-	 * be cache-cold, thus switching CPUs has the least effect
-	 * on them.
-	 */
-	if (busiest->expired->nr_active) {
-		array = busiest->expired;
-		dst_array = this_rq->expired;
-	} else {
-		array = busiest->active;
-		dst_array = this_rq->active;
-	}
-
-new_array:
 	/* Start searching at priority 0: */
 	idx = 0;
 skip_bitmap:
 	if (!idx)
-		idx = sched_find_first_bit(array->bitmap);
+		idx = sched_find_first_bit(busiest->bitmap);
 	else
-		idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
-	if (idx >= MAX_PRIO) {
-		if (array == busiest->expired && busiest->active->nr_active) {
-			array = busiest->active;
-			dst_array = this_rq->active;
-			goto new_array;
-		}
+		idx = find_next_bit(busiest->bitmap, MAX_PRIO, idx);
+	if (idx >= MAX_PRIO)
 		goto out;
-	}
 
-	head = array->queue + idx;
+	head = busiest->queue + idx;
 	curr = head->prev;
 skip_queue:
 	tmp = list_entry(curr, struct task_struct, run_list);
@@ -2237,7 +2079,7 @@ skip_queue:
 		goto skip_bitmap;
 	}
 
-	pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
+	pull_task(busiest, tmp, this_rq, this_cpu);
 	pulled++;
 	rem_load_move -= tmp->load_weight;
 
@@ -3013,10 +2855,35 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 /*
  * This is called on clock ticks and on context switches.
  * Bank in p->sched_time the ns elapsed since the last tick or switch.
+ * The value returned from sched_clock() occasionally gives bogus values so
+ * some sanity checking is required.
  */
-static inline void
-update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
+static void
+update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now,
+		 int tick)
 {
+	static const unsigned int min_diff = 1000;
+	cputime64_t time_diff = now - p->last_ran;
+
+	if (tick) {
+		/*
+		 * Called from scheduler_tick() there should be less than two
+		 * jiffies worth, and not negative/overflow.
+		 */
+		if (time_diff > JIFFIES_TO_NS(2) || time_diff < min_diff)
+			time_diff = NSJIFFY;
+	} else {
+		/*
+		 * Called from context_switch there should be less than one
+		 * jiffy worth, and not negative/overflowed. In the case when
+		 * sched_clock fails to return high resolution values this
+		 * also ensures at least 1 min_diff gets banked.
+		 */
+		if (time_diff > NSJIFFY || time_diff < min_diff)
+			time_diff = min_diff;
+	}
+	if (p != rq->idle && p->policy != SCHED_FIFO)
+		p->runtime += time_diff;
 	p->sched_time += now - p->last_ran;
 	p->last_ran = rq->most_recent_timestamp = now;
 }
@@ -3038,27 +2905,6 @@ unsigned long long current_sched_time(co
 }
 
 /*
- * We place interactive tasks back into the active array, if possible.
- *
- * To guarantee that this does not starve expired tasks we ignore the
- * interactivity of a task if the first expired task had to wait more
- * than a 'reasonable' amount of time. This deadline timeout is
- * load-dependent, as the frequency of array switched decreases with
- * increasing number of running tasks. We also ignore the interactivity
- * if a better static_prio task has expired:
- */
-static inline int expired_starving(struct rq *rq)
-{
-	if (rq->curr->static_prio > rq->best_expired_prio)
-		return 1;
-	if (!STARVATION_LIMIT || !rq->expired_timestamp)
-		return 0;
-	if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
-		return 1;
-	return 0;
-}
-
-/*
  * Account user cpu time to a process.
  * @p: the process that the cpu time gets accounted to
  * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3106,6 +2952,7 @@ void account_system_time(struct task_str
 		cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
 	else
 		cpustat->idle = cputime64_add(cpustat->idle, tmp);
+	p->systime += NSJIFFY;
 	/* Account for system time used */
 	acct_update_integrals(p);
 }
@@ -3131,77 +2978,38 @@ void account_steal_time(struct task_stru
 		cpustat->steal = cputime64_add(cpustat->steal, tmp);
 }
 
-static void task_running_tick(struct rq *rq, struct task_struct *p)
+static void time_slice_expired(struct task_struct *p, struct rq *rq)
+{
+	set_tsk_need_resched(p);
+	p->time_slice = rr_interval(p);
+	requeue_task(p, rq, effective_prio(p));
+}
+
+static void task_used_tick(struct rq *rq, struct task_struct *p)
 {
-	if (p->array != rq->active) {
-		/* Task has expired but was not scheduled yet */
+	if (unlikely(!task_queued(p))) {
+		/* Task has expired but was not scheduled off yet */
 		set_tsk_need_resched(p);
 		return;
 	}
 	spin_lock(&rq->lock);
+	p->runtime -= NSJIFFY;
 	/*
-	 * The task was running during this tick - update the
-	 * time slice counter. Note: we do not update a thread's
-	 * priority until it either goes to sleep or uses up its
-	 * timeslice. This makes it possible for interactive tasks
-	 * to use up their timeslices at their highest priority levels.
+	 * Tasks lose bonus each time they use up a full slice().
 	 */
-	if (rt_task(p)) {
-		/*
-		 * RR tasks need a special form of timeslice management.
-		 * FIFO tasks have no timeslices.
-		 */
-		if ((p->policy == SCHED_RR) && !--p->time_slice) {
-			p->time_slice = task_timeslice(p);
-			p->first_time_slice = 0;
-			set_tsk_need_resched(p);
-
-			/* put it at the end of the queue: */
-			requeue_task(p, rq->active);
-		}
+	if (!--p->slice) {
+		dec_bonus(p);
+		p->totalrun = 0;
+		p->slice = slice(p);
+		time_slice_expired(p, rq);
 		goto out_unlock;
 	}
-	if (!--p->time_slice) {
-		dequeue_task(p, rq->active);
-		set_tsk_need_resched(p);
-		p->prio = effective_prio(p);
-		p->time_slice = task_timeslice(p);
-		p->first_time_slice = 0;
-
-		if (!rq->expired_timestamp)
-			rq->expired_timestamp = jiffies;
-		if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
-			enqueue_task(p, rq->expired);
-			if (p->static_prio < rq->best_expired_prio)
-				rq->best_expired_prio = p->static_prio;
-		} else
-			enqueue_task(p, rq->active);
-	} else {
-		/*
-		 * Prevent a too long timeslice allowing a task to monopolize
-		 * the CPU. We do this by splitting up the timeslice into
-		 * smaller pieces.
-		 *
-		 * Note: this does not mean the task's timeslices expire or
-		 * get lost in any way, they just might be preempted by
-		 * another task of equal priority. (one with higher
-		 * priority would have preempted this task already.) We
-		 * requeue this task to the end of the list on this priority
-		 * level, which is in essence a round-robin of tasks with
-		 * equal priority.
-		 *
-		 * This only applies to tasks in the interactive
-		 * delta range with at least TIMESLICE_GRANULARITY to requeue.
-		 */
-		if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
-			p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
-			(p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
-			(p->array == rq->active)) {
-
-			requeue_task(p, rq->active);
-			set_tsk_need_resched(p);
-		}
-	}
+	/*
+	 * Tasks that run out of time_slice but still have slice left get
+	 * requeued with a lower priority && RR_INTERVAL time_slice.
+	 */
+	if (!--p->time_slice)
+		time_slice_expired(p, rq);
 out_unlock:
 	spin_unlock(&rq->lock);
 }
@@ -3209,9 +3017,6 @@ out_unlock:
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
- *
- * It also gets called by the fork code, when changing the parent's
- * timeslices.
  */
 void scheduler_tick(void)
 {
@@ -3220,10 +3025,10 @@ void scheduler_tick(void)
 	int cpu = smp_processor_id();
 	struct rq *rq = cpu_rq(cpu);
 
-	update_cpu_clock(p, rq, now);
+	update_cpu_clock(p, rq, now, 1);
+	if (p != rq->idle && p->policy != SCHED_FIFO && p->runtime >= NSJIFFY)
+		task_used_tick(rq, p);
 
-	if (p != rq->idle)
-		task_running_tick(rq, p);
 #ifdef CONFIG_SMP
 	update_load(rq);
 	if (time_after_eq(jiffies, rq->next_balance))
@@ -3269,25 +3074,17 @@ EXPORT_SYMBOL(sub_preempt_count);
 
 #endif
 
-static inline int interactive_sleep(enum sleep_type sleep_type)
-{
-	return (sleep_type == SLEEP_INTERACTIVE ||
-		sleep_type == SLEEP_INTERRUPTED);
-}
-
 /*
  * schedule() is the main scheduler function.
  */
 asmlinkage void __sched schedule(void)
 {
 	struct task_struct *prev, *next;
-	struct prio_array *array;
 	struct list_head *queue;
 	unsigned long long now;
-	unsigned long run_time;
-	int cpu, idx, new_prio;
 	long *switch_count;
 	struct rq *rq;
+	int cpu, idx;
 
 	/*
 	 * Test if we are atomic.  Since do_exit() needs to call into
@@ -3323,18 +3120,6 @@ need_resched_nonpreemptible:
 
 	schedstat_inc(rq, sched_cnt);
 	now = sched_clock();
-	if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
-		run_time = now - prev->timestamp;
-		if (unlikely((long long)(now - prev->timestamp) < 0))
-			run_time = 0;
-	} else
-		run_time = NS_MAX_SLEEP_AVG;
-
-	/*
-	 * Tasks charged proportionately less run_time at high sleep_avg to
-	 * delay them losing their interactive status
-	 */
-	run_time /= (CURRENT_BONUS(prev) ? : 1);
 
 	spin_lock_irq(&rq->lock);
 
@@ -3345,8 +3130,10 @@ need_resched_nonpreemptible:
 				unlikely(signal_pending(prev))))
 			prev->state = TASK_RUNNING;
 		else {
-			if (prev->state == TASK_UNINTERRUPTIBLE)
+			if (prev->state == TASK_UNINTERRUPTIBLE) {
+				prev->flags |= PF_NONSLEEP;
 				rq->nr_uninterruptible++;
+			}
 			deactivate_task(prev, rq);
 		}
 	}
@@ -3356,59 +3143,25 @@ need_resched_nonpreemptible:
 		idle_balance(cpu, rq);
 		if (!rq->nr_running) {
 			next = rq->idle;
-			rq->expired_timestamp = 0;
 			goto switch_tasks;
 		}
 	}
 
-	array = rq->active;
-	if (unlikely(!array->nr_active)) {
-		/*
-		 * Switch the active and expired arrays.
-		 */
-		schedstat_inc(rq, sched_switch);
-		rq->active = rq->expired;
-		rq->expired = array;
-		array = rq->active;
-		rq->expired_timestamp = 0;
-		rq->best_expired_prio = MAX_PRIO;
-	}
-
-	idx = sched_find_first_bit(array->bitmap);
-	queue = array->queue + idx;
+	idx = sched_find_first_bit(rq->bitmap);
+	queue = rq->queue + idx;
 	next = list_entry(queue->next, struct task_struct, run_list);
 
-	if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
-		unsigned long long delta = now - next->timestamp;
-		if (unlikely((long long)(now - next->timestamp) < 0))
-			delta = 0;
-
-		if (next->sleep_type == SLEEP_INTERACTIVE)
-			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
-
-		array = next->array;
-		new_prio = recalc_task_prio(next, next->timestamp + delta);
-
-		if (unlikely(next->prio != new_prio)) {
-			dequeue_task(next, array);
-			next->prio = new_prio;
-			enqueue_task(next, array);
-		}
-	}
-	next->sleep_type = SLEEP_NORMAL;
 switch_tasks:
 	if (next == rq->idle)
 		schedstat_inc(rq, sched_goidle);
-	prefetch(next);
-	prefetch_stack(next);
+	else {
+		prefetch(next);
+		prefetch_stack(next);
+	}
 	clear_tsk_need_resched(prev);
 	rcu_qsctr_inc(task_cpu(prev));
 
-	update_cpu_clock(prev, rq, now);
-
-	prev->sleep_avg -= run_time;
-	if ((long)prev->sleep_avg <= 0)
-		prev->sleep_avg = 0;
+	update_cpu_clock(prev, rq, now, 0);
 	prev->timestamp = prev->last_ran = now;
 
 	sched_info_switch(prev, next);
@@ -3844,29 +3597,22 @@ EXPORT_SYMBOL(sleep_on_timeout);
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-	struct prio_array *array;
 	unsigned long flags;
+	int queued, oldprio;
 	struct rq *rq;
-	int oldprio;
 
 	BUG_ON(prio < 0 || prio > MAX_PRIO);
 
 	rq = task_rq_lock(p, &flags);
 
 	oldprio = p->prio;
-	array = p->array;
-	if (array)
-		dequeue_task(p, array);
+	queued = task_queued(p);
+	if (queued)
+		dequeue_task(p, rq);
 	p->prio = prio;
 
-	if (array) {
-		/*
-		 * If changing to an RT priority then queue it
-		 * in the active array!
-		 */
-		if (rt_task(p))
-			array = rq->active;
-		enqueue_task(p, array);
+	if (queued) {
+		enqueue_task(p, rq);
 		/*
 		 * Reschedule if we are currently running on this runqueue and
 		 * our priority decreased, or if we are not currently running on
@@ -3875,8 +3621,8 @@ void rt_mutex_setprio(struct task_struct
 		if (task_running(rq, p)) {
 			if (p->prio > oldprio)
 				resched_task(rq->curr);
-		} else if (TASK_PREEMPTS_CURR(p, rq))
-			resched_task(rq->curr);
+		} else
+			try_preempt(p, rq);
 	}
 	task_rq_unlock(rq, &flags);
 }
@@ -3885,8 +3631,7 @@ void rt_mutex_setprio(struct task_struct
 
 void set_user_nice(struct task_struct *p, long nice)
 {
-	struct prio_array *array;
-	int old_prio, delta;
+	int queued, old_prio,delta;
 	unsigned long flags;
 	struct rq *rq;
 
@@ -3907,20 +3652,22 @@ void set_user_nice(struct task_struct *p
 		p->static_prio = NICE_TO_PRIO(nice);
 		goto out_unlock;
 	}
-	array = p->array;
-	if (array) {
-		dequeue_task(p, array);
+	queued = task_queued(p);
+	if (queued) {
+		dequeue_task(p, rq);
 		dec_raw_weighted_load(rq, p);
 	}
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
 	old_prio = p->prio;
+	if (p->bonus > bonus(p))
+		p->bonus= bonus(p);
 	p->prio = effective_prio(p);
 	delta = p->prio - old_prio;
 
-	if (array) {
-		enqueue_task(p, array);
+	if (queued) {
+		enqueue_task(p, rq);
 		inc_raw_weighted_load(rq, p);
 		/*
 		 * If the task increased its priority or is running and
@@ -4043,18 +3790,13 @@ static inline struct task_struct *find_p
 /* Actually do priority change: must hold rq lock. */
 static void __setscheduler(struct task_struct *p, int policy, int prio)
 {
-	BUG_ON(p->array);
+	BUG_ON(task_queued(p));
 
 	p->policy = policy;
 	p->rt_priority = prio;
 	p->normal_prio = normal_prio(p);
 	/* we are holding p->pi_lock already */
 	p->prio = rt_mutex_getprio(p);
-	/*
-	 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
-	 */
-	if (policy == SCHED_BATCH)
-		p->sleep_avg = 0;
 	set_load_weight(p);
 }
 
@@ -4069,8 +3811,7 @@ static void __setscheduler(struct task_s
 int sched_setscheduler(struct task_struct *p, int policy,
 		       struct sched_param *param)
 {
-	int retval, oldprio, oldpolicy = -1;
-	struct prio_array *array;
+	int queued, retval, oldprio, oldpolicy = -1;
 	unsigned long flags;
 	struct rq *rq;
 
@@ -4144,12 +3885,12 @@ recheck:
 		spin_unlock_irqrestore(&p->pi_lock, flags);
 		goto recheck;
 	}
-	array = p->array;
-	if (array)
+	queued = task_queued(p);
+	if (queued)
 		deactivate_task(p, rq);
 	oldprio = p->prio;
 	__setscheduler(p, policy, param->sched_priority);
-	if (array) {
+	if (queued) {
 		__activate_task(p, rq);
 		/*
 		 * Reschedule if we are currently running on this runqueue and
@@ -4159,8 +3900,8 @@ recheck:
 		if (task_running(rq, p)) {
 			if (p->prio > oldprio)
 				resched_task(rq->curr);
-		} else if (TASK_PREEMPTS_CURR(p, rq))
-			resched_task(rq->curr);
+		} else
+			try_preempt(p, rq);
 	}
 	__task_rq_unlock(rq);
 	spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -4432,41 +4173,23 @@ asmlinkage long sys_sched_getaffinity(pi
 /**
  * sys_sched_yield - yield the current processor to other threads.
  *
- * This function yields the current CPU by moving the calling thread
- * to the expired array. If there are no other threads running on this
- * CPU then this function will return.
+ * This function yields the current CPU by dropping the priority of current
+ * to the lowest priority.
  */
 asmlinkage long sys_sched_yield(void)
 {
 	struct rq *rq = this_rq_lock();
-	struct prio_array *array = current->array, *target = rq->expired;
+	int newprio = current->prio;
 
 	schedstat_inc(rq, yld_cnt);
-	/*
-	 * We implement yielding by moving the task into the expired
-	 * queue.
-	 *
-	 * (special rule: RT tasks will just roundrobin in the active
-	 *  array.)
-	 */
-	if (rt_task(current))
-		target = rq->active;
+	newprio = current->prio;
+	schedstat_inc(rq, yld_cnt);
+	current->slice = slice(current);
+	current->time_slice = rr_interval(current);
+	if (likely(!rt_task(current)))
+		newprio = MIN_USER_PRIO;
 
-	if (array->nr_active == 1) {
-		schedstat_inc(rq, yld_act_empty);
-		if (!rq->expired->nr_active)
-			schedstat_inc(rq, yld_both_empty);
-	} else if (!rq->expired->nr_active)
-		schedstat_inc(rq, yld_exp_empty);
-
-	if (array != target) {
-		dequeue_task(current, array);
-		enqueue_task(current, target);
-	} else
-		/*
-		 * requeue_task is cheaper so perform that if possible.
-		 */
-		requeue_task(current, array);
+	requeue_task(current, rq, newprio);
 
 	/*
 	 * Since we are going to call schedule() anyway, there's
@@ -4677,7 +4400,7 @@ long sys_sched_rr_get_interval(pid_t pid
 		goto out_unlock;
 
 	jiffies_to_timespec(p->policy == SCHED_FIFO ?
-				0 : task_timeslice(p), &t);
+				0 : slice(p), &t);
 	read_unlock(&tasklist_lock);
 	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
 out_nounlock:
@@ -4772,8 +4495,6 @@ void __cpuinit init_idle(struct task_str
 	unsigned long flags;
 
 	idle->timestamp = sched_clock();
-	idle->sleep_avg = 0;
-	idle->array = NULL;
 	idle->prio = idle->normal_prio = MAX_PRIO;
 	idle->state = TASK_RUNNING;
 	idle->cpus_allowed = cpumask_of_cpu(cpu);
@@ -4893,7 +4614,7 @@ static int __migrate_task(struct task_st
 		goto out;
 
 	set_task_cpu(p, dest_cpu);
-	if (p->array) {
+	if (task_queued(p)) {
 		/*
 		 * Sync timestamp with rq_dest's before activating.
 		 * The same thing could be achieved by doing this step
@@ -4904,8 +4625,7 @@ static int __migrate_task(struct task_st
 				+ rq_dest->most_recent_timestamp;
 		deactivate_task(p, rq_src);
 		__activate_task(p, rq_dest);
-		if (TASK_PREEMPTS_CURR(p, rq_dest))
-			resched_task(rq_dest->curr);
+		try_preempt(p, rq_dest);
 	}
 	ret = 1;
 out:
@@ -5134,7 +4854,7 @@ static void migrate_dead_tasks(unsigned 
 
 	for (arr = 0; arr < 2; arr++) {
 		for (i = 0; i < MAX_PRIO; i++) {
-			struct list_head *list = &rq->arrays[arr].queue[i];
+			struct list_head *list = &rq->queue[i];
 
 			while (!list_empty(list))
 				migrate_dead(dead_cpu, list_entry(list->next,
@@ -6725,19 +6445,16 @@ int in_sched_functions(unsigned long add
 
 void __init sched_init(void)
 {
-	int i, j, k;
+	int i;
 
 	for_each_possible_cpu(i) {
-		struct prio_array *array;
 		struct rq *rq;
+		int j;
 
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
 		lockdep_set_class(&rq->lock, &rq->rq_lock_key);
 		rq->nr_running = 0;
-		rq->active = rq->arrays;
-		rq->expired = rq->arrays + 1;
-		rq->best_expired_prio = MAX_PRIO;
 
 #ifdef CONFIG_SMP
 		rq->sd = NULL;
@@ -6751,15 +6468,11 @@ void __init sched_init(void)
 #endif
 		atomic_set(&rq->nr_iowait, 0);
 
-		for (j = 0; j < 2; j++) {
-			array = rq->arrays + j;
-			for (k = 0; k < MAX_PRIO; k++) {
-				INIT_LIST_HEAD(array->queue + k);
-				__clear_bit(k, array->bitmap);
-			}
-			// delimiter for bitsearch
-			__set_bit(MAX_PRIO, array->bitmap);
-		}
+		for (j = 0; j < MAX_PRIO; j++)
+			INIT_LIST_HEAD(&rq->queue[j]);
+		memset(rq->bitmap, 0, BITS_TO_LONGS(MAX_PRIO)*sizeof(long));
+		/* delimiter for bitsearch */
+		__set_bit(MAX_PRIO, rq->bitmap);
 	}
 
 	set_load_weight(&init_task);
@@ -6815,10 +6528,10 @@ EXPORT_SYMBOL(__might_sleep);
 #ifdef CONFIG_MAGIC_SYSRQ
 void normalize_rt_tasks(void)
 {
-	struct prio_array *array;
 	struct task_struct *p;
 	unsigned long flags;
 	struct rq *rq;
+	int queued;
 
 	read_lock_irq(&tasklist_lock);
 	for_each_process(p) {
@@ -6828,11 +6541,11 @@ void normalize_rt_tasks(void)
 		spin_lock_irqsave(&p->pi_lock, flags);
 		rq = __task_rq_lock(p);
 
-		array = p->array;
-		if (array)
+		queued = task_queued(p);
+		if (queued)
 			deactivate_task(p, task_rq(p));
 		__setscheduler(p, SCHED_NORMAL, 0);
-		if (array) {
+		if (queued) {
 			__activate_task(p, task_rq(p));
 			resched_task(rq->curr);
 		}
