Index: linux-2.6.21-rc4-mm1/Documentation/sysctl/kernel.txt
===================================================================
--- linux-2.6.21-rc4-mm1.orig/Documentation/sysctl/kernel.txt	2007-03-21 20:53:50.000000000 +1100
+++ linux-2.6.21-rc4-mm1/Documentation/sysctl/kernel.txt	2007-03-22 11:41:54.000000000 +1100
@@ -43,6 +43,7 @@ show up in /proc/sys/kernel:
 - printk
 - real-root-dev               ==> Documentation/initrd.txt
 - reboot-cmd                  [ SPARC only ]
+- rr_interval
 - rtsig-max
 - rtsig-nr
 - sem
@@ -288,6 +289,17 @@ rebooting. ???
 
 ==============================================================
 
+rr_interval:
+
+This is the smallest duration that any cpu process scheduling unit
+will run for. Increasing this value can increase throughput of cpu
+bound tasks substantially but at the expense of increased latencies
+overall. This value is in milliseconds and the default value chosen depends
+on the number of cpus available at scheduler initialisation. Valid
+values are from 1-100.
+
+==============================================================
+
 rtsig-max & rtsig-nr:
 
 The file rtsig-max can be used to tune the maximum number
Index: linux-2.6.21-rc4-mm1/kernel/sched.c
===================================================================
--- linux-2.6.21-rc4-mm1.orig/kernel/sched.c	2007-03-21 20:53:50.000000000 +1100
+++ linux-2.6.21-rc4-mm1/kernel/sched.c	2007-03-26 09:46:40.000000000 +1000
@@ -16,7 +16,7 @@
  *		by Davide Libenzi, preemptible kernel bits by Robert Love.
  *  2003-09-03	Interactivity tuning by Con Kolivas.
  *  2004-04-02	Scheduler domains code by Nick Piggin
- *  2007-03-02	Rotating Staircase deadline scheduling policy by Con Kolivas
+ *  2007-03-02	Staircase deadline scheduling policy by Con Kolivas
  */
 
 #include <linux/mm.h>
@@ -88,13 +88,24 @@ unsigned long long __attribute__((weak))
 #define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
 #define SCHED_PRIO(p)		((p)+MAX_RT_PRIO)
 
+/*
+ * Some helpers for converting nanosecond timing to jiffy resolution
+ */
+#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
+#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
+#define JIFFY_NS		JIFFIES_TO_NS(1)
+#define NS_TO_MS(TIME)		((TIME) / 1000000)
+#define MS_TO_NS(TIME)		((TIME) * 1000000)
+
 #define TASK_PREEMPTS_CURR(p, curr)	((p)->prio < (curr)->prio)
 
 /*
  * This is the time all tasks within the same priority round robin.
- * Set to a minimum of 8ms. Scales with number of cpus and rounds with HZ.
+ * Value is in ms and set to a minimum of 8ms. Scales with number of cpus.
+ * Tunable via /proc interface.
  */
-static unsigned int rr_interval __read_mostly;
+int rr_interval __read_mostly;
+
 #define RR_INTERVAL		8
 #define DEF_TIMESLICE		(rr_interval * 20)
 
@@ -146,8 +157,10 @@ struct prio_array {
 
 	DECLARE_BITMAP(prio_bitmap, MAX_PRIO + 1);
 	/*
-	 * The bitmap of priorities queued; The dynamic bits can have
-	 * false positives. Include 1 bit for delimiter.
+	 * The bitmap of priorities queued for this array. While the expired
+	 * array will never have realtime tasks on it, it is simpler to have
+	 * equal sized bitmaps for a cheap array swap. Include 1 bit for
+	 * delimiter.
 	 */
 };
 
@@ -190,17 +203,14 @@ struct rq {
 	unsigned long next_balance;
 	struct mm_struct *prev_mm;
 
-	long prio_quota[PRIO_RANGE];
-	/*
-	 * The quota of ticks the runqueue runs at each dynamic priority
-	 * before cycling to the next priority.
-	 */
-
 	struct prio_array *active, *expired, arrays[2];
 	unsigned long *dyn_bitmap, *exp_bitmap;
 
-	int prio_level;
-	/* The current dynamic priority level this runqueue is at */
+	int prio_level, best_static_prio;
+	/*
+	 * The current dynamic priority level this runqueue is at, and the
+	 * best static priority queued this major rotation.
+	 */
 
 	unsigned long prio_rotation;
 	/* How many times we have rotated the priority queue */
@@ -648,26 +658,13 @@ static inline int task_queued(struct tas
 	return !list_empty(&task->run_list);
 }
 
-static inline void set_task_entitlement(struct task_struct *p)
-{
-	__set_bit(USER_PRIO(p->prio), p->bitmap);
-	p->time_slice = p->quota;
-}
-
-/*
- * There is no specific hard accounting. The dynamic bits can have
- * false positives. rt_tasks can only be on the active queue.
- */
 static inline void set_dynamic_bit(struct task_struct *p, struct rq *rq)
 {
 	__set_bit(p->prio, p->array->prio_bitmap);
 }
 
 /*
- * Removing from a runqueue. While we don't know with absolute certainty
- * where this task really is, the p->array and p->prio are very likely
- * so we check that queue to see if we can clear that bit to take some
- * load off finding false positives in next_dynamic_task().
+ * Removing from a runqueue.
  */
 static void dequeue_task(struct task_struct *p, struct rq *rq)
 {
@@ -684,21 +681,36 @@ static inline void task_new_array(struct
 {
 	bitmap_zero(p->bitmap, PRIO_RANGE);
 	p->rotation = rq->prio_rotation;
+	p->time_slice = p->quota;
 }
 
+/* Find the first slot from the relevant prio_matrix entry */
 static inline int first_prio_slot(struct task_struct *p)
 {
 	return SCHED_PRIO(find_first_zero_bit(
 		prio_matrix[USER_PRIO(p->static_prio)], PRIO_RANGE));
 }
 
-static inline int next_prio_slot(struct task_struct *p, int prio)
+/*
+ * Find the first unused slot by this task that is also in its prio_matrix
+ * level. Ensure that the prio_level is not unnecessarily low by checking
+ * that best_static_prio this major rotation was not a niced task.
+ * SCHED_BATCH tasks do not perform this check so they do not induce
+ * latencies in tasks of any nice level.
+ */
+static inline int next_entitled_slot(struct task_struct *p, struct rq *rq)
 {
 	DECLARE_BITMAP(tmp, PRIO_RANGE);
+	int search_prio;
+
+	if (p->static_prio < rq->best_static_prio && p->policy != SCHED_BATCH)
+		search_prio = MAX_RT_PRIO;
+	else
+		search_prio = rq->prio_level;
 	bitmap_or(tmp, p->bitmap, prio_matrix[USER_PRIO(p->static_prio)],
 		  PRIO_RANGE);
 	return SCHED_PRIO(find_next_zero_bit(tmp, PRIO_RANGE,
-		USER_PRIO(prio)));
+		USER_PRIO(search_prio)));
 }
 
 static void queue_expired(struct task_struct *p, struct rq *rq)
@@ -707,41 +719,30 @@ static void queue_expired(struct task_st
 	task_new_array(p, rq);
 	p->prio = p->normal_prio = first_prio_slot(p);
 	p->time_slice = p->quota;
+	p->rotation = rq->prio_rotation;
 }
 
-#define rq_quota(rq, prio)	((rq)->prio_quota[USER_PRIO(prio)])
-
 /*
- * recalc_task_prio determines what prio a non rt_task will be
+ * recalc_task_prio determines what priority a non rt_task will be
  * queued at. If the task has already been running during this runqueue's
  * major rotation (rq->prio_rotation) then it continues at the same
  * priority if it has tick entitlement left. If it does not have entitlement
  * left, it finds the next priority slot according to its nice value that it
  * has not extracted quota from. If it has not run during this major
- * rotation, it starts at its static priority and has its bitmap quota
+ * rotation, it starts at the next_entitled_slot and has its bitmap quota
  * cleared. If it does not have any slots left it has all its slots reset and
- * is queued on the expired at its static priority.
+ * is queued on the expired at its first_prio_slot.
  */
 static void recalc_task_prio(struct task_struct *p, struct rq *rq)
 {
 	struct prio_array *array = rq->active;
-	int queue_prio, search_prio = MAX_RT_PRIO;
-
-	/*
-	 * SCHED_BATCH tasks never start at better priority than any other
-	 * task that is already running since they are flagged as latency
-	 * insensitive. This means they never cause greater latencies in other
-	 * non SCHED_BATCH tasks of the same nice level, but they still will
-	 * not be exposed to high latencies themselves.
-	 */
-	if (unlikely(p->policy == SCHED_BATCH))
-		search_prio = rq->prio_level;
+	int queue_prio;
 
 	if (p->rotation == rq->prio_rotation) {
 		if (p->array == array) {
-			if (p->time_slice && rq_quota(rq, p->prio))
+			if (p->time_slice > 0)
 				return;
-			search_prio = p->prio;
+			p->time_slice = p->quota;
 		} else if (p->array == rq->expired) {
 			queue_expired(p, rq);
 			return;
@@ -750,22 +751,19 @@ static void recalc_task_prio(struct task
 	} else
 		task_new_array(p, rq);
 
-	queue_prio = next_prio_slot(p, search_prio);
+	queue_prio = next_entitled_slot(p, rq);
 	if (queue_prio >= MAX_PRIO) {
 		queue_expired(p, rq);
 		return;
 	}
-	rq_quota(rq, queue_prio) += p->quota;
 	p->prio = p->normal_prio = queue_prio;
 	p->array = array;
-	set_task_entitlement(p);
+	__set_bit(USER_PRIO(p->prio), p->bitmap);
 }
 
 /*
  * Adding to a runqueue. The dynamic priority queue that it is added to is
- * determined by the priority rotation of the runqueue it is being added to
- * and the quota still available in the task in p->bitmap and p->time_slice
- * (see recalc_task_prio above).
+ * determined by recalc_task_prio() above.
  */
 static inline void __enqueue_task(struct task_struct *p, struct rq *rq)
 {
@@ -802,7 +800,7 @@ static void requeue_task(struct task_str
 	list_move_tail(&p->run_list, p->array->queue + p->prio);
 	if (!rt_task(p)) {
 		if (list_empty(old_array->queue + old_prio))
-			__clear_bit(old_prio, p->array->prio_bitmap);
+			__clear_bit(old_prio, old_array->prio_bitmap);
 		set_dynamic_bit(p, rq);
 	}
 }
@@ -820,13 +818,14 @@ static void requeue_task(struct task_str
  * task_timeslice - the total duration a task can run during one major
  * rotation.
  */
-static inline unsigned int task_timeslice(struct task_struct *p)
+static inline int task_timeslice(struct task_struct *p)
 {
-	unsigned int slice, rr;
+	int slice, rr;
 
 	slice = rr = p->quota;
 	if (!rt_task(p))
 		slice += (PRIO_RANGE - 1 - TASK_USER_PRIO(p)) * rr;
+	slice = NS_TO_JIFFIES(slice) ? : 1;
 	return slice;
 }
 
@@ -907,7 +906,7 @@ static inline int normal_prio(struct tas
 	if (has_rt_policy(p))
 		return MAX_RT_PRIO-1 - p->rt_priority;
 	/* Other tasks all have normal_prio set in recalc_task_prio */
-	if (likely(p->prio >= MAX_RT_PRIO))
+	if (likely(p->prio >= MAX_RT_PRIO && p->prio < MAX_PRIO))
 		return p->prio;
 	else
 		return p->static_prio;
@@ -933,22 +932,24 @@ static int effective_prio(struct task_st
 }
 
 /*
- * All tasks have quotas based on rr_interval. From nice 0 to 19 they are
- * all equal to it and below zero they get exponentially larger making their
- * effective quota significantly larger. rt tasks all get rr_interval.
- * ie nice -6..19 = rr_interval. nice -10 = 2.5 * rr_interval
- * nice -20 = 10 * rr_interval. This makes the ratios between -20 and 0
- * similar to the ratios between 0 and +19.
+ * All tasks have quotas based on rr_interval. RT tasks all get rr_interval.
+ * From nice 1 to 19 they get half of rr_interval, but only if that half is
+ * still at least one tick. Below nice -6 they get progressively larger.
+ * ie nice -6..0 = rr_interval. nice -10 = 2.5 * rr_interval
+ * nice -20 = 10 * rr_interval. nice 1-19 = rr_interval / 2.
  */
 static unsigned int rr_quota(struct task_struct *p)
 {
-	int neg_nice = -TASK_NICE(p), rr = rr_interval;
+	int nice = TASK_NICE(p), rr = rr_interval;
 
-	if (neg_nice > 6 && !rt_task(p)) {
-		rr *= neg_nice * neg_nice;
-		rr /= 40;
+	if (!rt_task(p)) {
+		if (nice < -6) {
+			rr *= nice * nice;
+			rr /= 40;
+		} else if (nice > 0 && (rr * HZ / 1000 / 2) > 0)
+			rr /= 2;
 	}
-	return rr;
+	return MS_TO_NS(rr);
 }
 
 /*
@@ -1630,7 +1631,9 @@ void fastcall sched_fork(struct task_str
 	 * resulting in more scheduling fairness.
 	 */
 	local_irq_disable();
-	p->time_slice = (current->time_slice + 1) >> 1;
+	if (unlikely(current->time_slice < 2))
+		current->time_slice = 2;
+	p->time_slice = current->time_slice >> 1;
 	/*
 	 * The remainder of the first timeslice might be recovered by
 	 * the parent if the child exits early enough.
@@ -1638,15 +1641,6 @@ void fastcall sched_fork(struct task_str
 	p->first_time_slice = 1;
 	current->time_slice >>= 1;
 	p->timestamp = sched_clock();
-	if (!current->time_slice) {
-		/*
-		 * This case happens when the parent has only a single jiffy
-		 * left from its timeslice. Taking the runqueue lock is not
-		 * a problem.
-		 */
-		current->time_slice = 1;
-		task_running_tick(cpu_rq(cpu), current);
-	}
 	local_irq_enable();
 out:
 	put_cpu();
@@ -1720,14 +1714,16 @@ void fastcall wake_up_new_task(struct ta
  */
 void fastcall sched_exit(struct task_struct *p)
 {
+	struct task_struct *parent;
 	unsigned long flags;
 	struct rq *rq;
 
-	rq = task_rq_lock(p->parent, &flags);
-	if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
-		p->parent->time_slice += p->time_slice;
-		if (unlikely(p->parent->time_slice > p->quota))
-			p->parent->time_slice = p->quota;
+	parent = p->parent;
+	rq = task_rq_lock(parent, &flags);
+	if (p->first_time_slice && task_cpu(p) == task_cpu(parent)) {
+		parent->time_slice += p->time_slice;
+		if (unlikely(parent->time_slice > parent->quota))
+			parent->time_slice = parent->quota;
 	}
 	task_rq_unlock(rq, &flags);
 }
@@ -2057,25 +2053,55 @@ void sched_exec(void)
 }
 
 /*
+ * This is a unique version of enqueue_task for the SMP case where a task
+ * has just been moved across runqueues. It uses the information from the
+ * old runqueue to help it make a decision much like recalc_task_prio. As
+ * the new runqueue is almost certainly at a different prio_level than the
+ * src_rq it is cheapest just to pick the next entitled slot.
+ */
+static inline void enqueue_pulled_task(struct rq *src_rq, struct rq *rq,
+				       struct task_struct *p)
+{
+	int queue_prio;
+
+	p->array = rq->active;
+	if (!rt_task(p)) {
+		if (p->rotation == src_rq->prio_rotation) {
+			if (p->array == src_rq->expired) {
+				queue_expired(p, rq);
+				goto out_queue;
+			}
+			if (p->time_slice < 0)
+				task_new_array(p, rq);
+		} else
+			task_new_array(p, rq);
+	}
+	queue_prio = next_entitled_slot(p, rq);
+	if (queue_prio >= MAX_PRIO) {
+		queue_expired(p, rq);
+		goto out_queue;
+	}
+	p->prio = queue_prio;
+out_queue:
+	p->normal_prio = p->prio;
+	p->rotation = rq->prio_rotation;
+	sched_info_queued(p);
+	set_dynamic_bit(p, rq);
+	list_add_tail(&p->run_list, p->array->queue + p->prio);
+}
+
+/*
  * pull_task - move a task from a remote runqueue to the local runqueue.
  * Both runqueues must be locked.
  */
-static void pull_task(struct rq *src_rq, struct prio_array *src_array,
-		      struct task_struct *p, struct rq *this_rq,
-		      int this_cpu)
+static void pull_task(struct rq *src_rq, struct task_struct *p,
+		      struct rq *this_rq, int this_cpu)
 {
 	dequeue_task(p, src_rq);
 	dec_nr_running(p, src_rq);
 	set_task_cpu(p, this_cpu);
 	inc_nr_running(p, this_rq);
-
-	/*
-	 * If this task has already been running on src_rq this priority
-	 * cycle, make the new runqueue think it has been on its cycle
-	 */
-	if (p->rotation == src_rq->prio_rotation)
-		p->rotation = this_rq->prio_rotation;
-	enqueue_task(p, this_rq);
+	enqueue_pulled_task(src_rq, this_rq, p);
 	p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
 				+ this_rq->most_recent_timestamp;
 	try_preempt(p, this_rq);
@@ -2220,7 +2246,7 @@ skip_queue:
 		goto skip_bitmap;
 	}
 
-	pull_task(busiest, array, tmp, this_rq, this_cpu);
+	pull_task(busiest, tmp, this_rq, this_cpu);
 	pulled++;
 	rem_load_move -= tmp->load_weight;
 
@@ -3181,8 +3207,63 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 static inline void
 update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
 {
-	p->sched_time += now - p->last_ran;
-	p->last_ran = rq->most_recent_timestamp = now;
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	cputime64_t time_diff;
+
+	/* Sanity check. It should never go backwards or ruin accounting */
+	if (unlikely(now < p->last_ran))
+		goto out_set;
+	/* All the userspace visible cpu accounting is done here */
+	time_diff = now - p->last_ran;
+	p->sched_time += time_diff;
+	if (p != rq->idle) {
+		cputime_t utime_diff = time_diff;
+
+		if (TASK_NICE(p) > 0) {
+			cpustat->nice_ns = cputime64_add(cpustat->nice_ns,
+							 time_diff);
+			if (cpustat->nice_ns > JIFFY_NS) {
+				cpustat->nice_ns =
+					cputime64_sub(cpustat->nice_ns,
+					JIFFY_NS);
+				cpustat->nice =
+					cputime64_add(cpustat->nice, 1);
+			}
+		} else {
+			cpustat->user_ns = cputime64_add(cpustat->user_ns,
+							 time_diff);
+			if (cpustat->user_ns > JIFFY_NS) {
+				cpustat->user_ns =
+					cputime64_sub(cpustat->user_ns,
+					JIFFY_NS);
+				cpustat ->user =
+					cputime64_add(cpustat->user, 1);
+			}
+		}
+		p->utime_ns = cputime_add(p->utime_ns, utime_diff);
+		if (p->utime_ns > JIFFY_NS) {
+			p->utime_ns = cputime_sub(p->utime_ns, JIFFY_NS);
+			p->utime = cputime_add(p->utime,
+					       jiffies_to_cputime(1));
+		}
+		/* cpu scheduler quota accounting is performed here */
+		if (p->policy != SCHED_FIFO)
+			p->time_slice -= time_diff;
+
+	} else {
+		cpustat->idle_ns = cputime64_add(cpustat->idle_ns, time_diff);
+		if (cpustat->idle_ns > JIFFY_NS) {
+			cpustat->idle_ns = cputime64_sub(cpustat->idle_ns,
+							 JIFFY_NS);
+			cpustat->idle = cputime64_add(cpustat->idle, 1);
+		}
+	}
+out_set:
+	/*
+	 * We still need to set these values even if the clock appeared to
+	 * go backwards in case _this_ is the correct timestamp.
+	 */
+	rq->most_recent_timestamp = p->last_ran = now;
 }
 
 /*
@@ -3247,8 +3328,6 @@ void account_system_time(struct task_str
 		cpustat->system = cputime64_add(cpustat->system, tmp);
 	else if (atomic_read(&rq->nr_iowait) > 0)
 		cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
-	else
-		cpustat->idle = cputime64_add(cpustat->idle, tmp);
 	/* Account for system time used */
 	acct_update_integrals(p);
 }
@@ -3283,7 +3362,6 @@ static void task_expired_entitlement(str
 	struct prio_array *old_array;
 	int old_prio;
 
-	set_tsk_need_resched(p);
 	if (unlikely(p->first_time_slice))
 		p->first_time_slice = 0;
 	if (rt_task(p)) {
@@ -3297,114 +3375,24 @@ static void task_expired_entitlement(str
 	requeue_task(p, rq, old_array, old_prio);
 }
 
-/*
- * A major priority rotation occurs when all priority quotas for this array
- * have been exhausted.
- */
-static inline void major_prio_rotation(struct rq *rq)
-{
-	struct prio_array *new_array = rq->expired;
-
-	rq->expired = rq->active;
-	rq->active = new_array;
-	rq->exp_bitmap = rq->expired->prio_bitmap;
-	rq->dyn_bitmap = rq->active->prio_bitmap;
-	rq->prio_rotation++;
-}
-
-/*
- * This is the heart of the virtual deadline priority management.
- *
- * We have used up the quota allocated to this priority level so we rotate
- * the prio_level of the runqueue to the next lowest priority. We merge any
- * remaining tasks at this level current_queue with the next priority and
- * reset this level's queue. MAX_PRIO - 1 is a special case where we perform
- * a major rotation.
- */
-static inline void rotate_runqueue_priority(struct rq *rq)
-{
-	int new_prio_level;
-	struct prio_array *array;
-
-	/*
-	 * Make sure we don't have tasks still on the active array that
-	 * haven't run due to not preempting a lower priority task. This can
-	 * happen on list merging or smp balancing.
-	 */
-	if (unlikely(sched_find_first_bit(rq->dyn_bitmap) < rq->prio_level))
-		return;
-
-	array = rq->active;
-	if (rq->prio_level > MAX_PRIO - 2) {
-		/* Major rotation required */
-		struct prio_array *new_queue = rq->expired;
-
-		/*
-		 * On a major rotation we move everything remaining to best
-		 * priority on the new array. The priority matrix bitmap will
-		 * ensure tasks only get the slots each static priority
-		 * deserves.
-		 */
-		new_prio_level = MAX_RT_PRIO;
-		if (!list_empty(array->queue + rq->prio_level)) {
-			list_splice_tail_init(array->queue + rq->prio_level,
-					 new_queue->queue + new_prio_level);
-		}
-		memset(rq->prio_quota, 0, ARRAY_SIZE(rq->prio_quota));
-		major_prio_rotation(rq);
-	} else {
-		/* Minor rotation */
-		new_prio_level = rq->prio_level + 1;
-		__clear_bit(rq->prio_level, rq->dyn_bitmap);
-		if (!list_empty(array->queue + rq->prio_level)) {
-			list_splice_tail_init(array->queue + rq->prio_level,
-					 array->queue + new_prio_level);
-			__set_bit(new_prio_level, rq->dyn_bitmap);
-		}
-		rq_quota(rq, rq->prio_level) = 0;
-	}
-	rq->prio_level = new_prio_level;
-	/*
-	 * As we are merging to a prio_level that may not have anything in
-	 * its quota we add 1 to ensure the tasks get to run in schedule() to
-	 * add their quota to it.
-	 */
-	rq_quota(rq, new_prio_level) += 1;
-}
-
 static void task_running_tick(struct rq *rq, struct task_struct *p)
 {
-	if (unlikely(!task_queued(p))) {
-		/* Task has expired but was not scheduled yet */
-		set_tsk_need_resched(p);
-		return;
-	}
 	/* SCHED_FIFO tasks never run out of timeslice. */
 	if (unlikely(p->policy == SCHED_FIFO))
 		return;
 
+	if (p->time_slice > 0)
+		return;
 	spin_lock(&rq->lock);
-	/*
-	 * Accounting is performed by both the task and the runqueue. This
-	 * allows frequently sleeping tasks to get their proper quota of
-	 * cpu as the runqueue will have their quota still available at
-	 * the appropriate priority level. It also means frequently waking
-	 * tasks that might miss the scheduler_tick() will get forced down
-	 * priority regardless.
-	 */
-	if (!--p->time_slice)
-		task_expired_entitlement(rq, p);
-	/*
-	 * We only employ the deadline mechanism if we run over the quota.
-	 * It allows aliasing problems around the scheduler_tick to be
-	 * less harmful.
-	 */
-	if (!rt_task(p) && --rq_quota(rq, rq->prio_level) < 0) {
-		if (unlikely(p->first_time_slice))
-			p->first_time_slice = 0;
-		rotate_runqueue_priority(rq);
+	if (unlikely(!task_queued(p))) {
+		/* Task has expired but was not scheduled off yet */
 		set_tsk_need_resched(p);
+		goto out_unlock;
 	}
+	/* p->time_slice <= 0 */
+	task_expired_entitlement(rq, p);
+	set_tsk_need_resched(p);
+out_unlock:
 	spin_unlock(&rq->lock);
 }
 
@@ -3469,87 +3457,45 @@ EXPORT_SYMBOL(sub_preempt_count);
 
 #endif
 
-/* Is a dynamic_prio part of the allocated slots for this static_prio */
-static inline int entitled_slot(int static_prio, int dynamic_prio)
-{
-	return !test_bit(USER_PRIO(dynamic_prio),
-		prio_matrix[USER_PRIO(static_prio)]);
-}
-
 /*
- * If a task is queued at a priority that isn't from its bitmap we exchange
- * by setting one of the entitlement bits.
- */
-static inline void exchange_slot(struct task_struct *p, int prio)
-{
-	int slot = next_prio_slot(p, prio);
-
-	if (slot < MAX_PRIO)
-		__set_bit(USER_PRIO(slot), p->bitmap);
-}
-
-/*
- * next_dynamic_task finds the next suitable dynamic task. As the dyn_bitmap
- * contains all the active and expired dynamic tasks sequentially we only
- * need to do one bitmap lookup.
+ * next_dynamic_task finds the next suitable dynamic task.
  */
 static inline struct task_struct *next_dynamic_task(struct rq *rq, int idx)
 {
 	struct task_struct *next;
 	struct list_head *queue;
 	struct prio_array *array = rq->active;
-	int expirations = 0;
 
 retry:
 	if (idx >= MAX_PRIO) {
-		BUG_ON(++expirations > 1);
-		/*
-		 * We have selected a bit from the expired range so there are
-		 * no more tasks in the active array.
-		 */
-		major_prio_rotation(rq);
-		array = rq->active;
+		/* There are no more tasks in the active array. Swap arrays */
+		array = rq->expired;
+		rq->expired = rq->active;
+		rq->active = array;
+		rq->exp_bitmap = rq->expired->prio_bitmap;
+		rq->dyn_bitmap = rq->active->prio_bitmap;
+		rq->best_static_prio = MAX_PRIO - 1;
+		rq->prio_rotation++;
 		idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO);
 	}
-	if (unlikely(list_empty(array->queue + idx))) {
+	queue = array->queue + idx;
+	next = list_entry(queue->next, struct task_struct, run_list);
+	if (unlikely(next->time_slice < 0)) {
 		/*
-		 * This can happen because they are not always cleared on
-		 * dequeue_task since they may have been dequeued while
-		 * waiting on a runqueue and a rotation has occurred in the
-		 * interim. A very rare occurrence.
+		 * Unlucky enough that this task ran out of time_slice
+		 * before it hit a scheduler_tick so it should have its
+		 * priority reassessed and choose another task (possibly
+		 * the same one)
 		 */
-		__clear_bit(idx, rq->dyn_bitmap);
-		idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, idx + 1);
+		task_expired_entitlement(rq, next);
+		idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO);
 		goto retry;
 	}
-	queue = array->queue + idx;
-	next = list_entry(queue->next, struct task_struct, run_list);
-	/*
-	 * When the task is chosen it is checked to see if its quota has been
-	 * added to this runqueue level which is only performed once per
-	 * level per major rotation for each running task.
-	 */
-	if (next->rotation != rq->prio_rotation) {
-			/* Task has moved during major rotation */
-			task_new_array(next, rq);
-			if (!entitled_slot(next->static_prio, idx))
-				exchange_slot(next, idx);
-			set_task_entitlement(next);
-			rq_quota(rq, idx) += next->quota;
-	} else if (!test_bit(USER_PRIO(idx), next->bitmap)) {
-			/* Task has moved during minor rotation */
-			if (!entitled_slot(next->static_prio, idx))
-				exchange_slot(next, idx);
-			set_task_entitlement(next);
-			rq_quota(rq, idx) += next->quota;
-	}
 	rq->prio_level = idx;
-	/*
-	 * next needs to have its prio and array reset here in case the
-	 * values are wrong due to priority rotation.
-	 */
-	next->prio = idx;
-	next->array = array;
+	next->rotation = rq->prio_rotation;
+	if (next->static_prio < rq->best_static_prio &&
+	    next->policy != SCHED_BATCH)
+		rq->best_static_prio = next->static_prio;
 	return next;
 }
 
@@ -3632,8 +3578,12 @@ need_resched_nonpreemptible:
 		next = list_entry(queue->next, struct task_struct, run_list);
 	}
 switch_tasks:
-	if (next == rq->idle)
+	if (next == rq->idle) {
+		rq->best_static_prio = MAX_PRIO - 1;
+		rq->prio_level = MAX_RT_PRIO;
+		rq->prio_rotation++;
 		schedstat_inc(rq, sched_goidle);
+	}
 	prefetch(next);
 	prefetch_stack(next);
 	clear_tsk_need_resched(prev);
@@ -4648,8 +4598,9 @@ asmlinkage long sys_sched_getaffinity(pi
  * sys_sched_yield - yield the current processor to other threads.
  *
  * This function yields the current CPU by moving the calling thread
- * to the end of its current priority queue. If there are no other
- * threads running on this cpu this function will return.
+ * to the expired array if SCHED_NORMAL or the end of its current priority
+ * queue if a realtime task. If there are no other threads running on this
+ * cpu this function will return.
  */
 asmlinkage long sys_sched_yield(void)
 {
@@ -4659,8 +4610,15 @@ asmlinkage long sys_sched_yield(void)
 	schedstat_inc(rq, yld_cnt);
 	if (rq->nr_running == 1)
 		schedstat_inc(rq, yld_both_empty);
-	else
-		list_move_tail(&p->run_list, p->array->queue + p->prio);
+	else {
+		struct prio_array *old_array = p->array;
+		int old_prio = p->prio;
+
+		/* p->prio will be updated in requeue_task via queue_expired */
+		if (!rt_task(p))
+			p->array = rq->expired;
+		requeue_task(p, rq, old_array, old_prio);
+	}
 
 	/*
 	 * Since we are going to call schedule() anyway, there's
@@ -4999,9 +4957,9 @@ void __cpuinit init_idle(struct task_str
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 
-	bitmap_zero(idle->bitmap, PRIO_RANGE + 1);
-	idle->timestamp = sched_clock();
-	idle->array = NULL;
+	bitmap_zero(idle->bitmap, PRIO_RANGE);
+	idle->timestamp = idle->last_ran = sched_clock();
+	idle->array = rq->active;
 	idle->prio = idle->normal_prio = NICE_TO_PRIO(0);
 	idle->state = TASK_RUNNING;
 	idle->cpus_allowed = cpumask_of_cpu(cpu);
@@ -7083,6 +7041,7 @@ void __init sched_init(void)
 		lockdep_set_class(&rq->lock, &rq->rq_lock_key);
 		rq->nr_running = 0;
 		rq->prio_rotation = 0;
+		rq->best_static_prio = MAX_PRIO - 1;
 		rq->prio_level = MAX_RT_PRIO;
 		rq->active = rq->arrays;
 		rq->expired = rq->arrays + 1;
@@ -7110,15 +7069,13 @@ void __init sched_init(void)
 			/* delimiter for bitsearch */
 			__set_bit(MAX_PRIO, array->prio_bitmap);
 		}
-		for (k = 0; k < PRIO_RANGE; k++)
-			rq->prio_quota[k] = 0;
 		highest_cpu = i;
 
 		/* Every added cpu increases the rr_interval */
 		rr_us += rr_inc;
 		rr_inc /= 2;
 	}
-	rr_interval = rr_us / 1000 ? : 1;
+	rr_interval = rr_us / 1000;
 
 	set_load_weight(&init_task);
 
Index: linux-2.6.21-rc4-mm1/kernel/sysctl.c
===================================================================
--- linux-2.6.21-rc4-mm1.orig/kernel/sysctl.c	2007-03-21 20:53:50.000000000 +1100
+++ linux-2.6.21-rc4-mm1/kernel/sysctl.c	2007-03-22 11:41:54.000000000 +1100
@@ -79,6 +79,7 @@ extern int percpu_pagelist_fraction;
 extern int compat_log;
 extern int maps_protect;
 extern int print_fatal_signals;
+extern int rr_interval;
 
 #if defined(CONFIG_ADAPTIVE_READAHEAD)
 extern int readahead_ratio;
@@ -167,6 +168,13 @@ int sysctl_legacy_va_layout;
 #endif
 
 
+/* Constants for minimum and maximum testing in kern_table and vm_table.
+   We use these as one-element integer vectors. */
+static int  __read_mostly zero;
+static int  __read_mostly one = 1;
+static int  __read_mostly one_hundred = 100;
+
+
 /* The default sysctl tables: */
 
 static ctl_table root_table[] = {
@@ -515,6 +523,17 @@ static ctl_table kern_table[] = {
 		.mode		= 0444,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "rr_interval",
+		.data		= &rr_interval,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &one,
+		.extra2		= &one_hundred,
+	},
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 	{
 		.ctl_name       = KERN_UNKNOWN_NMI_PANIC,
@@ -631,12 +650,6 @@ static ctl_table kern_table[] = {
 	{ .ctl_name = 0 }
 };
 
-/* Constants for minimum and maximum testing in vm_table.
-   We use these as one-element integer vectors. */
-static int zero;
-static int one_hundred = 100;
-
-
 static ctl_table vm_table[] = {
 	{
 		.ctl_name	= VM_OVERCOMMIT_MEMORY,
Index: linux-2.6.21-rc4-mm1/include/linux/kernel_stat.h
===================================================================
--- linux-2.6.21-rc4-mm1.orig/include/linux/kernel_stat.h	2007-03-26 09:04:10.000000000 +1000
+++ linux-2.6.21-rc4-mm1/include/linux/kernel_stat.h	2007-03-26 09:04:54.000000000 +1000
@@ -16,11 +16,14 @@
 
 struct cpu_usage_stat {
 	cputime64_t user;
+	cputime64_t user_ns;
 	cputime64_t nice;
+	cputime64_t nice_ns;
 	cputime64_t system;
 	cputime64_t softirq;
 	cputime64_t irq;
 	cputime64_t idle;
+	cputime64_t idle_ns;
 	cputime64_t iowait;
 	cputime64_t steal;
 };
Index: linux-2.6.21-rc4-mm1/include/linux/sched.h
===================================================================
--- linux-2.6.21-rc4-mm1.orig/include/linux/sched.h	2007-03-26 09:04:10.000000000 +1000
+++ linux-2.6.21-rc4-mm1/include/linux/sched.h	2007-03-26 09:36:43.000000000 +1000
@@ -853,18 +853,15 @@ struct task_struct {
 
 	unsigned int policy;
 	cpumask_t cpus_allowed;
-	unsigned int time_slice;
+	int time_slice;
 	/*
 	 * How much this task is entitled to run at the current priority
 	 * before being requeued at a lower priority.
 	 */
+	int quota;
+	/* How much this task receives at each priority level */
 	unsigned int first_time_slice;
 	/* Is this the very first time_slice this task has ever run. */
-	unsigned int quota;
-	/*
-	 * How much this task contributes to the current priority queue
-	 * length
-	 */
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	struct sched_info sched_info;
@@ -911,7 +908,7 @@ struct task_struct {
 	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
 
 	unsigned int rt_priority;
-	cputime_t utime, stime;
+	cputime_t utime, utime_ns, stime;
 	unsigned long nvcsw, nivcsw; /* context switch counts */
 	struct timespec start_time;
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
Index: linux-2.6.21-rc4-mm1/kernel/timer.c
===================================================================
--- linux-2.6.21-rc4-mm1.orig/kernel/timer.c	2007-03-26 09:04:10.000000000 +1000
+++ linux-2.6.21-rc4-mm1/kernel/timer.c	2007-03-26 09:04:54.000000000 +1000
@@ -1196,10 +1196,9 @@ void update_process_times(int user_tick)
 	int cpu = smp_processor_id();
 
 	/* Note: this timer irq context must be accounted for as well. */
-	if (user_tick)
-		account_user_time(p, jiffies_to_cputime(1));
-	else
+	if (!user_tick)
 		account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
+	/* User time is accounted for in update_cpu_clock() in kernel/sched.c */
 	run_local_timers();
 	if (rcu_pending(cpu))
 		rcu_check_callbacks(cpu, user_tick);
Index: linux-2.6.21-rc4-mm1/Documentation/cpu-load.txt
===================================================================
--- linux-2.6.21-rc4-mm1.orig/Documentation/cpu-load.txt	2007-03-26 09:04:10.000000000 +1000
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
@@ -1,113 +0,0 @@
-CPU load
---------
-
-Linux exports various bits of information via `/proc/stat' and
-`/proc/uptime' that userland tools, such as top(1), use to calculate
-the average time system spent in a particular state, for example:
-
-    $ iostat
-    Linux 2.6.18.3-exp (linmac)     02/20/2007
-
-    avg-cpu:  %user   %nice %system %iowait  %steal   %idle
-              10.01    0.00    2.92    5.44    0.00   81.63
-
-    ...
-
-Here the system thinks that over the default sampling period the
-system spent 10.01% of the time doing work in user space, 2.92% in the
-kernel, and was overall 81.63% of the time idle.
-
-In most cases the `/proc/stat' information reflects the reality quite
-closely, however due to the nature of how/when the kernel collects
-this data sometimes it can not be trusted at all.
-
-So how is this information collected?  Whenever timer interrupt is
-signalled the kernel looks what kind of task was running at this
-moment and increments the counter that corresponds to this tasks
-kind/state.  The problem with this is that the system could have
-switched between various states multiple times between two timer
-interrupts yet the counter is incremented only for the last state.
-
-
-Example
--------
-
-If we imagine the system with one task that periodically burns cycles
-in the following manner:
-
- time line between two timer interrupts
-|--------------------------------------|
- ^                                    ^
- |_ something begins working          |
-                                      |_ something goes to sleep
-                                     (only to be awaken quite soon)
-
-In the above situation the system will be 0% loaded according to the
-`/proc/stat' (since the timer interrupt will always happen when the
-system is executing the idle handler), but in reality the load is
-closer to 99%.
-
-One can imagine many more situations where this behavior of the kernel
-will lead to quite erratic information inside `/proc/stat'.
-
-
-/* gcc -o hog smallhog.c */
-#include <time.h>
-#include <limits.h>
-#include <signal.h>
-#include <sys/time.h>
-#define HIST 10
-
-static volatile sig_atomic_t stop;
-
-static void sighandler (int signr)
-{
-     (void) signr;
-     stop = 1;
-}
-static unsigned long hog (unsigned long niters)
-{
-     stop = 0;
-     while (!stop && --niters);
-     return niters;
-}
-int main (void)
-{
-     int i;
-     struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 },
-                             .it_value = { .tv_sec = 0, .tv_usec = 1 } };
-     sigset_t set;
-     unsigned long v[HIST];
-     double tmp = 0.0;
-     unsigned long n;
-     signal (SIGALRM, &sighandler);
-     setitimer (ITIMER_REAL, &it, NULL);
-
-     hog (ULONG_MAX);
-     for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX);
-     for (i = 0; i < HIST; ++i) tmp += v[i];
-     tmp /= HIST;
-     n = tmp - (tmp / 3.0);
-
-     sigemptyset (&set);
-     sigaddset (&set, SIGALRM);
-
-     for (;;) {
-         hog (n);
-         sigwait (&set, &i);
-     }
-     return 0;
-}
-
-
-References
-----------
-
-http://lkml.org/lkml/2007/2/12/6
-Documentation/filesystems/proc.txt (1.8)
-
-
-Thanks
-------
-
-Con Kolivas, Pavel Machek
Index: linux-2.6.21-rc4-mm1/include/linux/init_task.h
===================================================================
--- linux-2.6.21-rc4-mm1.orig/include/linux/init_task.h	2007-03-26 09:30:22.000000000 +1000
+++ linux-2.6.21-rc4-mm1/include/linux/init_task.h	2007-03-26 09:36:43.000000000 +1000
@@ -131,8 +131,8 @@ extern struct group_info init_groups;
 	.active_mm	= &init_mm,					\
 	.run_list	= LIST_HEAD_INIT(tsk.run_list),			\
 	.ioprio		= 0,						\
-	.time_slice	= HZ,						\
-	.quota		= HZ,						\
+	.time_slice	= 1000000000,					\
+	.quota		= 1000000000,					\
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
 	.parent		= &tsk,						\
 	.children	= LIST_HEAD_INIT(tsk.children),			\
