---
 Documentation/sysctl/kernel.txt |   12 +++
 kernel/sched.c                  |  123 ++++++++++++++++++++++++----------------
 kernel/sysctl.c                 |   25 ++++++--
 3 files changed, 105 insertions(+), 55 deletions(-)

Index: linux-2.6.20.3-rsdl/Documentation/sysctl/kernel.txt
===================================================================
--- linux-2.6.20.3-rsdl.orig/Documentation/sysctl/kernel.txt	2007-02-05 22:51:59.000000000 +1100
+++ linux-2.6.20.3-rsdl/Documentation/sysctl/kernel.txt	2007-03-22 13:19:15.000000000 +1100
@@ -43,6 +43,7 @@ show up in /proc/sys/kernel:
 - printk
 - real-root-dev               ==> Documentation/initrd.txt
 - reboot-cmd                  [ SPARC only ]
+- rr_interval
 - rtsig-max
 - rtsig-nr
 - sem
@@ -288,6 +289,17 @@ rebooting. ???
 
 ==============================================================
 
+rr_interval:
+
+This is the smallest duration that any cpu process scheduling unit
+will run for. Increasing this value can increase throughput of cpu
+bound tasks substantially but at the expense of increased latencies
+overall. This value is in _ticks_ and the default value chosen depends
+on the number of cpus available at scheduler initialisation. Valid
+values are from 1 to 100.
+
+==============================================================
+
 rtsig-max & rtsig-nr:
 
 The file rtsig-max can be used to tune the maximum number
Index: linux-2.6.20.3-rsdl/kernel/sched.c
===================================================================
--- linux-2.6.20.3-rsdl.orig/kernel/sched.c	2007-03-17 00:03:17.000000000 +1100
+++ linux-2.6.20.3-rsdl/kernel/sched.c	2007-03-22 13:20:17.000000000 +1100
@@ -17,7 +17,7 @@
  *  2003-09-03	Interactivity tuning by Con Kolivas.
  *  2004-04-02	Scheduler domains code by Nick Piggin
  *  2007-03-02	Rotating Staircase deadline scheduling policy by Con Kolivas
- *		RSDL v0.31
+ *		RSDL v0.32
  */
 
 #include <linux/mm.h>
@@ -82,8 +82,10 @@
 /*
  * This is the time all tasks within the same priority round robin.
  * Set to a minimum of 8ms. Scales with number of cpus and rounds with HZ.
+ * Tunable via /proc/sys/kernel/rr_interval.
  */
-static unsigned int rr_interval __read_mostly;
+int rr_interval __read_mostly;
+
 #define RR_INTERVAL		8
 #define DEF_TIMESLICE		(rr_interval * 20)
 
@@ -160,8 +162,11 @@ struct rq {
 	struct prio_array *active, *expired, arrays[2];
 	unsigned long *dyn_bitmap, *exp_bitmap;
 
-	int prio_level;
-	/* The current dynamic priority level this runqueue is at */
+	int prio_level, best_static_prio;
+	/*
+	 * The current dynamic priority level this runqueue is at, and the
+	 * best static priority queued this major rotation.
+	 */
 
 	unsigned long prio_rotation;
 	/* How many times we have rotated the priority queue */
@@ -646,19 +651,40 @@ static inline void task_new_array(struct
 	p->rotation = rq->prio_rotation;
 }
 
+/* Find the first slot from the relevant prio_matrix entry */
 static inline int first_prio_slot(struct task_struct *p)
 {
 	return SCHED_PRIO(find_first_zero_bit(
 		prio_matrix[USER_PRIO(p->static_prio)], PRIO_RANGE));
 }
 
-static inline int next_prio_slot(struct task_struct *p, int prio)
+/* Is a dynamic_prio part of the allocated slots for this static_prio */
+static inline int entitled_slot(int static_prio, int dynamic_prio)
+{
+	return !test_bit(USER_PRIO(dynamic_prio),
+		prio_matrix[USER_PRIO(static_prio)]);
+}
+
+/*
+ * Find this task's next unused slot. With a better static_prio than any
+ * task run this major rotation (best_static_prio) it takes its first unused
+ * slot, so prio_level is not kept low by niced tasks; otherwise it searches
+ * its prio_matrix slots from prio_level. SCHED_BATCH tasks always do the
+ * latter so they do not induce latencies in tasks of any nice level.
+ */
+static inline int next_entitled_slot(struct task_struct *p, struct rq *rq)
 {
-	DECLARE_BITMAP(tmp, PRIO_RANGE);
-	bitmap_or(tmp, p->bitmap, prio_matrix[USER_PRIO(p->static_prio)],
-		  PRIO_RANGE);
-	return SCHED_PRIO(find_next_zero_bit(tmp, PRIO_RANGE,
-		USER_PRIO(prio)));
+	if (p->static_prio < rq->best_static_prio && p->policy != SCHED_BATCH)
+		return SCHED_PRIO(find_first_zero_bit(p->bitmap, PRIO_RANGE));
+	else {
+		DECLARE_BITMAP(tmp, PRIO_RANGE);
+
+		bitmap_or(tmp, p->bitmap,
+			  prio_matrix[USER_PRIO(p->static_prio)],
+			  PRIO_RANGE);
+		return SCHED_PRIO(find_next_zero_bit(tmp, PRIO_RANGE,
+			USER_PRIO(rq->prio_level)));
+	}
 }
 
 static void queue_expired(struct task_struct *p, struct rq *rq)
@@ -685,23 +711,12 @@ static void queue_expired(struct task_st
 static void recalc_task_prio(struct task_struct *p, struct rq *rq)
 {
 	struct prio_array *array = rq->active;
-	int queue_prio, search_prio = MAX_RT_PRIO;
-
-	/*
-	 * SCHED_BATCH tasks never start at better priority than any other
-	 * task that is already running since they are flagged as latency
-	 * insensitive. This means they never cause greater latencies in other
-	 * non SCHED_BATCH tasks of the same nice level, but they still will
-	 * not be exposed to high latencies themselves.
-	 */
-	if (unlikely(p->policy == SCHED_BATCH))
-		search_prio = rq->prio_level;
+	int queue_prio;
 
 	if (p->rotation == rq->prio_rotation) {
 		if (p->array == array) {
 			if (p->time_slice && rq_quota(rq, p->prio))
 				return;
-			search_prio = p->prio;
 		} else if (p->array == rq->expired) {
 			queue_expired(p, rq);
 			return;
@@ -710,7 +725,7 @@ static void recalc_task_prio(struct task
 	} else
 		task_new_array(p, rq);
 
-	queue_prio = next_prio_slot(p, search_prio);
+	queue_prio = next_entitled_slot(p, rq);
 	if (queue_prio >= MAX_PRIO) {
 		queue_expired(p, rq);
 		return;
@@ -867,7 +882,7 @@ static inline int normal_prio(struct tas
 	if (has_rt_policy(p))
 		return MAX_RT_PRIO-1 - p->rt_priority;
 	/* Other tasks all have normal_prio set in recalc_task_prio */
-	if (likely(p->prio >= MAX_RT_PRIO))
+	if (likely(p->prio >= MAX_RT_PRIO && p->prio < MAX_PRIO))
 		return p->prio;
 	else
 		return p->static_prio;
@@ -902,10 +917,10 @@ static int effective_prio(struct task_st
  */
 static unsigned int rr_quota(struct task_struct *p)
 {
-	int neg_nice = -TASK_NICE(p), rr = rr_interval;
+	int nice = TASK_NICE(p), rr = rr_interval;
 
-	if (neg_nice > 6 && !rt_task(p)) {
-		rr *= neg_nice * neg_nice;
+	if (nice < -6 && !rt_task(p)) {
+		rr *= nice * nice;
 		rr /= 40;
 	}
 	return rr;
@@ -1522,7 +1537,7 @@ int fastcall wake_up_state(struct task_s
 	return try_to_wake_up(p, state, 0);
 }
 
-static void task_running_tick(struct rq *rq, struct task_struct *p);
+static void task_running_tick(struct rq *rq, struct task_struct *p, int tick);
 /*
  * Perform scheduler related setup for a newly forked process p.
  * p is forked by current.
@@ -1584,7 +1599,7 @@ void fastcall sched_fork(struct task_str
 		 * a problem.
 		 */
 		current->time_slice = 1;
-		task_running_tick(cpu_rq(cpu), current);
+		task_running_tick(cpu_rq(cpu), current, 0);
 	}
 	local_irq_enable();
 out:
@@ -1659,14 +1674,16 @@ void fastcall wake_up_new_task(struct ta
  */
 void fastcall sched_exit(struct task_struct *p)
 {
+	struct task_struct *parent;
 	unsigned long flags;
 	struct rq *rq;
 
-	rq = task_rq_lock(p->parent, &flags);
-	if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
-		p->parent->time_slice += p->time_slice;
-		if (unlikely(p->parent->time_slice > p->quota))
-			p->parent->time_slice = p->quota;
+	parent = p->parent;
+	rq = task_rq_lock(parent, &flags);
+	if (p->first_time_slice && task_cpu(p) == task_cpu(parent)) {
+		parent->time_slice += p->time_slice;
+		if (unlikely(parent->time_slice > parent->quota))
+			parent->time_slice = parent->quota;
 	}
 	task_rq_unlock(rq, &flags);
 }
@@ -3075,6 +3092,7 @@ static inline void major_prio_rotation(s
 	rq->active = new_array;
 	rq->exp_bitmap = rq->expired->prio_bitmap;
 	rq->dyn_bitmap = rq->active->prio_bitmap;
+	rq->best_static_prio = MAX_PRIO;
 	rq->prio_rotation++;
 }
 
@@ -3138,7 +3156,7 @@ static inline void rotate_runqueue_prior
 	rq_quota(rq, new_prio_level) += 1;
 }
 
-static void task_running_tick(struct rq *rq, struct task_struct *p)
+static void task_running_tick(struct rq *rq, struct task_struct *p, int tick)
 {
 	if (unlikely(!task_queued(p))) {
 		/* Task has expired but was not scheduled yet */
@@ -3161,6 +3179,13 @@ static void task_running_tick(struct rq 
 	if (!--p->time_slice)
 		task_expired_entitlement(rq, p);
 	/*
+	 * When called from outside scheduler_tick (tick == 0) we are only
+	 * fixing up accounting across fork and must not deduct anything
+	 * from rq_quota.
+	 */
+	if (!tick)
+		goto out_unlock;
+	/*
 	 * We only employ the deadline mechanism if we run over the quota.
 	 * It allows aliasing problems around the scheduler_tick to be
 	 * less harmful.
@@ -3171,6 +3196,7 @@ static void task_running_tick(struct rq 
 		rotate_runqueue_priority(rq);
 		set_tsk_need_resched(p);
 	}
+out_unlock:
 	spin_unlock(&rq->lock);
 }
 
@@ -3191,7 +3217,7 @@ void scheduler_tick(void)
 		/* Task on the idle queue */
 		wake_priority_sleeper(rq);
 	else
-		task_running_tick(rq, p);
+		task_running_tick(rq, p, 1);
 #ifdef CONFIG_SMP
 	update_load(rq);
 	if (time_after_eq(jiffies, rq->next_balance))
@@ -3373,20 +3399,13 @@ EXPORT_SYMBOL(sub_preempt_count);
 
 #endif
 
-/* Is a dynamic_prio part of the allocated slots for this static_prio */
-static inline int entitled_slot(int static_prio, int dynamic_prio)
-{
-	return !test_bit(USER_PRIO(dynamic_prio),
-		prio_matrix[USER_PRIO(static_prio)]);
-}
-
 /*
  * If a task is queued at a priority that isn't from its bitmap we exchange
  * by setting one of the entitlement bits.
  */
-static inline void exchange_slot(struct task_struct *p, int prio)
+static inline void exchange_slot(struct task_struct *p, struct rq *rq)
 {
-	int slot = next_prio_slot(p, prio);
+	int slot = next_entitled_slot(p, rq);
 
 	if (slot < MAX_PRIO)
 		__set_bit(USER_PRIO(slot), p->bitmap);
@@ -3428,6 +3447,7 @@ retry:
 	}
 	queue = array->queue + idx;
 	next = list_entry(queue->next, struct task_struct, run_list);
+	rq->prio_level = idx;
 	/*
 	 * When the task is chosen it is checked to see if its quota has been
 	 * added to this runqueue level which is only performed once per
@@ -3437,17 +3457,16 @@ retry:
 			/* Task has moved during major rotation */
 			task_new_array(next, rq);
 			if (!entitled_slot(next->static_prio, idx))
-				exchange_slot(next, idx);
+				exchange_slot(next, rq);
 			set_task_entitlement(next);
 			rq_quota(rq, idx) += next->quota;
 	} else if (!test_bit(USER_PRIO(idx), next->bitmap)) {
 			/* Task has moved during minor rotation */
 			if (!entitled_slot(next->static_prio, idx))
-				exchange_slot(next, idx);
+				exchange_slot(next, rq);
 			set_task_entitlement(next);
 			rq_quota(rq, idx) += next->quota;
 	}
-	rq->prio_level = idx;
 	/*
 	 * next needs to have its prio and array reset here in case the
 	 * values are wrong due to priority rotation.
@@ -3540,8 +3559,13 @@ need_resched_nonpreemptible:
 	if (rq->nr_running == 1 && dependent_sleeper(cpu, rq, next))
 		next = rq->idle;
 switch_tasks:
-	if (next == rq->idle)
+	if (next == rq->idle) {
+		rq->best_static_prio = MAX_PRIO;
+		rq->prio_level = MAX_RT_PRIO;
+		rq->prio_rotation++;
 		schedstat_inc(rq, sched_goidle);
+	} else if (next->static_prio < rq->best_static_prio)
+		rq->best_static_prio = next->static_prio;
 	prefetch(next);
 	prefetch_stack(next);
 	clear_tsk_need_resched(prev);
@@ -6892,6 +6916,7 @@ void __init sched_init(void)
 		lockdep_set_class(&rq->lock, &rq->rq_lock_key);
 		rq->nr_running = 0;
 		rq->prio_rotation = 0;
+		rq->best_static_prio = MAX_PRIO;
 		rq->prio_level = MAX_RT_PRIO;
 		rq->active = rq->arrays;
 		rq->expired = rq->arrays + 1;
Index: linux-2.6.20.3-rsdl/kernel/sysctl.c
===================================================================
--- linux-2.6.20.3-rsdl.orig/kernel/sysctl.c	2007-02-05 22:52:04.000000000 +1100
+++ linux-2.6.20.3-rsdl/kernel/sysctl.c	2007-03-22 13:21:26.000000000 +1100
@@ -76,6 +76,7 @@ extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
 extern int compat_log;
+extern int rr_interval;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
@@ -228,6 +229,13 @@ static void register_proc_table(ctl_tabl
 static void unregister_proc_table(ctl_table *, struct proc_dir_entry *);
 #endif
 
+/* Constants for minimum and maximum testing in kern_table and vm_table.
+   We use these as one-element integer vectors. */
+static int  __read_mostly zero;
+static int  __read_mostly one = 1;
+static int  __read_mostly one_hundred = 100;
+
+
 /* The default sysctl tables: */
 
 static ctl_table root_table[] = {
@@ -676,6 +684,17 @@ static ctl_table kern_table[] = {
 		.mode		= 0444,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "rr_interval",
+		.data		= &rr_interval,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &one,
+		.extra2		= &one_hundred,
+	},
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 	{
 		.ctl_name       = KERN_UNKNOWN_NMI_PANIC,
@@ -784,12 +803,6 @@ static ctl_table kern_table[] = {
 	{ .ctl_name = 0 }
 };
 
-/* Constants for minimum and maximum testing in vm_table.
-   We use these as one-element integer vectors. */
-static int zero;
-static int one_hundred = 100;
-
-
 static ctl_table vm_table[] = {
 	{
 		.ctl_name	= VM_OVERCOMMIT_MEMORY,
