---
 Documentation/sched-design.txt     |  167 +++++++++--------
 include/asm-generic/bitops/sched.h |   10 -
 include/asm-s390/bitops.h          |   12 +
 kernel/sched.c                     |  354 +++++++++++++++++++------------------
 4 files changed, 288 insertions(+), 255 deletions(-)

Index: linux-2.6.21-rc3-rsdl/Documentation/sched-design.txt
===================================================================
--- linux-2.6.21-rc3-rsdl.orig/Documentation/sched-design.txt	2007-03-11 23:42:19.000000000 +1100
+++ linux-2.6.21-rc3-rsdl/Documentation/sched-design.txt	2007-03-16 23:35:35.000000000 +1100
@@ -196,17 +196,16 @@ constraints of strict fairness.
 Design description
 ==================
 
-RSDL works off the principle of providing each task a quota of runtime that
-it is allowed to run at each priority level equal to its static priority
-(ie. its nice level) and every priority below that. When each task is queued,
-the cpu that it is queued onto also keeps a record of that quota. If the
-task uses up its quota it is decremented one priority level. Also, if the cpu
-notices a quota full has been used for that priority level, it pushes
-everything remaining at that priority level to the next lowest priority
-level. Once every runtime quota has been consumed of every priority level,
-a task is queued on the "expired" array. When no other tasks exist with
-quota, the expired array is activated and fresh quotas are handed out. This
-is all done in O(1).
+RSDL works off the principle of providing each task a quota of runtime that it
+is allowed to run at a number of priority levels determined by its static
+priority (ie. its nice level). When each task is queued, the cpu that it is
+queued onto also keeps a record of that quota. If the task uses up its quota it
+has its priority decremented to the next level. Also, if the cpu notices a full
+quota has been used for that priority level, it pushes everything remaining at
+that priority level to the next lowest priority level. Once every runtime quota
+has been consumed of every priority level, a task is queued on the "expired"
+array. When no other tasks exist with quota, the expired array is activated and
+fresh quotas are handed out. This is all done in O(1).
 
 
 Design details
@@ -227,22 +226,25 @@ on in p->rotation. It also keeps a recor
 already been allocated quota from during this epoch in a bitmap p->bitmap.
 
 The only tunable that determines all other details is the RR_INTERVAL. This
-is set to 6ms (minimum on 1000HZ, higher at different HZ values).
+is set to 8ms (minimum on 1000HZ, higher at different HZ values), and is
+scaled gently upwards with more cpus.
 
 All tasks are initially given a quota based on RR_INTERVAL. This is equal to
-RR_INTERVAL between nice values of 0 and 19, and progressively larger for
-nice values from -1 to -20. This is assigned to p->quota and only changes
-with changes in nice level.
-
-As a task is first queued, it checks in recalc_task_prio to see if it has
-run at this runqueue's current priority rotation. If it has not, it will
-have its p->prio level set to equal its p->static_prio (nice level) and will
-be given a p->time_slice equal to the p->quota, and has its allocation
-bitmap bit set in p->bitmap for its static priority (nice value). This
-quota is then also added to the current runqueue's rq->prio_quota[p->prio].
-It is then queued on the current active priority array.
+RR_INTERVAL between nice values of -6 and 19, and progressively larger for nice
+values from -7 to -20. This is to maintain a relationship of nice 19 having
+approximately 1/20th of the cpu of nice 0, and nice 0 having 1/20th the cpu of
+nice -20. This is assigned to p->quota and only changes with changes in nice
+level.
+
+As a task is first queued, it checks in recalc_task_prio to see if it has run at
+this runqueue's current priority rotation. If it has not, it will have its
+p->prio level set according to the first slot in a "priority matrix" and will be
+given a p->time_slice equal to the p->quota, and has its allocation bitmap bit
+set in p->bitmap for this prio level. This quota is then also added to the
+current runqueue's rq->prio_quota[p->prio]. It is then queued on the current
+active priority array.
 
-If a task has already been running during this major epoch, if it has
+If a task has already been running during this major epoch, and it has
 p->time_slice left and the rq->prio_quota for the task's p->prio still
 has quota, it will be placed back on the active array, but no more quota
 will be added to either the task or the runqueue quota.
@@ -260,12 +262,7 @@ any entitlement left in p->bitmap and no
 bitmap cleared, and be queued at its p->static_prio again, but on the expired
 priority array. No quota will be allocated until this task is scheduled.
 
-When a task is queued, it has its static_prio bit set in the current
-runqueue's rq->static_bitmap, and the relevant bit in the rq->dyn_bitmap.
-In order to minimise the number of bitmap lookups, the bitmap of queued
-tasks on the expired array is at the end of the same bitmap as the active
-array. The number of tasks queued at the current static_prio is kept in
-rq->prio_queued[].
+When a task is queued, it has its relevant bit set in the array->prio_bitmap.
 
 During a scheduler_tick where a task is running, the p->time_slice is
 decremented, and if it reaches zero then the recalc_task_prio is readjusted
@@ -280,7 +277,7 @@ A minor rotation takes the remaining tas
 merges them with a list_splice_tail with the queue from the next lowest
 priority level. At this time, any tasks that have been merged will now
 have invalid values in p->prio so this must be considered when dequeueing
-the task, and for testing for preemption.
+and scheduling the task.
 
 A major rotation takes the remaining tasks at this priority level queue and
 merges them with a list_splice_tail with the best priority task running on
@@ -293,16 +290,43 @@ When a task is dequeued, the dyn_bitmap 
 that the relevant queue is actually empty since p->prio may be inaccurate
 and no hard accounting of the number of tasks at that level is possible.
 
-When selecting a new task for scheduling, after the first dynamic bit is
-found on the dyn_bitmap, it is checked to see that a task is really queued
-at that priority or if it is a false positive due to the task being
-dequeued at a time when its p->prio does not match which queue it is on
-after some form of priority rotation. This is a rare occurrence as it tends
-to only occur if a task that is already waiting on a runqueue gets dequeued.
-If the bitmap value is in the expired array range, a major priority rotation
-is performed. If the chosen task has not been running during this major or
-minor rotation it has new quota allocated at this time, and added to the
-runqueue's quota.
+When selecting a new task for scheduling, after the first dynamic bit is found
+on the dyn_bitmap, it is checked to see that a task is really queued at that
+priority or if it is a false positive due to the task being dequeued at a time
+when its p->prio does not match which queue it is on after some form of priority
+rotation. This is a rare occurrence as it tends to only occur if a task that is
+already waiting on a runqueue gets dequeued. If no tasks remain on the active
+array, a major priority rotation is performed. If the chosen task has not been
+running during this major or minor rotation it has new quota allocated at this
+time, and added to the runqueue's quota.
+
+If a task finds itself merged at a priority level that it does not normally
+receive quota at (due to list merging) it will remove one of its normal
+priority slots to compensate.
+
+
+Priority Matrix
+===============
+
+In order to minimise the latencies between tasks of different nice levels
+running concurrently, the dynamic priority slots where different nice levels
+are queued are dithered instead of being sequential. What this means is that
+there are 40 priority slots where a task may run during one major rotation,
+and the allocation of slots is dependent on nice level. In the
+following table, a zero represents a slot where the task may run.
+
+nice -20 0000000000000000000000000000000000000000
+nice -10 1001000100100010001001000100010010001000
+nice   0 0101010101010101010101010101010101010101
+nice   5 1101011010110101101011010110101101011011
+nice  10 0110111011011101110110111011101101110111
+nice  15 0111110111111011111101111101111110111111
+nice  19 1111111111111111111011111111111111111111
+
+As can be seen, a nice -20 task runs in every priority slot whereas a nice 19
+task only runs one slot per major rotation. This dithered table allows for the
+smallest possible maximum latencies between tasks of varying nice levels, thus
+allowing vastly different nice levels to be used.
 
 
 Modelling deadline behaviour
@@ -315,16 +339,16 @@ conditions. This is a virtual deadline m
 runqueue epochs, and not by trying to keep complicated accounting of each
 task.
 
-The maximum duration a task can run during one major epoch is determined
-by its nice value. Nice 0 tasks can run at 19 different priority levels
-for RR_INTERVAL duration during each epoch (the equivalent of nice 0 to nice
-19). Nice 10 tasks can run at 9 priority levels for each epoch, and so on.
+The maximum duration a task can run during one major epoch is determined by its
+nice value. Nice 0 tasks can run at 19 different priority levels for RR_INTERVAL
+duration during each epoch. Nice 10 tasks can run at 9 priority levels for each
+epoch, and so on. The table in the priority matrix above demonstrates how this
+is enforced.
 
 Therefore the maximum duration a runqueue epoch can take is determined by
 the number of tasks running, and their nice level. After that, the maximum
 duration it can take before a task can wait before it get scheduled is
-determined by the difference between its nice value and the nice value of
-the highest priority task queued.
+determined by the position of its first slot on the matrix.
 
 In the following examples, these are _worst case scenarios_ and would rarely
 occur, but can be modelled nonetheless to determine the maximum possible
@@ -335,41 +359,21 @@ another is activated for the first time 
 runqueue rotation, the first task will wait:
 
 nr_tasks * max_duration + nice_difference * rr_interval
-1 * 19 * RR_INTERVAL + 0 = 114ms
+1 * 19 * RR_INTERVAL + 0 = 152ms
 
 In the presence of a nice 10 task, a nice 0 task would wait a maximum of
-1 * 10 * RR_INTERVAL + 0 = 60ms
+1 * 10 * RR_INTERVAL + 0 = 80ms
 
 In the presence of a nice 0 task, a nice 10 task would wait a maximum of
-1 * 19 * RR_INTERVAL + 9 * RR_INTERVAL = 168ms
+1 * 19 * RR_INTERVAL + 1 * RR_INTERVAL = 160ms
 
-Using a more complicated example, if there are 4 tasks running fully cpu
-bound, one each at nice -20, nice 0, nice 10 and nice 19, we can calculate
-the maximum latency possible for the nice 10 task. Note that -20 tasks are
-heavily biased for so this will be a long time, but can be modelled.
-
-The nice -20 task has quota = RR_INTERVAL + 20*RR_INTERVAL = 21*RR_INTERVAL.
-It can run at 39 priority levels so its maximum duration =
-39 * 21 * RR_INTERVAL.
-The nice 0 task works out to
-19 * RR_INTERVAL
-The nice 19 task works out to
-RR_INTERVAL.
-
-So major epoch can take up a maximum of
-39 * 21 * RR_INTERVAL + 19 * RR_INTERVAL + RR_INTERVAL = 1229 * RR_INTERVAL;
-
-Then before the nice 10 task will run, the nice -20 and nice 0 task will
-run for 28 * 21 * RR_INTERVAL and 9 * RR_INTERVAL respectively for a total
-of 597 * RR_INTERVAL.
-
-This means the maximum duration a nice 10 task can wait in the presence of
-these other tasks is 1826*RR_INTERVAL. This is a long time of course and is
-heavily penalised by the presence of nice -20 tasks which would not be part
-of a normal environment.
-
-While this section describes the maximum latency a task can have, this size
-latencies will only be seen by fully cpu bound tasks.
+More useful than these values, though, are the average latencies which are
+a matter of determining the average distance between priority slots of
+different nice values and multiplying them by the tasks' quota. For example
+in the presence of a nice -10 task, a nice 0 task will wait either one or
+two slots. Given that nice -10 tasks have a quota 2.5 times the RR_INTERVAL,
+this means the latencies will alternate between 2.5 and 5 RR_INTERVALs or
+20 and 40ms respectively (on uniprocessor at 1000HZ).
 
 
 Achieving interactivity
@@ -421,6 +425,13 @@ current task if it is not of a sleeping 
 low latency for interactive tasks, and the lowest latencies for the least
 cpu bound tasks.
 
+One of the potential disadvantages of a strict fairness design is that users
+may prefer a degree of unfairness towards certain tasks (such as a gui) and
+will notice the relative slowdown that occurs under load. As the dithered
+matrix minimises the latencies when differential nice levels are used, this
+can be countered by running a gui at a negative nice value such as -10 without
+causing adversely large latencies in nice 0 tasks.
+
 
-Wed, 28 Feb 2007
+Fri, 16 Mar 2007
 Con Kolivas <kernel@kolivas.org>
Index: linux-2.6.21-rc3-rsdl/include/asm-generic/bitops/sched.h
===================================================================
--- linux-2.6.21-rc3-rsdl.orig/include/asm-generic/bitops/sched.h	2007-03-11 23:42:19.000000000 +1100
+++ linux-2.6.21-rc3-rsdl/include/asm-generic/bitops/sched.h	2007-03-16 23:35:35.000000000 +1100
@@ -6,8 +6,8 @@
 
 /*
  * Every architecture must define this function. It's the fastest
- * way of searching a 180-bit bitmap where the first 100 bits are
- * unlikely to be set. It's guaranteed that at least one of the 180
+ * way of searching a 140-bit bitmap where the first 100 bits are
+ * unlikely to be set. It's guaranteed that at least one of the 140
  * bits is cleared.
  */
 static inline int sched_find_first_bit(const unsigned long *b)
@@ -15,7 +15,7 @@ static inline int sched_find_first_bit(c
 #if BITS_PER_LONG == 64
 	if (unlikely(b[0]))
 		return __ffs(b[0]);
-	if (b[1])
+	if (likely(b[1]))
 		return __ffs(b[1]) + 64;
 	return __ffs(b[2]) + 128;
 #elif BITS_PER_LONG == 32
@@ -27,9 +27,7 @@ static inline int sched_find_first_bit(c
 		return __ffs(b[2]) + 64;
 	if (b[3])
 		return __ffs(b[3]) + 96;
-	if (b[4])
-		return __ffs(b[4]) + 128;
-	return __ffs(b[5]) + 160;
+	return __ffs(b[4]) + 128;
 #else
 #error BITS_PER_LONG not defined
 #endif
Index: linux-2.6.21-rc3-rsdl/include/asm-s390/bitops.h
===================================================================
--- linux-2.6.21-rc3-rsdl.orig/include/asm-s390/bitops.h	2007-03-11 23:42:19.000000000 +1100
+++ linux-2.6.21-rc3-rsdl/include/asm-s390/bitops.h	2007-03-16 23:35:35.000000000 +1100
@@ -729,7 +729,17 @@ find_next_bit (const unsigned long * add
 	return offset + find_first_bit(p, size);
 }
 
-#include <asm-generic/bitops/sched.h>
+/*
+ * Every architecture must define this function. It's the fastest
+ * way of searching a 140-bit bitmap where the first 100 bits are
+ * unlikely to be set. It's guaranteed that at least one of the 140
+ * bits is cleared.
+ */
+static inline int sched_find_first_bit(const unsigned long *b)
+{
+	return find_first_bit(b, 140);
+}
+
 #include <asm-generic/bitops/ffs.h>
 
 #include <asm-generic/bitops/fls.h>
Index: linux-2.6.21-rc3-rsdl/kernel/sched.c
===================================================================
--- linux-2.6.21-rc3-rsdl.orig/kernel/sched.c	2007-03-12 10:48:58.000000000 +1100
+++ linux-2.6.21-rc3-rsdl/kernel/sched.c	2007-03-17 00:05:16.000000000 +1100
@@ -17,7 +17,7 @@
  *  2003-09-03	Interactivity tuning by Con Kolivas.
  *  2004-04-02	Scheduler domains code by Nick Piggin
  *  2007-03-02	Rotating Staircase deadline scheduling policy by Con Kolivas
- *		RSDL v0.30
+ *		RSDL v0.31
  */
 
 #include <linux/mm.h>
@@ -86,25 +86,34 @@ unsigned long long __attribute__((weak))
 #define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
 #define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
 #define SCHED_PRIO(p)		((p)+MAX_RT_PRIO)
-#define MAX_DYN_PRIO		(MAX_PRIO + PRIO_RANGE)
 
-/*
- * Preemption needs to take into account that a low priority task can be
- * at a higher prio due to list merging. Its priority is artificially
- * elevated and it should be preempted if anything higher priority wakes up
- * provided it is not a realtime comparison.
- */
-#define TASK_PREEMPTS_CURR(p, curr) \
-	(((p)->prio < (curr)->prio) || (!rt_task(p) && \
-		((p)->static_prio < (curr)->static_prio && \
-			((curr)->static_prio > (curr)->prio))))
+#define TASK_PREEMPTS_CURR(p, curr)	((p)->prio < (curr)->prio)
 
 /*
  * This is the time all tasks within the same priority round robin.
- * Set to a minimum of 6ms.
+ * Set to a minimum of 8ms. Scales with number of cpus and rounds with HZ.
  */
-#define RR_INTERVAL		((6 * HZ / 1001) + 1)
-#define DEF_TIMESLICE		(RR_INTERVAL * 20)
+static unsigned int rr_interval __read_mostly;
+#define RR_INTERVAL		8
+#define DEF_TIMESLICE		(rr_interval * 20)
+
+/*
+ * This contains a bitmap for each dynamic priority level with empty slots
+ * for the valid priorities each different nice level can have. It allows
+ * us to stagger the slots where differing priorities run in a way that
+ * keeps latency differences between different nice levels at a minimum.
+ * ie, where 0 means a slot for that priority, priority running from left to
+ * right:
+ * nice -20 0000000000000000000000000000000000000000
+ * nice -10 1001000100100010001001000100010010001000
+ * nice   0 0101010101010101010101010101010101010101
+ * nice   5 1101011010110101101011010110101101011011
+ * nice  10 0110111011011101110110111011101101110111
+ * nice  15 0111110111111011111101111101111110111111
+ * nice  19 1111111111111111111011111111111111111111
+ */
+static unsigned long prio_matrix[PRIO_RANGE][BITS_TO_LONGS(PRIO_RANGE)]
+				 __read_mostly;
 
 /*
  * These are the runqueue data structures:
@@ -112,6 +121,12 @@ unsigned long long __attribute__((weak))
 struct prio_array {
 	struct list_head queue[MAX_PRIO];
 	/* Tasks queued at each priority */
+
+	DECLARE_BITMAP(prio_bitmap, MAX_PRIO + 1);
+	/*
+	 * The bitmap of priorities queued; The dynamic bits can have
+	 * false positives. Include 1 bit for delimiter.
+	 */
 };
 
 /*
@@ -149,20 +164,6 @@ struct rq {
 	unsigned long next_balance;
 	struct mm_struct *prev_mm;
 
-	DECLARE_BITMAP(dyn_bitmap, MAX_DYN_PRIO + 1);
-	/*
-	 * The bitmap of priorities queued; The extra PRIO_RANGE at the end
-	 * is for a bitmap of expired tasks queued. This minimises the number
-	 * of bit lookups over prio_array swaps. The dynamic bits can have
-	 * false positives. Include 1 bit for delimiter.
-	 */
-
-	DECLARE_BITMAP(static_bitmap, MAX_PRIO);
-	/* The bitmap of all static priorities queued */
-
-	unsigned long prio_queued[MAX_PRIO];
-	/* The number of tasks at each static priority */
-
 	long prio_quota[PRIO_RANGE];
 	/*
 	 * The quota of ticks the runqueue runs at each dynamic priority
@@ -170,6 +171,7 @@ struct rq {
 	 */
 
 	struct prio_array *active, *expired, arrays[2];
+	unsigned long *dyn_bitmap, *exp_bitmap;
 
 	int prio_level;
 	/* The current dynamic priority level this runqueue is at */
@@ -622,34 +624,16 @@ static inline int task_queued(struct tas
 static inline void set_task_entitlement(struct task_struct *p)
 {
 	__set_bit(USER_PRIO(p->prio), p->bitmap);
-
-	/*
-	 * In the case this task has been part of a merged list that has
-	 * made it to higher priority than it should be, we remove the
-	 * quota from its own priority since it will get a quota at this
-	 * priority.
-	 */
-	if (p->normal_prio < p->static_prio)
-		__set_bit(USER_PRIO(p->static_prio), p->bitmap);
 	p->time_slice = p->quota;
 }
 
 /*
- * Only the static_bitmap has hard accounting. The dynamic bits can have
+ * There is no specific hard accounting. The dynamic bits can have
  * false positives. rt_tasks can only be on the active queue.
  */
 static inline void set_dynamic_bit(struct task_struct *p, struct rq *rq)
 {
-	if (p->array == rq->active)
-		__set_bit(p->prio, rq->dyn_bitmap);
-	else
-		__set_bit(p->prio + PRIO_RANGE, rq->dyn_bitmap);
-}
-
-static inline void set_queue_bits(struct rq *rq, struct task_struct *p)
-{
-	__set_bit(p->static_prio, rq->static_bitmap);
-	set_dynamic_bit(p, rq);
+	__set_bit(p->prio, p->array->prio_bitmap);
 }
 
 /*
@@ -661,15 +645,8 @@ static inline void set_queue_bits(struct
 static void dequeue_task(struct task_struct *p, struct rq *rq)
 {
 	list_del_init(&p->run_list);
-	if (!--rq->prio_queued[p->static_prio])
-		__clear_bit(p->static_prio, rq->static_bitmap);
-	if (list_empty(p->array->queue + p->prio)) {
-		int bitmap_prio = p->prio;
-
-		if (p->array == rq->expired)
-			bitmap_prio += PRIO_RANGE;
-		__clear_bit(bitmap_prio, rq->dyn_bitmap);
-	}
+	if (list_empty(p->array->queue + p->prio))
+		__clear_bit(p->prio, p->array->prio_bitmap);
 }
 
 /*
@@ -682,16 +659,31 @@ static inline void task_new_array(struct
 	p->rotation = rq->prio_rotation;
 }
 
-static inline void queue_expired(struct task_struct *p, struct rq *rq)
+static inline int first_prio_slot(struct task_struct *p)
+{
+	return SCHED_PRIO(find_first_zero_bit(
+		prio_matrix[USER_PRIO(p->static_prio)], PRIO_RANGE));
+}
+
+static inline int next_prio_slot(struct task_struct *p, int prio)
+{
+	DECLARE_BITMAP(tmp, PRIO_RANGE);
+	bitmap_or(tmp, p->bitmap, prio_matrix[USER_PRIO(p->static_prio)],
+		  PRIO_RANGE);
+	return SCHED_PRIO(find_next_zero_bit(tmp, PRIO_RANGE,
+		USER_PRIO(prio)));
+}
+
+static void queue_expired(struct task_struct *p, struct rq *rq)
 {
-	p->prio = p->normal_prio = p->static_prio;
 	p->array = rq->expired;
-	bitmap_zero(p->bitmap, PRIO_RANGE);
-	p->rotation = rq->prio_rotation;
+	task_new_array(p, rq);
+	p->prio = p->normal_prio = first_prio_slot(p);
 	p->time_slice = p->quota;
 }
 
 #define rq_quota(rq, prio)	((rq)->prio_quota[USER_PRIO(prio)])
+
 /*
  * recalc_task_prio determines what prio a non rt_task will be
  * queued at. If the task has already been running during this runqueue's
@@ -706,12 +698,23 @@ static inline void queue_expired(struct 
 static void recalc_task_prio(struct task_struct *p, struct rq *rq)
 {
 	struct prio_array *array = rq->active;
-	int queue_prio, search_prio;
+	int queue_prio, search_prio = MAX_RT_PRIO;
+
+	/*
+	 * SCHED_BATCH tasks never start at better priority than any other
+	 * task that is already running since they are flagged as latency
+	 * insensitive. This means they never cause greater latencies in other
+	 * non SCHED_BATCH tasks of the same nice level, but they still will
+	 * not be exposed to high latencies themselves.
+	 */
+	if (unlikely(p->policy == SCHED_BATCH))
+		search_prio = rq->prio_level;
 
 	if (p->rotation == rq->prio_rotation) {
 		if (p->array == array) {
 			if (p->time_slice && rq_quota(rq, p->prio))
 				return;
+			search_prio = p->prio;
 		} else if (p->array == rq->expired) {
 			queue_expired(p, rq);
 			return;
@@ -719,19 +722,9 @@ static void recalc_task_prio(struct task
 			task_new_array(p, rq);
 	} else
 		task_new_array(p, rq);
-	search_prio = p->static_prio;
 
-	/*
-	 * SCHED_BATCH tasks never start at better priority than any other
-	 * task that is already running since they are flagged as latency
-	 * insensitive. This means they never cause greater latencies in other
-	 * non SCHED_BATCH tasks of the same nice level.
-	 */
-	if (unlikely(p->policy == SCHED_BATCH))
-		search_prio = max(p->static_prio, rq->prio_level);
-	queue_prio = SCHED_PRIO(find_next_zero_bit(p->bitmap, PRIO_RANGE,
-		     USER_PRIO(search_prio)));
-	if (queue_prio == MAX_PRIO) {
+	queue_prio = next_prio_slot(p, search_prio);
+	if (queue_prio >= MAX_PRIO) {
 		queue_expired(p, rq);
 		return;
 	}
@@ -745,9 +738,7 @@ static void recalc_task_prio(struct task
  * Adding to a runqueue. The dynamic priority queue that it is added to is
  * determined by the priority rotation of the runqueue it is being added to
  * and the quota still available in the task in p->bitmap and p->time_slice
- * (see recalc_task_prio above). The rq static_bitmap stores a list of
- * the static priorities, and prio_queued the number of tasks stored at each
- * p->static_prio level.
+ * (see recalc_task_prio above).
  */
 static inline void __enqueue_task(struct task_struct *p, struct rq *rq)
 {
@@ -755,10 +746,9 @@ static inline void __enqueue_task(struct
 		p->array = rq->active;
 	else
 		recalc_task_prio(p, rq);
-	rq->prio_queued[p->static_prio]++;
 
 	sched_info_queued(p);
-	set_queue_bits(rq, p);
+	set_dynamic_bit(p, rq);
 }
 
 static void enqueue_task(struct task_struct *p, struct rq *rq)
@@ -780,13 +770,12 @@ static inline void enqueue_task_head(str
 static void requeue_task(struct task_struct *p, struct rq *rq,
 			 struct prio_array *old_array, int old_prio)
 {
+	if (p->array == rq->expired)
+		queue_expired(p, rq);
 	list_move_tail(&p->run_list, p->array->queue + p->prio);
 	if (!rt_task(p)) {
-		if (list_empty(old_array->queue + old_prio)) {
-			if (old_array == rq->expired)
-				old_prio += PRIO_RANGE;
-			__clear_bit(old_prio, rq->dyn_bitmap);
-		}
+		if (list_empty(old_array->queue + old_prio))
+			__clear_bit(old_prio, old_array->prio_bitmap);
 		set_dynamic_bit(p, rq);
 	}
 }
@@ -809,7 +798,7 @@ static inline unsigned int task_timeslic
 	unsigned int slice, rr;
 
 	slice = rr = p->quota;
-	if (likely(!rt_task(p)))
+	if (!rt_task(p))
 		slice += (PRIO_RANGE - 1 - TASK_USER_PRIO(p)) * rr;
 	return slice;
 }
@@ -824,7 +813,7 @@ static inline unsigned int task_timeslic
 	(((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
 #define TASK_LOAD_WEIGHT(p)	LOAD_WEIGHT(task_timeslice(p))
 #define RTPRIO_TO_LOAD_WEIGHT(rp)	\
-	(LOAD_WEIGHT((RR_INTERVAL + 20 + (rp))))
+	(LOAD_WEIGHT((rr_interval + 20 + (rp))))
 
 static void set_load_weight(struct task_struct *p)
 {
@@ -891,7 +880,10 @@ static inline int normal_prio(struct tas
 	if (has_rt_policy(p))
 		return MAX_RT_PRIO-1 - p->rt_priority;
 	/* Other tasks all have normal_prio set in recalc_task_prio */
-	return p->static_prio;
+	if (likely(p->prio >= MAX_RT_PRIO))
+		return p->prio;
+	else
+		return p->static_prio;
 }
 
 /*
@@ -914,17 +906,22 @@ static int effective_prio(struct task_st
 }
 
 /*
- * All tasks have quotas based on RR_INTERVAL. From nice 0 to 19 they are
- * all equal to it and below zero they get progressively larger making their
- * effective quota significantly larger. rt tasks all get RR_INTERVAL.
- */
-static unsigned int rr_interval(struct task_struct *p)
-{
-	int nice = TASK_NICE(p);
-
-	if (nice < 0 && !rt_task(p))
-		return RR_INTERVAL * (20 - nice) / 20;
-	return RR_INTERVAL;
+ * All tasks have quotas based on rr_interval. From nice -6 to 19 they are
+ * all equal to it and below -6 they get quadratically larger making their
+ * effective quota significantly larger. rt tasks all get rr_interval.
+ * ie nice -6..19 = rr_interval. nice -10 = 2.5 * rr_interval
+ * nice -20 = 10 * rr_interval. This makes the ratios between -20 and 0
+ * similar to the ratios between 0 and +19.
+ */
+static unsigned int rr_quota(struct task_struct *p)
+{
+	int neg_nice = -TASK_NICE(p), rr = rr_interval;
+
+	if (neg_nice > 6 && !rt_task(p)) {
+		rr *= neg_nice * neg_nice;
+		rr /= 40;
+	}
+	return rr;
 }
 
 /*
@@ -954,7 +951,7 @@ static void activate_task(struct task_st
 				     (now - p->timestamp) >> 20);
 	}
 
-	p->quota = rr_interval(p);
+	p->quota = rr_quota(p);
 	p->prio = effective_prio(p);
 	p->timestamp = now;
 	__activate_task(p, rq);
@@ -2076,6 +2073,17 @@ int can_migrate_task(struct task_struct 
 	return 1;
 }
 
+static inline int rq_best_prio(struct rq *rq)
+{
+	int best_prio, exp_prio;
+
+	best_prio = sched_find_first_bit(rq->dyn_bitmap);
+	exp_prio = find_next_bit(rq->exp_bitmap, MAX_PRIO, MAX_RT_PRIO);
+	if (unlikely(best_prio > exp_prio))
+		best_prio = exp_prio;
+	return best_prio;
+}
+
 /*
  * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
  * load from busiest to this_rq, as part of a balancing operation within
@@ -2088,7 +2096,7 @@ static int move_tasks(struct rq *this_rq
 		      struct sched_domain *sd, enum idle_type idle,
 		      int *all_pinned)
 {
-	int idx, test_idx, pulled = 0, pinned = 0, this_best_prio, best_prio,
+	int idx, pulled = 0, pinned = 0, this_best_prio, best_prio,
 	    best_prio_seen, skip_for_load;
 	struct prio_array *array;
 	struct list_head *head, *curr;
@@ -2100,8 +2108,8 @@ static int move_tasks(struct rq *this_rq
 
 	rem_load_move = max_load_move;
 	pinned = 1;
-	this_best_prio = this_rq->curr->prio;
-	best_prio = busiest->curr->prio;
+	this_best_prio = rq_best_prio(this_rq);
+	best_prio = rq_best_prio(busiest);
 	/*
 	 * Enable handling of the case where there is more than one task
 	 * with the best priority.   If the current running task is one
@@ -2115,33 +2123,27 @@ static int move_tasks(struct rq *this_rq
 	 * We first consider expired tasks. Those will likely not be
 	 * executed in the near future, and they are most likely to
 	 * be cache-cold, thus switching CPUs has the least effect
-	 * on them. This is done by starting the search at priority
-	 * MAX_PRIO since expired bits are MAX_PRIO...MAX_DYN_PRIO-1
+	 * on them.
 	 */
 	array = busiest->expired;
-	test_idx = MAX_PRIO;
+new_array:
+	/* Start searching at priority 0: */
+	idx = 0;
 skip_bitmap:
-	if (!test_idx)
-		idx = sched_find_first_bit(busiest->dyn_bitmap);
+	if (!idx)
+		idx = sched_find_first_bit(array->prio_bitmap);
 	else
-		idx = find_next_bit(busiest->dyn_bitmap, MAX_DYN_PRIO,
-		      test_idx);
-	if (idx >= MAX_DYN_PRIO) {
+		idx = find_next_bit(array->prio_bitmap, MAX_PRIO, idx);
+	if (idx >= MAX_PRIO) {
 		if (array == busiest->expired) {
 			array = busiest->active;
-			test_idx = 0;
-			goto skip_bitmap;
+			goto new_array;
 		}
 		goto out;
 	}
-	test_idx = idx;
-	if (idx >= MAX_PRIO) {
-		if (array == busiest->active)
-			goto out;
-		idx -= PRIO_RANGE;
-	}
-	if (list_empty(array->queue + idx)) {
-		__clear_bit(test_idx, busiest->dyn_bitmap);
+
+	if (unlikely(list_empty(array->queue + idx))) {
+		__clear_bit(idx, array->prio_bitmap);
 		goto skip_bitmap;
 	}
 
@@ -2166,7 +2168,7 @@ skip_queue:
 		best_prio_seen |= idx == best_prio;
 		if (curr != head)
 			goto skip_queue;
-		test_idx++;
+		idx++;
 		goto skip_bitmap;
 	}
 
@@ -2183,7 +2185,7 @@ skip_queue:
 			this_best_prio = idx;
 		if (curr != head)
 			goto skip_queue;
-		test_idx++;
+		idx++;
 		goto skip_bitmap;
 	}
 out:
@@ -3076,10 +3078,9 @@ static inline void major_prio_rotation(s
 
 	rq->expired = rq->active;
 	rq->active = new_array;
+	rq->exp_bitmap = rq->expired->prio_bitmap;
+	rq->dyn_bitmap = rq->active->prio_bitmap;
 	rq->prio_rotation++;
-	bitmap_zero(rq->dyn_bitmap, MAX_DYN_PRIO);
-	bitmap_copy(rq->dyn_bitmap, rq->static_bitmap, MAX_PRIO);
-	__set_bit(MAX_DYN_PRIO, rq->dyn_bitmap);
 }
 
 /*
@@ -3110,14 +3111,12 @@ static inline void rotate_runqueue_prior
 		struct prio_array *new_queue = rq->expired;
 
 		/*
-		 * The static_bitmap gives us the highest p->static prio task
-		 * that is queued. This value is used as the prio after
-		 * the major rotation and all tasks remaining on this
-		 * active queue are moved there. This means tasks can end
-		 * up a p->prio better than their p->static_prio.
+		 * On a major rotation we move everything remaining to best
+		 * priority on the new array. The priority matrix bitmap will
+		 * ensure tasks only get the slots each static priority
+		 * deserves.
 		 */
-		new_prio_level = find_next_bit(rq->static_bitmap, MAX_PRIO,
-				 MAX_RT_PRIO);
+		new_prio_level = MAX_RT_PRIO;
 		if (!list_empty(array->queue + rq->prio_level)) {
 			list_splice_tail_init(array->queue + rq->prio_level,
 					 new_queue->queue + new_prio_level);
@@ -3240,39 +3239,23 @@ EXPORT_SYMBOL(sub_preempt_count);
 
 #endif
 
+/* Is a dynamic_prio part of the allocated slots for this static_prio? */
+static inline int entitled_slot(int static_prio, int dynamic_prio)
+{
+	return !test_bit(USER_PRIO(dynamic_prio),
+		prio_matrix[USER_PRIO(static_prio)]);
+}
+
 /*
- * Leave this debugging in until we are certain all bitmap manipulations are
- * working as desired since we can safely get out of this situation.
+ * If a task is queued at a priority that isn't one of its entitled slots, we
+ * exchange it by setting the bit of its next priority slot in its bitmap.
  */
-static noinline int rq_bitmap_error(struct rq *rq)
+static inline void exchange_slot(struct task_struct *p, int prio)
 {
-	static int bitmap_error = 0;
-	struct prio_array *array;
-	struct list_head *queue;
-	int idx, test_idx;
+	int slot = next_prio_slot(p, prio);
 
-	printk(KERN_ERR
-	       "SCHEDULER BITMAP ERROR %d - attempting to reconstruct...\n",
-	       ++bitmap_error);
-	for (test_idx = MAX_RT_PRIO ; test_idx < MAX_DYN_PRIO ; test_idx++) {
-		if (test_idx < MAX_PRIO) {
-			idx = test_idx;
-			array = rq->active;
-		} else {
-			idx = test_idx - PRIO_RANGE;
-			array = rq->expired;
-		}
-		queue = array->queue + idx;
-		if (!list_empty(queue)) {
-			if (!test_bit(test_idx, rq->dyn_bitmap)) {
-				__set_bit(test_idx, rq->dyn_bitmap);
-			}
-		}
-	}
-	idx = find_next_bit(rq->dyn_bitmap, MAX_DYN_PRIO, MAX_RT_PRIO);
-	/* We hit a real bug. There is no way out of this */
-	BUG_ON(idx == MAX_DYN_PRIO);
-	return idx;
+	if (slot < MAX_PRIO)
+		__set_bit(USER_PRIO(slot), p->bitmap);
 }
 
 /*
@@ -3285,18 +3268,18 @@ static inline struct task_struct *next_d
 	struct task_struct *next;
 	struct list_head *queue;
 	struct prio_array *array = rq->active;
+	int expirations = 0;
 
 retry:
-	if (unlikely(idx == MAX_DYN_PRIO))
-		idx = rq_bitmap_error(rq);
 	if (idx >= MAX_PRIO) {
+		BUG_ON(++expirations > 1);
 		/*
 		 * We have selected a bit from the expired range so there are
 		 * no more tasks in the active array.
 		 */
 		major_prio_rotation(rq);
 		array = rq->active;
-		idx -= PRIO_RANGE;
+		idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO);
 	}
 	if (unlikely(list_empty(array->queue + idx))) {
 		/*
@@ -3306,7 +3289,7 @@ retry:
 		 * interim. A very rare occurrence.
 		 */
 		__clear_bit(idx, rq->dyn_bitmap);
-		idx = find_next_bit(rq->dyn_bitmap, MAX_DYN_PRIO, idx + 1);
+		idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, idx + 1);
 		goto retry;
 	}
 	queue = array->queue + idx;
@@ -3319,10 +3302,14 @@ retry:
 	if (next->rotation != rq->prio_rotation) {
 			/* Task has moved during major rotation */
 			task_new_array(next, rq);
+			if (!entitled_slot(next->static_prio, idx))
+				exchange_slot(next, idx);
 			set_task_entitlement(next);
 			rq_quota(rq, idx) += next->quota;
 	} else if (!test_bit(USER_PRIO(idx), next->bitmap)) {
 			/* Task has moved during minor rotation */
+			if (!entitled_slot(next->static_prio, idx))
+				exchange_slot(next, idx);
 			set_task_entitlement(next);
 			rq_quota(rq, idx) += next->quota;
 	}
@@ -3934,6 +3921,7 @@ void set_user_nice(struct task_struct *p
 			resched_task(rq->curr);
 	}
 out_unlock:
+	p->quota = rr_quota(p);
 	task_rq_unlock(rq, &flags);
 }
 EXPORT_SYMBOL(set_user_nice);
@@ -6735,6 +6723,26 @@ int in_sched_functions(unsigned long add
 void __init sched_init(void)
 {
 	int i, j, k;
+	unsigned int rr_us = 0, rr_inc = RR_INTERVAL * 1000;
+
+	/* Generate the priority matrix */
+	for (i = 0; i < PRIO_RANGE; i++) {
+		if (i < 20) {
+			bitmap_zero(prio_matrix[i] , PRIO_RANGE);
+			j = PRIO_RANGE * PRIO_RANGE / (i + 1);
+			for (k = j; k < PRIO_RANGE * PRIO_RANGE; k += j)
+				__set_bit(k / PRIO_RANGE, prio_matrix[i]);
+		} else if (i == 20) {
+			bitmap_fill(prio_matrix[i], PRIO_RANGE);
+			for (k = 1; k < PRIO_RANGE; k += 2)
+				__clear_bit(k, prio_matrix[i]);
+		} else {
+			bitmap_fill(prio_matrix[i], PRIO_RANGE);
+			j = PRIO_RANGE * PRIO_RANGE / (PRIO_RANGE - i + 1);
+			for (k = j; k < PRIO_RANGE * PRIO_RANGE; k += j)
+				__clear_bit(k / PRIO_RANGE, prio_matrix[i]);
+		}
+	}
 
 	for_each_possible_cpu(i) {
 		struct prio_array *array;
@@ -6748,6 +6756,8 @@ void __init sched_init(void)
 		rq->prio_level = MAX_RT_PRIO;
 		rq->active = rq->arrays;
 		rq->expired = rq->arrays + 1;
+		rq->dyn_bitmap = rq->active->prio_bitmap;
+		rq->exp_bitmap = rq->expired->prio_bitmap;
 
 #ifdef CONFIG_SMP
 		rq->sd = NULL;
@@ -6766,14 +6776,18 @@ void __init sched_init(void)
 			array = rq->arrays + j;
 			for (k = 0; k < MAX_PRIO; k++)
 				INIT_LIST_HEAD(array->queue + k);
+			bitmap_zero(array->prio_bitmap, MAX_PRIO);
+			/* delimiter for bitsearch */
+			__set_bit(MAX_PRIO, array->prio_bitmap);
 		}
 		for (k = 0; k < PRIO_RANGE; k++)
 			rq->prio_quota[k] = 0;
-		bitmap_zero(rq->dyn_bitmap, MAX_DYN_PRIO);
-		bitmap_zero(rq->static_bitmap, MAX_PRIO);
-		/* delimiter for bitsearch */
-		__set_bit(MAX_DYN_PRIO, rq->dyn_bitmap);
+
+		/* Every added cpu increases the rr_interval */
+		rr_us += rr_inc;
+		rr_inc /= 2;
 	}
+	rr_interval = rr_us / 1000 ? : 1;
 
 	set_load_weight(&init_task);
 
