Modify the priority bitmaps of the different nice levels so that they are
dithered, minimising the latency that is likely when different nice levels
are in use. This allows relatively niced tasks that use little cpu to still
get low latency in the presence of less niced tasks.

Fix the accounting on negative nice levels so that it is not scaled by HZ.

Signed-off-by: Con Kolivas <kernel@kolivas.org>

---
 include/linux/sched.h |    1 
 kernel/sched.c        |  360 +++++++++++++++++++++++++-------------------------
 2 files changed, 185 insertions(+), 176 deletions(-)

Index: linux-2.6.21-rc3-mm2/kernel/sched.c
===================================================================
--- linux-2.6.21-rc3-mm2.orig/kernel/sched.c	2007-03-15 22:05:51.000000000 +1100
+++ linux-2.6.21-rc3-mm2/kernel/sched.c	2007-03-15 23:44:32.000000000 +1100
@@ -87,25 +87,35 @@ unsigned long long __attribute__((weak))
 #define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
 #define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
 #define SCHED_PRIO(p)		((p)+MAX_RT_PRIO)
-#define MAX_DYN_PRIO		(MAX_PRIO + PRIO_RANGE)
+#define EXP_BITSIZE		(MAX_RT_PRIO + PRIO_RANGE / 2 + 1)
 
-/*
- * Preemption needs to take into account that a low priority task can be
- * at a higher prio due to list merging. Its priority is artificially
- * elevated and it should be preempted if anything higher priority wakes up
- * provided it is not a realtime comparison.
- */
-#define TASK_PREEMPTS_CURR(p, curr) \
-	(((p)->prio < (curr)->prio) || (!rt_task(p) && \
-		((p)->static_prio < (curr)->static_prio && \
-			((curr)->static_prio > (curr)->prio))))
+#define TASK_PREEMPTS_CURR(p, curr)	((p)->prio < (curr)->prio)
 
 /*
  * This is the time all tasks within the same priority round robin.
- * Set to a minimum of 6ms.
+ * Set to a minimum of 6ms. Scales with number of cpus and rounds with HZ.
  */
-#define RR_INTERVAL		((6 * HZ / 1001) + 1)
+#define RR_INTERVAL		6
 #define DEF_TIMESLICE		(RR_INTERVAL * 20)
+static unsigned int rr_interval __read_mostly;
+
+/*
+ * This contains a bitmap for each dynamic priority level with empty slots
+ * for the valid priorities each different nice level can have. It allows
+ * us to stagger the slots where differing priorities run in a way that
+ * keeps latency differences between different nice levels at a minimum.
+ * ie, where 0 means a slot for that priority, priority running from left to
+ * right:
+ * nice -20 0000000000000000000000000000000000000000
+ * nice -10 1001000100100010001001000100010010001000
+ * nice   0 0101010101010101010101010101010101010101
+ * nice   5 1101011010110101101011010110101101011011
+ * nice  10 0110111011011101110110111011101101110111
+ * nice  15 0111110111111011111101111101111110111111
+ * nice  19 1111111111111111111011111111111111111111
+  */
+static unsigned long prio_matrix[PRIO_RANGE][BITS_TO_LONGS(PRIO_RANGE)]
+				 __read_mostly;
 
 #ifdef CONFIG_SMP
 /*
@@ -134,6 +144,12 @@ static inline void sg_inc_cpu_power(stru
 struct prio_array {
 	struct list_head queue[MAX_PRIO];
 	/* Tasks queued at each priority */
+
+	DECLARE_BITMAP(prio_bitmap, MAX_PRIO + 1);
+	/*
+	 * The bitmap of priorities queued; The dynamic bits can have
+	 * false positives. Include 1 bit for delimiter.
+	 */
 };
 
 /*
@@ -175,20 +191,6 @@ struct rq {
 	unsigned long next_balance;
 	struct mm_struct *prev_mm;
 
-	DECLARE_BITMAP(dyn_bitmap, MAX_DYN_PRIO + 1);
-	/*
-	 * The bitmap of priorities queued; The extra PRIO_RANGE at the end
-	 * is for a bitmap of expired tasks queued. This minimises the number
-	 * of bit lookups over prio_array swaps. The dynamic bits can have
-	 * false positives. Include 1 bit for delimiter.
-	 */
-
-	DECLARE_BITMAP(static_bitmap, MAX_PRIO);
-	/* The bitmap of all static priorities queued */
-
-	unsigned long prio_queued[MAX_PRIO];
-	/* The number of tasks at each static priority */
-
 	long prio_quota[PRIO_RANGE];
 	/*
 	 * The quota of ticks the runqueue runs at each dynamic priority
@@ -196,6 +198,7 @@ struct rq {
 	 */
 
 	struct prio_array *active, *expired, arrays[2];
+	unsigned long *dyn_bitmap, *exp_bitmap;
 
 	int prio_level;
 	/* The current dynamic priority level this runqueue is at */
@@ -648,35 +651,26 @@ static inline int task_queued(struct tas
 
 static inline void set_task_entitlement(struct task_struct *p)
 {
-	__set_bit(USER_PRIO(p->prio), p->bitmap);
-
-	/*
-	 * In the case this task has been part of a merged list that has
-	 * made it to higher priority than it should be, we remove the
-	 * quota from its own priority since it will get a quota at this
-	 * priority.
-	 */
-	if (p->normal_prio < p->static_prio)
-		__set_bit(USER_PRIO(p->static_prio), p->bitmap);
+	if (__test_and_set_bit(USER_PRIO(p->prio), p->bitmap)) {
+		int first_slot = find_first_zero_bit(p->bitmap, PRIO_RANGE);
+		/*
+		 * If the bit is already set then this task has been queued
+		 * at this priority by queue merging. For fair accounting we
+		 * set one of the unused slots if available.
+		 */
+		 if (first_slot < PRIO_RANGE)
+		 	__set_bit(first_slot, p->bitmap);
+	}
 	p->time_slice = p->quota;
 }
 
 /*
- * Only the static_bitmap has hard accounting. The dynamic bits can have
+ * There is no specific hard accounting. The dynamic bits can have
  * false positives. rt_tasks can only be on the active queue.
  */
 static inline void set_dynamic_bit(struct task_struct *p, struct rq *rq)
 {
-	if (p->array == rq->active)
-		__set_bit(p->prio, rq->dyn_bitmap);
-	else
-		__set_bit(p->prio + PRIO_RANGE, rq->dyn_bitmap);
-}
-
-static inline void set_queue_bits(struct rq *rq, struct task_struct *p)
-{
-	__set_bit(p->static_prio, rq->static_bitmap);
-	set_dynamic_bit(p, rq);
+	__set_bit(p->prio, p->array->prio_bitmap);
 }
 
 /*
@@ -688,36 +682,52 @@ static inline void set_queue_bits(struct
 static void dequeue_task(struct task_struct *p, struct rq *rq)
 {
 	list_del_init(&p->run_list);
-	if (!--rq->prio_queued[p->static_prio])
-		__clear_bit(p->static_prio, rq->static_bitmap);
-	if (list_empty(p->array->queue + p->prio)) {
-		int bitmap_prio = p->prio;
-
-		if (p->array == rq->expired)
-			bitmap_prio += PRIO_RANGE;
-		__clear_bit(bitmap_prio, rq->dyn_bitmap);
-	}
+	if (list_empty(p->array->queue + p->prio))
+		__clear_bit(p->prio, p->array->prio_bitmap);
 }
 
 /*
  * The task is being queued on a fresh array so it has its entitlement
- * bitmap cleared.
+ * bitmap set to the relevant matrix entry.
  */
 static inline void task_new_array(struct task_struct *p, struct rq *rq)
 {
-	bitmap_zero(p->bitmap, PRIO_RANGE);
+	bitmap_copy(p->bitmap, prio_matrix[USER_PRIO(p->static_prio)],
+		    PRIO_RANGE);
 	p->rotation = rq->prio_rotation;
 }
 
-static inline void queue_expired(struct task_struct *p, struct rq *rq)
+static inline int first_prio_slot(int prio)
+{
+	return SCHED_PRIO(find_first_zero_bit(prio_matrix[USER_PRIO(prio)],
+			  PRIO_RANGE));
+}
+
+static void queue_expired(struct task_struct *p, struct rq *rq)
 {
-	p->prio = p->normal_prio = p->static_prio;
+	p->prio = p->normal_prio = first_prio_slot(p->static_prio);
 	p->array = rq->expired;
-	bitmap_zero(p->bitmap, PRIO_RANGE);
-	p->rotation = rq->prio_rotation;
+	task_new_array(p, rq);
 	p->time_slice = p->quota;
 }
 
+/*
+ * SCHED_BATCH tasks never start at better priority than any other
+ * task that is already running since they are flagged as latency
+ * insensitive. This means they never cause greater latencies in other
+ * non SCHED_BATCH tasks of the same nice level. Alternatively, tasks
+ * that have been flagged as being awoken by system local resources
+ * continue only at the rq->prio_level to prevent them getting an
+ * unfair share of resources.
+ */
+static void fill_slots(struct task_struct *p, struct rq *rq)
+{
+	bitmap_fill(p->bitmap, USER_PRIO(rq->prio_level));
+	p->flags &= ~PF_SYS_WAKE;
+	if (p->prio < rq->prio_level && p->array == rq->active)
+		p->prio = rq->prio_level;
+}
+
 #define rq_quota(rq, prio)	((rq)->prio_quota[USER_PRIO(prio)])
 /*
  * recalc_task_prio determines what prio a non rt_task will be
@@ -733,8 +743,10 @@ static inline void queue_expired(struct 
 static void recalc_task_prio(struct task_struct *p, struct rq *rq)
 {
 	struct prio_array *array = rq->active;
-	int queue_prio, search_prio;
+	int queue_prio;
 
+	if (unlikely(p->policy == SCHED_BATCH || p->flags & PF_SYS_WAKE))
+		fill_slots(p, rq);
 	if (p->rotation == rq->prio_rotation) {
 		if (p->array == array) {
 			if (p->time_slice && rq_quota(rq, p->prio))
@@ -746,18 +758,8 @@ static void recalc_task_prio(struct task
 			task_new_array(p, rq);
 	} else
 		task_new_array(p, rq);
-	search_prio = p->static_prio;
 
-	/*
-	 * SCHED_BATCH tasks never start at better priority than any other
-	 * task that is already running since they are flagged as latency
-	 * insensitive. This means they never cause greater latencies in other
-	 * non SCHED_BATCH tasks of the same nice level.
-	 */
-	if (unlikely(p->policy == SCHED_BATCH))
-		search_prio = max(p->static_prio, rq->prio_level);
-	queue_prio = SCHED_PRIO(find_next_zero_bit(p->bitmap, PRIO_RANGE,
-		     USER_PRIO(search_prio)));
+	queue_prio = SCHED_PRIO(find_first_zero_bit(p->bitmap, PRIO_RANGE));
 	if (queue_prio == MAX_PRIO) {
 		queue_expired(p, rq);
 		return;
@@ -772,9 +774,7 @@ static void recalc_task_prio(struct task
  * Adding to a runqueue. The dynamic priority queue that it is added to is
  * determined by the priority rotation of the runqueue it is being added to
  * and the quota still available in the task in p->bitmap and p->time_slice
- * (see recalc_task_prio above). The rq static_bitmap stores a list of
- * the static priorities, and prio_queued the number of tasks stored at each
- * p->static_prio level.
+ * (see recalc_task_prio above).
  */
 static inline void __enqueue_task(struct task_struct *p, struct rq *rq)
 {
@@ -782,10 +782,9 @@ static inline void __enqueue_task(struct
 		p->array = rq->active;
 	else
 		recalc_task_prio(p, rq);
-	rq->prio_queued[p->static_prio]++;
 
 	sched_info_queued(p);
-	set_queue_bits(rq, p);
+	set_dynamic_bit(p, rq);
 }
 
 static void enqueue_task(struct task_struct *p, struct rq *rq)
@@ -807,13 +806,12 @@ static inline void enqueue_task_head(str
 static void requeue_task(struct task_struct *p, struct rq *rq,
 			 struct prio_array *old_array, int old_prio)
 {
+	if (p->array == rq->expired)
+		queue_expired(p, rq);
 	list_move_tail(&p->run_list, p->array->queue + p->prio);
 	if (!rt_task(p)) {
-		if (list_empty(old_array->queue + old_prio)) {
-			if (old_array == rq->expired)
-				old_prio += PRIO_RANGE;
-			__clear_bit(old_prio, rq->dyn_bitmap);
-		}
+		if (list_empty(old_array->queue + old_prio))
+			__clear_bit(old_prio, p->array->prio_bitmap);
 		set_dynamic_bit(p, rq);
 	}
 }
@@ -836,7 +834,7 @@ static inline unsigned int task_timeslic
 	unsigned int slice, rr;
 
 	slice = rr = p->quota;
-	if (likely(!rt_task(p)))
+	if (!rt_task(p))
 		slice += (PRIO_RANGE - 1 - TASK_USER_PRIO(p)) * rr;
 	return slice;
 }
@@ -851,7 +849,7 @@ static inline unsigned int task_timeslic
 	(((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
 #define TASK_LOAD_WEIGHT(p)	LOAD_WEIGHT(task_timeslice(p))
 #define RTPRIO_TO_LOAD_WEIGHT(rp)	\
-	(LOAD_WEIGHT((RR_INTERVAL + 20 + (rp))))
+	(LOAD_WEIGHT((rr_interval + 20 + (rp))))
 
 static void set_load_weight(struct task_struct *p)
 {
@@ -941,17 +939,22 @@ static int effective_prio(struct task_st
 }
 
 /*
- * All tasks have quotas based on RR_INTERVAL. From nice 0 to 19 they are
- * all equal to it and below zero they get progressively larger making their
- * effective quota significantly larger. rt tasks all get RR_INTERVAL.
- */
-static unsigned int rr_interval(struct task_struct *p)
-{
-	int nice = TASK_NICE(p);
-
-	if (nice < 0 && !rt_task(p))
-		return RR_INTERVAL * (20 - nice) / 20;
-	return RR_INTERVAL;
+ * All tasks have quotas based on rr_interval. From nice 0 to 19 they are
+ * all equal to it and below zero they get quadratically larger making their
+ * effective quota significantly larger. rt tasks all get rr_interval.
+ * ie nice -6..19 = rr_interval. nice -10 = 2.5 * rr_interval
+ * nice -20 = 10 * rr_interval. This makes the ratios between -20 and 0
+ * similar to the ratios between 0 and +19.
+ */
+static unsigned int rr_quota(struct task_struct *p)
+{
+	int neg_nice = -TASK_NICE(p), rr = rr_interval;
+
+	if (neg_nice > 6 && !rt_task(p)) {
+		rr *= neg_nice * neg_nice;
+		rr /= 40;
+	}
+	return rr;
 }
 
 /*
@@ -981,7 +984,7 @@ static void activate_task(struct task_st
 				     (now - p->timestamp) >> 20);
 	}
 
-	p->quota = rr_interval(p);
+	p->quota = rr_quota(p);
 	p->prio = effective_prio(p);
 	p->timestamp = now;
 	__activate_task(p, rq);
@@ -1550,8 +1553,18 @@ out_set_cpu:
 
 out_activate:
 #endif /* CONFIG_SMP */
-	if (old_state == TASK_UNINTERRUPTIBLE)
+	/*
+	 * Tasks that are woken up from system-local resources (eg pipes)
+	 * are flagged as significant cpu may have been used elsewhere on
+	 * their behalf prior to their wakeup - see recalc_task_prio().
+	 */
+	if (old_state == TASK_UNINTERRUPTIBLE) {
 		rq->nr_uninterruptible--;
+		p->flags |= PF_SYS_WAKE;
+	} else {
+		if (old_state & TASK_NONINTERACTIVE)
+			p->flags |= PF_SYS_WAKE;
+	}
 
 	/*
 	 * Sync wakeups (i.e. those types of wakeups where the waker
@@ -1672,6 +1685,7 @@ void fastcall wake_up_new_task(struct ta
 	BUG_ON(p->state != TASK_RUNNING);
 	this_cpu = smp_processor_id();
 	cpu = task_cpu(p);
+	p->flags |= PF_SYS_WAKE;
 
 	if (likely(cpu == this_cpu)) {
 		activate_task(p, rq, 1);
@@ -2124,6 +2138,17 @@ int can_migrate_task(struct task_struct 
 	return 1;
 }
 
+static inline int rq_best_prio(struct rq *rq)
+{
+	int best_prio, exp_prio;
+
+	best_prio = sched_find_first_bit(rq->dyn_bitmap);
+	exp_prio = sched_find_first_bit(rq->exp_bitmap);
+	if (unlikely(best_prio > exp_prio))
+		best_prio = exp_prio;
+	return best_prio;
+}
+
 /*
  * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
  * load from busiest to this_rq, as part of a balancing operation within
@@ -2136,7 +2161,7 @@ static int move_tasks(struct rq *this_rq
 		      struct sched_domain *sd, enum idle_type idle,
 		      int *all_pinned)
 {
-	int idx, test_idx, pulled = 0, pinned = 0, this_best_prio, best_prio,
+	int idx, pulled = 0, pinned = 0, this_best_prio, best_prio,
 	    best_prio_seen, skip_for_load;
 	struct prio_array *array;
 	struct list_head *head, *curr;
@@ -2148,8 +2173,8 @@ static int move_tasks(struct rq *this_rq
 
 	rem_load_move = max_load_move;
 	pinned = 1;
-	this_best_prio = this_rq->curr->prio;
-	best_prio = busiest->curr->prio;
+	this_best_prio = rq_best_prio(this_rq);
+	best_prio = rq_best_prio(busiest);
 	/*
 	 * Enable handling of the case where there is more than one task
 	 * with the best priority.   If the current running task is one
@@ -2163,33 +2188,27 @@ static int move_tasks(struct rq *this_rq
 	 * We first consider expired tasks. Those will likely not be
 	 * executed in the near future, and they are most likely to
 	 * be cache-cold, thus switching CPUs has the least effect
-	 * on them. This is done by starting the search at priority
-	 * MAX_PRIO since expired bits are MAX_PRIO...MAX_DYN_PRIO-1
+	 * on them.
 	 */
 	array = busiest->expired;
-	test_idx = MAX_PRIO;
+new_array:
+	/* Start searching at priority 0: */
+	idx = 0;
 skip_bitmap:
-	if (!test_idx)
-		idx = sched_find_first_bit(busiest->dyn_bitmap);
+	if (!idx)
+		idx = sched_find_first_bit(array->prio_bitmap);
 	else
-		idx = find_next_bit(busiest->dyn_bitmap, MAX_DYN_PRIO,
-		      test_idx);
-	if (idx >= MAX_DYN_PRIO) {
+		idx = find_next_bit(array->prio_bitmap, MAX_PRIO, idx);
+	if (idx >= MAX_PRIO) {
 		if (array == busiest->expired) {
 			array = busiest->active;
-			test_idx = 0;
-			goto skip_bitmap;
+			goto new_array;
 		}
 		goto out;
 	}
-	test_idx = idx;
-	if (idx >= MAX_PRIO) {
-		if (array == busiest->active)
-			goto out;
-		idx -= PRIO_RANGE;
-	}
-	if (list_empty(array->queue + idx)) {
-		__clear_bit(test_idx, busiest->dyn_bitmap);
+
+	if (unlikely(list_empty(array->queue + idx))) {
+		__clear_bit(idx, array->prio_bitmap);
 		goto skip_bitmap;
 	}
 
@@ -2214,7 +2233,7 @@ skip_queue:
 		best_prio_seen |= idx == best_prio;
 		if (curr != head)
 			goto skip_queue;
-		test_idx++;
+		idx++;
 		goto skip_bitmap;
 	}
 
@@ -2231,7 +2250,7 @@ skip_queue:
 			this_best_prio = idx;
 		if (curr != head)
 			goto skip_queue;
-		test_idx++;
+		idx++;
 		goto skip_bitmap;
 	}
 out:
@@ -3305,10 +3324,9 @@ static inline void major_prio_rotation(s
 
 	rq->expired = rq->active;
 	rq->active = new_array;
+	rq->exp_bitmap = rq->expired->prio_bitmap;
+	rq->dyn_bitmap = rq->active->prio_bitmap;
 	rq->prio_rotation++;
-	bitmap_zero(rq->dyn_bitmap, MAX_DYN_PRIO);
-	bitmap_copy(rq->dyn_bitmap, rq->static_bitmap, MAX_PRIO);
-	__set_bit(MAX_DYN_PRIO, rq->dyn_bitmap);
 }
 
 /*
@@ -3339,14 +3357,12 @@ static inline void rotate_runqueue_prior
 		struct prio_array *new_queue = rq->expired;
 
 		/*
-		 * The static_bitmap gives us the highest p->static prio task
-		 * that is queued. This value is used as the prio after
-		 * the major rotation and all tasks remaining on this
-		 * active queue are moved there. This means tasks can end
-		 * up a p->prio better than their p->static_prio.
+		 * On a major rotation we move everything remaining to best
+		 * priority on the new array. The priority matrix bitmap will
+		 * ensure tasks only get the slots each static priority
+		 * deserves.
 		 */
-		new_prio_level = find_next_bit(rq->static_bitmap, MAX_PRIO,
-				 MAX_RT_PRIO);
+		new_prio_level = MAX_RT_PRIO;
 		if (!list_empty(array->queue + rq->prio_level)) {
 			list_splice_tail_init(array->queue + rq->prio_level,
 					 new_queue->queue + new_prio_level);
@@ -3471,41 +3487,6 @@ EXPORT_SYMBOL(sub_preempt_count);
 #endif
 
 /*
- * Leave this debugging in until we are certain all bitmap manipulations are
- * working as desired since we can safely get out of this situation.
- */
-static noinline int rq_bitmap_error(struct rq *rq)
-{
-	static int bitmap_error = 0;
-	struct prio_array *array;
-	struct list_head *queue;
-	int idx, test_idx;
-
-	printk(KERN_ERR
-	       "SCHEDULER BITMAP ERROR %d - attempting to reconstruct...\n",
-	       ++bitmap_error);
-	for (test_idx = MAX_RT_PRIO ; test_idx < MAX_DYN_PRIO ; test_idx++) {
-		if (test_idx < MAX_PRIO) {
-			idx = test_idx;
-			array = rq->active;
-		} else {
-			idx = test_idx - PRIO_RANGE;
-			array = rq->expired;
-		}
-		queue = array->queue + idx;
-		if (!list_empty(queue)) {
-			if (!test_bit(test_idx, rq->dyn_bitmap)) {
-				__set_bit(test_idx, rq->dyn_bitmap);
-			}
-		}
-	}
-	idx = find_next_bit(rq->dyn_bitmap, MAX_DYN_PRIO, MAX_RT_PRIO);
-	/* We hit a real bug. There is no way out of this */
-	BUG_ON(idx == MAX_DYN_PRIO);
-	return idx;
-}
-
-/*
  * next_dynamic_task finds the next suitable dynamic task. As the dyn_bitmap
  * contains all the active and expired dynamic tasks sequentially we only
  * need to do one bitmap lookup.
@@ -3515,18 +3496,18 @@ static inline struct task_struct *next_d
 	struct task_struct *next;
 	struct list_head *queue;
 	struct prio_array *array = rq->active;
+	int expirations = 0;
 
 retry:
-	if (unlikely(idx == MAX_DYN_PRIO))
-		idx = rq_bitmap_error(rq);
 	if (idx >= MAX_PRIO) {
+		BUG_ON(++expirations > 1);
 		/*
 		 * We have selected a bit from the expired range so there are
 		 * no more tasks in the active array.
 		 */
 		major_prio_rotation(rq);
 		array = rq->active;
-		idx -= PRIO_RANGE;
+		idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO);
 	}
 	if (unlikely(list_empty(array->queue + idx))) {
 		/*
@@ -3536,7 +3517,7 @@ retry:
 		 * interim. A very rare occurrence.
 		 */
 		__clear_bit(idx, rq->dyn_bitmap);
-		idx = find_next_bit(rq->dyn_bitmap, MAX_DYN_PRIO, idx + 1);
+		idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, idx + 1);
 		goto retry;
 	}
 	queue = array->queue + idx;
@@ -4164,6 +4145,7 @@ void set_user_nice(struct task_struct *p
 			resched_task(rq->curr);
 	}
 out_unlock:
+	p->quota = rr_quota(p);
 	task_rq_unlock(rq, &flags);
 }
 EXPORT_SYMBOL(set_user_nice);
@@ -7065,6 +7047,26 @@ void __init sched_init(void)
 {
 	int i, j, k;
 	int highest_cpu = 0;
+	unsigned int rr_us = 0, rr_inc = RR_INTERVAL * 1000;
+
+	/* Generate the priority matrix */
+	for (i = 0; i < PRIO_RANGE; i++) {
+		if (i < 20) {
+			bitmap_zero(prio_matrix[i] , PRIO_RANGE);
+			j = PRIO_RANGE * PRIO_RANGE / (i + 1);
+			for (k = j; k < PRIO_RANGE * PRIO_RANGE; k += j)
+				__set_bit(k / PRIO_RANGE, prio_matrix[i]);
+		} else if (i == 20) {
+			bitmap_fill(prio_matrix[i], PRIO_RANGE);
+			for (k = 1; k < PRIO_RANGE; k += 2)
+				__clear_bit(k, prio_matrix[i]);
+		} else {
+			bitmap_fill(prio_matrix[i], PRIO_RANGE);
+			j = PRIO_RANGE * PRIO_RANGE / (PRIO_RANGE - i + 1);
+			for (k = j; k < PRIO_RANGE * PRIO_RANGE; k += j)
+				__clear_bit(k / PRIO_RANGE, prio_matrix[i]);
+		}
+	}
 
 	for_each_possible_cpu(i) {
 		struct prio_array *array;
@@ -7078,6 +7080,8 @@ void __init sched_init(void)
 		rq->prio_level = MAX_RT_PRIO;
 		rq->active = rq->arrays;
 		rq->expired = rq->arrays + 1;
+		rq->dyn_bitmap = rq->active->prio_bitmap;
+		rq->exp_bitmap = rq->expired->prio_bitmap;
 
 #ifdef CONFIG_SMP
 		rq->sd = NULL;
@@ -7096,15 +7100,19 @@ void __init sched_init(void)
 			array = rq->arrays + j;
 			for (k = 0; k < MAX_PRIO; k++)
 				INIT_LIST_HEAD(array->queue + k);
+			bitmap_zero(array->prio_bitmap, MAX_PRIO);
+			/* delimiter for bitsearch */
+			__set_bit(MAX_PRIO, array->prio_bitmap);
 		}
 		for (k = 0; k < PRIO_RANGE; k++)
 			rq->prio_quota[k] = 0;
-		bitmap_zero(rq->dyn_bitmap, MAX_DYN_PRIO);
-		bitmap_zero(rq->static_bitmap, MAX_PRIO);
-		/* delimiter for bitsearch */
-		__set_bit(MAX_DYN_PRIO, rq->dyn_bitmap);
 		highest_cpu = i;
+
+		/* Every added cpu increases the rr_interval */
+		rr_us += rr_inc;
+		rr_inc /= 2;
 	}
+	rr_interval = rr_us / 1000 ? : 1;
 
 	set_load_weight(&init_task);
 
Index: linux-2.6.21-rc3-mm2/include/linux/sched.h
===================================================================
--- linux-2.6.21-rc3-mm2.orig/include/linux/sched.h	2007-03-15 22:33:06.000000000 +1100
+++ linux-2.6.21-rc3-mm2/include/linux/sched.h	2007-03-15 22:41:18.000000000 +1100
@@ -1200,6 +1200,7 @@ static inline void put_task_struct(struc
 #define PF_SWAPWRITE	0x00800000	/* Allowed to write to swap */
 #define PF_SPREAD_PAGE	0x01000000	/* Spread page cache over cpuset */
 #define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset */
+#define PF_SYS_WAKE	0x04000000	/* Task was waiting on system resource */
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
 #define PF_FREEZER_SKIP	0x40000000	/* Freezer should not count it as freezeable */
