Modify the priority bitmaps of the different nice levels so that their slots
are dithered, minimising the latency that is likely when tasks of different
nice levels run together. This allows relatively niced tasks that use little
cpu to still get low latency in the presence of less niced tasks.

Fix the accounting on -nice levels to not be scaled by HZ.

Signed-off-by: Con Kolivas <kernel@kolivas.org>

---
 kernel/sched.c |  140 ++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 100 insertions(+), 40 deletions(-)

Index: linux-2.6.21-rc3-mm2/kernel/sched.c
===================================================================
--- linux-2.6.21-rc3-mm2.orig/kernel/sched.c	2007-03-13 23:17:29.000000000 +1100
+++ linux-2.6.21-rc3-mm2/kernel/sched.c	2007-03-15 01:31:08.000000000 +1100
@@ -88,25 +88,40 @@ unsigned long long __attribute__((weak))
 #define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
 #define SCHED_PRIO(p)		((p)+MAX_RT_PRIO)
 #define MAX_DYN_PRIO		(MAX_PRIO + PRIO_RANGE)
+#define EXP_BITSIZE		(MAX_RT_PRIO + PRIO_RANGE / 2 + 1)
 
-/*
- * Preemption needs to take into account that a low priority task can be
- * at a higher prio due to list merging. Its priority is artificially
- * elevated and it should be preempted if anything higher priority wakes up
- * provided it is not a realtime comparison.
- */
-#define TASK_PREEMPTS_CURR(p, curr) \
-	(((p)->prio < (curr)->prio) || (!rt_task(p) && \
-		((p)->static_prio < (curr)->static_prio && \
-			((curr)->static_prio > (curr)->prio))))
+#define TASK_PREEMPTS_CURR(p, curr)	((p)->prio < (curr)->prio)
 
 /*
  * This is the time all tasks within the same priority round robin.
  * Set to a minimum of 6ms.
  */
-#define RR_INTERVAL		((6 * HZ / 1001) + 1)
+#define __RR_INTERVAL		6
+#define RR_INTERVAL		((__RR_INTERVAL * HZ / 1001) + 1)
 #define DEF_TIMESLICE		(RR_INTERVAL * 20)
 
+/*
+ * This contains a bitmap for each dynamic priority level with empty slots
+ * for the valid priorities each different nice level can have. It allows
+ * us to stagger the slots where differing priorities run in a way that
+ * keeps latency differences between different nice levels at a minimum.
+ * i.e., where 0 means a slot for that priority, priority running from left to
+ * right:
+ * nice -20 0000000000000000000000000000000000000000
+ * nice -10 1001000100100010001001000100010010001000
+ * nice   0 0101010101010101010101010101010101010101
+ * nice   5 1101011010110101101011010110101101011011
+ * nice  10 0110111011011101110110111011101101110111
+ * nice  15 0111110111111011111101111101111110111111
+ * nice  19 1111111111111111111011111111111111111111
+  */
+struct prio_matrix {
+	unsigned long matrix_bitmap[PRIO_RANGE][BITS_TO_LONGS(PRIO_RANGE)];
+	int first_slot[PRIO_RANGE];
+};
+
+static struct prio_matrix prio_matrix __read_mostly;
+
 #ifdef CONFIG_SMP
 /*
  * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@ -183,6 +198,13 @@ struct rq {
 	 * false positives. Include 1 bit for delimiter.
 	 */
 
+	DECLARE_BITMAP(exp_bitmap, EXP_BITSIZE);
+	/*
+	 * This bitmap contains the dynamic bits that will be used when
+	 * generating a new bitmap on array swapping. The size is chosen for
+	 * the fastest copy function rather than the smallest bitmap.
+	 */
+
 	DECLARE_BITMAP(static_bitmap, MAX_PRIO);
 	/* The bitmap of all static priorities queued */
 
@@ -648,29 +670,33 @@ static inline int task_queued(struct tas
 
 static inline void set_task_entitlement(struct task_struct *p)
 {
-	__set_bit(USER_PRIO(p->prio), p->bitmap);
-
-	/*
-	 * In the case this task has been part of a merged list that has
-	 * made it to higher priority than it should be, we remove the
-	 * quota from its own priority since it will get a quota at this
-	 * priority.
-	 */
-	if (p->normal_prio < p->static_prio)
-		__set_bit(USER_PRIO(p->static_prio), p->bitmap);
+	if (__test_and_set_bit(USER_PRIO(p->prio), p->bitmap)) {
+		int first_slot = find_first_zero_bit(p->bitmap, PRIO_RANGE);
+		/*
+		 * If the bit is already set then this task has been queued
+		 * at this priority by queue merging. For fair accounting we
+		 * set one of the unused slots if available.
+		 */
+		 if (first_slot < PRIO_RANGE)
+		 	__set_bit(first_slot, p->bitmap);
+	}
 	p->time_slice = p->quota;
 }
 
 /*
  * Only the static_bitmap has hard accounting. The dynamic bits can have
- * false positives. rt_tasks can only be on the active queue.
+ * false positives. rt_tasks can only be on the active queue. We set the
+ * bit on the expired bitmap in lieu of where this running task will be
+ * after a swap.
  */
 static inline void set_dynamic_bit(struct task_struct *p, struct rq *rq)
 {
 	if (p->array == rq->active)
 		__set_bit(p->prio, rq->dyn_bitmap);
-	else
+	else {
 		__set_bit(p->prio + PRIO_RANGE, rq->dyn_bitmap);
+		__set_bit(p->prio, rq->exp_bitmap);
+	}
 }
 
 static inline void set_queue_bits(struct rq *rq, struct task_struct *p)
@@ -701,20 +727,27 @@ static void dequeue_task(struct task_str
 
 /*
  * The task is being queued on a fresh array so it has its entitlement
- * bitmap cleared.
+ * bitmap set to the relevant matrix entry.
  */
 static inline void task_new_array(struct task_struct *p, struct rq *rq)
 {
-	bitmap_zero(p->bitmap, PRIO_RANGE);
+	bitmap_copy(p->bitmap,
+		    prio_matrix.matrix_bitmap[USER_PRIO(p->static_prio)],
+		    PRIO_RANGE);
 	p->rotation = rq->prio_rotation;
 }
 
-static inline void queue_expired(struct task_struct *p, struct rq *rq)
+static inline int first_prio_slot(int prio)
 {
-	p->prio = p->normal_prio = p->static_prio;
+	return SCHED_PRIO(find_first_zero_bit(
+		prio_matrix.matrix_bitmap[USER_PRIO(prio)], PRIO_RANGE));
+}
+
+static void queue_expired(struct task_struct *p, struct rq *rq)
+{
+	p->prio = p->normal_prio = first_prio_slot(p->static_prio);
 	p->array = rq->expired;
-	bitmap_zero(p->bitmap, PRIO_RANGE);
-	p->rotation = rq->prio_rotation;
+	task_new_array(p, rq);
 	p->time_slice = p->quota;
 }
 
@@ -746,7 +779,7 @@ static void recalc_task_prio(struct task
 			task_new_array(p, rq);
 	} else
 		task_new_array(p, rq);
-	search_prio = p->static_prio;
+	search_prio = MAX_RT_PRIO;
 
 	/*
 	 * SCHED_BATCH tasks never start at better priority than any other
@@ -755,7 +788,7 @@ static void recalc_task_prio(struct task
 	 * non SCHED_BATCH tasks of the same nice level.
 	 */
 	if (unlikely(p->policy == SCHED_BATCH))
-		search_prio = max(p->static_prio, rq->prio_level);
+		search_prio = rq->prio_level;
 	queue_prio = SCHED_PRIO(find_next_zero_bit(p->bitmap, PRIO_RANGE,
 		     USER_PRIO(search_prio)));
 	if (queue_prio == MAX_PRIO) {
@@ -807,6 +840,8 @@ static inline void enqueue_task_head(str
 static void requeue_task(struct task_struct *p, struct rq *rq,
 			 struct prio_array *old_array, int old_prio)
 {
+	if (p->array == rq->expired)
+		queue_expired(p, rq);
 	list_move_tail(&p->run_list, p->array->queue + p->prio);
 	if (!rt_task(p)) {
 		if (list_empty(old_array->queue + old_prio)) {
@@ -836,7 +871,7 @@ static inline unsigned int task_timeslic
 	unsigned int slice, rr;
 
 	slice = rr = p->quota;
-	if (likely(!rt_task(p)))
+	if (!rt_task(p))
 		slice += (PRIO_RANGE - 1 - TASK_USER_PRIO(p)) * rr;
 	return slice;
 }
@@ -3307,8 +3342,9 @@ static inline void major_prio_rotation(s
 	rq->active = new_array;
 	rq->prio_rotation++;
 	bitmap_zero(rq->dyn_bitmap, MAX_DYN_PRIO);
-	bitmap_copy(rq->dyn_bitmap, rq->static_bitmap, MAX_PRIO);
+	bitmap_copy(rq->dyn_bitmap, rq->exp_bitmap, EXP_BITSIZE);
 	__set_bit(MAX_DYN_PRIO, rq->dyn_bitmap);
+	bitmap_zero(rq->exp_bitmap, EXP_BITSIZE);
 }
 
 /*
@@ -3339,14 +3375,12 @@ static inline void rotate_runqueue_prior
 		struct prio_array *new_queue = rq->expired;
 
 		/*
-		 * The static_bitmap gives us the highest p->static prio task
-		 * that is queued. This value is used as the prio after
-		 * the major rotation and all tasks remaining on this
-		 * active queue are moved there. This means tasks can end
-		 * up a p->prio better than their p->static_prio.
+		 * On a major rotation we move everything remaining to best
+		 * priority on the new array. The priority matrix bitmap will
+		 * ensure tasks only get the slots each static priority
+		 * deserves.
 		 */
-		new_prio_level = find_next_bit(rq->static_bitmap, MAX_PRIO,
-				 MAX_RT_PRIO);
+		new_prio_level = MAX_RT_PRIO;
 		if (!list_empty(array->queue + rq->prio_level)) {
 			list_splice_tail_init(array->queue + rq->prio_level,
 					 new_queue->queue + new_prio_level);
@@ -7066,6 +7100,31 @@ void __init sched_init(void)
 	int i, j, k;
 	int highest_cpu = 0;
 
+	/* Generate the priority matrix */
+	for (i = 0; i < PRIO_RANGE; i++) {
+		struct prio_matrix *pm = &prio_matrix;
+
+		if (i < 20) {
+			bitmap_zero(pm->matrix_bitmap[i] , PRIO_RANGE);
+			j = PRIO_RANGE * PRIO_RANGE / (i + 1);
+			for (k = j; k < PRIO_RANGE * PRIO_RANGE; k += j) {
+				__set_bit(k / PRIO_RANGE,
+					  pm->matrix_bitmap[i]);
+			}
+		} else if (i == 20) {
+			bitmap_fill(pm->matrix_bitmap[i], PRIO_RANGE);
+			for (k = 1; k < PRIO_RANGE; k += 2)
+				__clear_bit(k, pm->matrix_bitmap[i]);
+		} else {
+			bitmap_fill(pm->matrix_bitmap[i], PRIO_RANGE);
+			j = PRIO_RANGE * PRIO_RANGE / (PRIO_RANGE - i + 1);
+			for (k = j; k < PRIO_RANGE * PRIO_RANGE; k += j) {
+				__clear_bit(k / PRIO_RANGE,
+					    pm->matrix_bitmap[i]);
+			}
+		}
+	}
+
 	for_each_possible_cpu(i) {
 		struct prio_array *array;
 		struct rq *rq;
@@ -7100,6 +7159,7 @@ void __init sched_init(void)
 		for (k = 0; k < PRIO_RANGE; k++)
 			rq->prio_quota[k] = 0;
 		bitmap_zero(rq->dyn_bitmap, MAX_DYN_PRIO);
+		bitmap_zero(rq->exp_bitmap, EXP_BITSIZE);
 		bitmap_zero(rq->static_bitmap, MAX_PRIO);
 		/* delimiter for bitsearch */
 		__set_bit(MAX_DYN_PRIO, rq->dyn_bitmap);
