---
 Documentation/sched-design.txt |  128 ++++--------
 include/linux/init_task.h      |    4 
 include/linux/list.h           |   42 ----
 include/linux/sched.h          |    9 
 kernel/sched.c                 |  418 ++++++++++++-----------------------------
 5 files changed, 175 insertions(+), 426 deletions(-)

Index: linux-2.6.21-rc4-rsdl/Documentation/sched-design.txt
===================================================================
--- linux-2.6.21-rc4-rsdl.orig/Documentation/sched-design.txt	2007-03-27 23:28:50.000000000 +1000
+++ linux-2.6.21-rc4-rsdl/Documentation/sched-design.txt	2007-03-27 23:29:15.000000000 +1000
@@ -1,14 +1,14 @@
  Goals, Design and Implementation of the ultra-scalable O(1) scheduler by
- Ingo Molnar and the Rotating Staircase Deadline cpu scheduler policy
- designed by Con Kolivas.
+ Ingo Molnar and the Staircase Deadline cpu scheduler policy designed by
+ Con Kolivas.
 
 
   This was originally an edited version of an email Ingo Molnar sent to
   lkml on 4 Jan 2002.  It describes the goals, design, and implementation
   of Ingo's ultra-scalable O(1) scheduler. It now contains a description
-  of the Rotating Staircase Deadline priority scheduler that was built on
-  this design.
-  Last Updated: Sun Feb 25 2007
+  of the Staircase Deadline priority scheduler that was built on this
+  design.
+  Last Updated: Tue Mar 27 2007
 
 
 Goal
@@ -168,15 +168,15 @@ code is smaller than the old one.
 	Ingo
 
 
-Rotating Staircase Deadline cpu scheduler policy
+Staircase Deadline cpu scheduler policy
 ================================================
 
 Design summary
 ==============
 
 A novel design which incorporates a foreground-background descending priority
-system (the staircase) with runqueue managed minor and major epochs (rotation
-and deadline).
+system (the staircase) via a bandwidth allocation matrix according to nice
+level.
 
 
 Features
@@ -196,113 +196,66 @@ constraints of strict fairness.
 Design description
 ==================
 
-RSDL works off the principle of providing each task a quota of runtime that it
-is allowed to run at a number of priority levels determined by its static
-priority (ie. its nice level). When each task is queued, the cpu that it is
-queued onto also keeps a record of that quota. If the task uses up its quota it
-has its priority decremented to the next level. Also, if the cpu notices a quota
-full has been used for that priority level, it pushes everything remaining at
-that priority level to the next lowest priority level. Once every runtime quota
-has been consumed of every priority level, a task is queued on the "expired"
-array. When no other tasks exist with quota, the expired array is activated and
-fresh quotas are handed out. This is all done in O(1).
-
+SD works off the principle of providing each task a quota of runtime that it is
+allowed to run at a number of priority levels determined by its static priority
+(ie. its nice level). If the task uses up its quota it has its priority
+decremented to the next level determined by a priority matrix. Once every
+runtime quota has been consumed of every priority level, a task is queued on the
+"expired" array. When no other tasks exist with quota, the expired array is
+activated and fresh quotas are handed out. This is all done in O(1).
 
 Design details
 ==============
 
-Each cpu has its own runqueue which micromanages its own epochs, and each
-task keeps a record of its own entitlement of cpu time. Most of the rest
-of these details apply to non-realtime tasks as rt task management is
-straight forward.
+Each task keeps a record of its own entitlement of cpu time. Most of the rest of
+these details apply to non-realtime tasks as rt task management is
+straightforward.
 
 Each runqueue keeps a record of what major epoch it is up to in the
 rq->prio_rotation field which is incremented on each major epoch. It also
-keeps a record of quota available to each priority value valid for that
-major epoch in rq->prio_quota[].
+keeps a record of the current prio_level for each static priority task.
 
 Each task keeps a record of what major runqueue epoch it was last running
 on in p->rotation. It also keeps a record of what priority levels it has
 already been allocated quota from during this epoch in a bitmap p->bitmap.
 
 The only tunable that determines all other details is the RR_INTERVAL. This
-is set to 8ms (minimum on 1000HZ, higher at different HZ values), and is
-scaled gently upwards with more cpus.
+is set to 8ms, and is scaled gently upwards with more cpus. This value is
+tunable via a /proc interface.
 
 All tasks are initially given a quota based on RR_INTERVAL. This is equal to
-RR_INTERVAL between nice values of 0 and 19, and progressively larger for nice
-values from -1 to -20. This is to maintain a relationship of nice 19 having
-approximately 1/20th of the cpu of nice 0, and nice 0 having 1/20th the cpu of
-nice -20. This is assigned to p->quota and only changes with changes in nice
-level.
+RR_INTERVAL between nice values of -6 and 0, half that size above nice 0, and
+progressively larger for nice values from -7 to -20. This is assigned to
+p->quota and only changes with changes in nice level.
 
 As a task is first queued, it checks in recalc_task_prio to see if it has run at
 this runqueue's current priority rotation. If it has not, it will have its
 p->prio level set according to the first slot in a "priority matrix" and will be
 given a p->time_slice equal to the p->quota, and has its allocation bitmap bit
-set in p->bitmap for this prio level. This quota is then also added to the
-current runqueue's rq->prio_quota[p->prio]. It is then queued on the current
-active priority array.
+set in p->bitmap for this prio level. It is then queued on the current active
+priority array.
 
 If a task has already been running during this major epoch, and it has
 p->time_slice left and the rq->prio_quota for the task's p->prio still
 has quota, it will be placed back on the active array, but no more quota
-will be added to either the task or the runqueue quota.
+will be added.
 
 If a task has been running during this major epoch, but does not have
-p->time_slice left or the runqueue's prio_quota for this task's p->prio
-does not have quota, it will find the next lowest priority in its bitmap
-that it has not been allocated quota from. It then gets the a full quota
-in p->time_slice and adds that to the quota value for the relevant priority
-rq->prio_quota. It is then queued on the current active priority array at
-the newly determined lower priority.
+p->time_slice left, it will find the next lowest priority in its bitmap that it
+has not been allocated quota from. It then gets a full quota in
+p->time_slice. It is then queued on the current active priority array at the
+newly determined lower priority.
 
 If a task has been running during this major epoch, and does not have
 any entitlement left in p->bitmap and no time_slice left, it will have its
-bitmap cleared, and be queued at its p->static_prio again, but on the expired
-priority array. No quota will be allocated until this task is scheduled.
+bitmap cleared, and be queued at its best prio again, but on the expired
+priority array.
 
 When a task is queued, it has its relevant bit set in the array->prio_bitmap.
 
-During a scheduler_tick where a task is running, the p->time_slice is
-decremented, and if it reaches zero then the recalc_task_prio is readjusted
-and the task rescheduled.
-
-During a task running tick, the runqueue prio_quota is also decremented. If
-it empties then a priority rotation occurs (a major or minor epoch). If the
-current runqueue's priority level is better than that of nice 19 tasks, a
-minor rotation is performed, otherwise a major rotation will occur.
-
-A minor rotation takes the remaining tasks at this priority level queue and
-merges them with a list_splice_tail with the queue from the next lowest
-priority level. At this time, any tasks that have been merged will now
-have invalid values in p->prio so this must be considered when dequeueing
-and scheduling the task.
-
-A major rotation takes the remaining tasks at this priority level queue and
-merges them with a list_splice_tail with the best priority task running on
-the expired array, and swaps the priority arrays. The priority quotas are
-reset at this time. Any tasks that have been merged will now have invalid
-values in p->array and possibly p->prio so this must be considered. The
-rq->prio_rotation is incremented at this time.
-
-When a task is dequeued, the dyn_bitmap bit is unset only after testing
-that the relevant queue is actually empty since p->prio may be inaccurate
-and no hard accounting of the number of tasks at that level is possible.
-
-When selecting a new task for scheduling, after the first dynamic bit is found
-on the dyn_bitmap, it is checked to see that a task is really queued at that
-priority or if it is a false positive due to the task being dequeued at a time
-when its p->prio does not match which queue it is on after some form of priority
-rotation. This is a rare occurrence as it tends to only occur if a task that is
-already waiting on a runqueue gets dequeued. If no tasks remain on the active
-array, a major priority rotation is performed. If the chosen task has not been
-running during this major or minor rotation it has new quota allocated at this
-time, and added to the runqueue's quota.
-
-If a task finds itself merged at a priority level that it does not normally
-receive quota at (due to list merging) it will remove one of its normal
-priority slots to compensate.
+p->time_slice is stored in nanoseconds and is updated via update_cpu_clock on
+schedule() and scheduler_tick. If p->time_slice is below zero then the
+recalc_task_prio is readjusted and the task rescheduled.
 
 
 Priority Matrix
@@ -328,6 +281,10 @@ task only runs one slot per major rotati
 smallest possible maximum latencies between tasks of varying nice levels, thus
 allowing vastly different nice levels to be used.
 
+SCHED_BATCH tasks are managed slightly differently, receiving only the top
+slots from their priority bitmap, giving them equal cpu to SCHED_NORMAL tasks
+but slightly higher latencies.
+
 
 Modelling deadline behaviour
 ============================
@@ -336,8 +293,7 @@ As the accounting in this design is hard
 calculations or interactivity modifiers, it is possible to accurately
 predict the maximum latency that a task may experience under different
 conditions. This is a virtual deadline mechanism enforced by mandatory
-runqueue epochs, and not by trying to keep complicated accounting of each
-task.
+timeslice expiration and not outside bandwidth measurement.
 
 The maximum duration a task can run during one major epoch is determined by its
 nice value. Nice 0 tasks can run at 19 different priority levels for RR_INTERVAL
@@ -433,5 +389,5 @@ can be countered by running a gui at a n
 causing adversely large latencies in nice 0 tasks.
 
 
-Fri, 16 Mar 2007
+Tue Mar 27 2007
 Con Kolivas <kernel@kolivas.org>
Index: linux-2.6.21-rc4-rsdl/include/linux/init_task.h
===================================================================
--- linux-2.6.21-rc4-rsdl.orig/include/linux/init_task.h	2007-03-27 23:28:51.000000000 +1000
+++ linux-2.6.21-rc4-rsdl/include/linux/init_task.h	2007-03-27 23:29:15.000000000 +1000
@@ -109,8 +109,8 @@ extern struct group_info init_groups;
 	.active_mm	= &init_mm,					\
 	.run_list	= LIST_HEAD_INIT(tsk.run_list),			\
 	.ioprio		= 0,						\
-	.time_slice	= HZ,						\
-	.quota		= HZ,						\
+	.time_slice	= 1000000000,						\
+	.quota		= 1000000000,						\
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
 	.ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children),		\
 	.ptrace_list	= LIST_HEAD_INIT(tsk.ptrace_list),		\
Index: linux-2.6.21-rc4-rsdl/include/linux/list.h
===================================================================
--- linux-2.6.21-rc4-rsdl.orig/include/linux/list.h	2007-03-27 23:28:51.000000000 +1000
+++ linux-2.6.21-rc4-rsdl/include/linux/list.h	2007-03-27 23:29:02.000000000 +1000
@@ -333,20 +333,6 @@ static inline void __list_splice(struct 
 	at->prev = last;
 }
 
-static inline void __list_splice_tail(struct list_head *list,
-				      struct list_head *head)
-{
-	struct list_head *first = list->next;
-	struct list_head *last = list->prev;
-	struct list_head *at = head->prev;
-
-	first->prev = at;
-	at->next = first;
-
-	last->next = head;
-	head->prev = last;
-}
-
 /**
  * list_splice - join two lists
  * @list: the new list to add.
@@ -359,18 +345,6 @@ static inline void list_splice(struct li
 }
 
 /**
- * list_splice_tail - join two lists at one's tail
- * @list: the new list to add.
- * @head: the place to add it in the first list.
- */
-static inline void list_splice_tail(struct list_head *list,
-				    struct list_head *head)
-{
-	if (!list_empty(list))
-		__list_splice_tail(list, head);
-}
-
-/**
  * list_splice_init - join two lists and reinitialise the emptied list.
  * @list: the new list to add.
  * @head: the place to add it in the first list.
@@ -443,22 +417,6 @@ static inline void list_splice_init_rcu(
 }
 
 /**
- * list_splice_tail_init - join 2 lists at one's tail & reinitialise emptied
- * @list: the new list to add.
- * @head: the place to add it in the first list.
- *
- * The list at @list is reinitialised
- */
-static inline void list_splice_tail_init(struct list_head *list,
-					 struct list_head *head)
-{
-	if (!list_empty(list)) {
-		__list_splice_tail(list, head);
-		INIT_LIST_HEAD(list);
-	}
-}
-
-/**
  * list_entry - get the struct for this entry
  * @ptr:	the &struct list_head pointer.
  * @type:	the type of the struct this is embedded in.
Index: linux-2.6.21-rc4-rsdl/include/linux/sched.h
===================================================================
--- linux-2.6.21-rc4-rsdl.orig/include/linux/sched.h	2007-03-27 23:28:51.000000000 +1000
+++ linux-2.6.21-rc4-rsdl/include/linux/sched.h	2007-03-27 23:29:15.000000000 +1000
@@ -825,18 +825,15 @@ struct task_struct {
 
 	unsigned long policy;
 	cpumask_t cpus_allowed;
-	unsigned int time_slice;
+	int time_slice;
 	/*
 	 * How much this task is entitled to run at the current priority
 	 * before being requeued at a lower priority.
 	 */
 	unsigned int first_time_slice;
 	/* Is this the very first time_slice this task has ever run. */
-	unsigned int quota;
-	/*
-	 * How much this task contributes to the current priority queue
-	 * length
-	 */
+	int quota;
+	/* How much this task receives at each priority level */
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	struct sched_info sched_info;
Index: linux-2.6.21-rc4-rsdl/kernel/sched.c
===================================================================
--- linux-2.6.21-rc4-rsdl.orig/kernel/sched.c	2007-03-27 23:28:51.000000000 +1000
+++ linux-2.6.21-rc4-rsdl/kernel/sched.c	2007-03-27 23:29:15.000000000 +1000
@@ -16,8 +16,7 @@
  *		by Davide Libenzi, preemptible kernel bits by Robert Love.
  *  2003-09-03	Interactivity tuning by Con Kolivas.
  *  2004-04-02	Scheduler domains code by Nick Piggin
- *  2007-03-02	Rotating Staircase deadline scheduling policy by Con Kolivas
- *		RSDL v0.33
+ *  2007-03-02	Staircase deadline scheduling policy by Con Kolivas
  */
 
 #include <linux/mm.h>
@@ -87,11 +86,16 @@ unsigned long long __attribute__((weak))
 #define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
 #define SCHED_PRIO(p)		((p)+MAX_RT_PRIO)
 
+/* Some helpers for converting to/from nanosecond timing */
+#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
+#define NS_TO_MS(TIME)		((TIME) / 1000000)
+#define MS_TO_NS(TIME)		((TIME) * 1000000)
+
 #define TASK_PREEMPTS_CURR(p, curr)	((p)->prio < (curr)->prio)
 
 /*
  * This is the time all tasks within the same priority round robin.
- * Set to a minimum of 8ms. Scales with number of cpus and rounds with HZ.
+ * Value is in ms and set to a minimum of 8ms. Scales with number of cpus.
  * Tunable via /proc interface.
  */
 int rr_interval __read_mostly;
@@ -113,10 +117,12 @@ int rr_interval __read_mostly;
  * nice  10 0110111011011101110110111011101101110111
  * nice  15 0111110111111011111101111101111110111111
  * nice  19 1111111111111111111011111111111111111111
-  */
+ */
 static unsigned long prio_matrix[PRIO_RANGE][BITS_TO_LONGS(PRIO_RANGE)]
 				 __read_mostly;
 
+struct rq;
+
 /*
  * These are the runqueue data structures:
  */
@@ -126,9 +132,16 @@ struct prio_array {
 
 	DECLARE_BITMAP(prio_bitmap, MAX_PRIO + 1);
 	/*
-	 * The bitmap of priorities queued; The dynamic bits can have
-	 * false positives. Include 1 bit for delimiter.
+	 * The bitmap of priorities queued for this array. While the expired
+	 * array will never have realtime tasks on it, it is simpler to have
+	 * equal sized bitmaps for a cheap array swap. Include 1 bit for
+	 * delimiter.
 	 */
+
+#ifdef CONFIG_SMP
+	struct rq *rq;
+	/* For convenience looks back at rq */
+#endif
 };
 
 /*
@@ -166,12 +179,6 @@ struct rq {
 	unsigned long next_balance;
 	struct mm_struct *prev_mm;
 
-	long prio_quota[PRIO_RANGE];
-	/*
-	 * The quota of ticks the runqueue runs at each dynamic priority
-	 * before cycling to the next priority.
-	 */
-
 	struct prio_array *active, *expired, arrays[2];
 	unsigned long *dyn_bitmap, *exp_bitmap;
 
@@ -626,26 +633,13 @@ static inline int task_queued(struct tas
 	return !list_empty(&task->run_list);
 }
 
-static inline void set_task_entitlement(struct task_struct *p)
-{
-	__set_bit(USER_PRIO(p->prio), p->bitmap);
-	p->time_slice = p->quota;
-}
-
-/*
- * There is no specific hard accounting. The dynamic bits can have
- * false positives. rt_tasks can only be on the active queue.
- */
 static inline void set_dynamic_bit(struct task_struct *p, struct rq *rq)
 {
 	__set_bit(p->prio, p->array->prio_bitmap);
 }
 
 /*
- * Removing from a runqueue. While we don't know with absolute certainty
- * where this task really is, the p->array and p->prio are very likely
- * so we check that queue to see if we can clear that bit to take some
- * load off finding false positives in next_dynamic_task().
+ * Removing from a runqueue.
  */
 static void dequeue_task(struct task_struct *p, struct rq *rq)
 {
@@ -662,38 +656,37 @@ static inline void task_new_array(struct
 {
 	bitmap_zero(p->bitmap, PRIO_RANGE);
 	p->rotation = rq->prio_rotation;
+	p->time_slice = p->quota;
 }
 
 /* Find the first slot from the relevant prio_matrix entry */
 static inline int first_prio_slot(struct task_struct *p)
 {
+	if (unlikely(p->policy == SCHED_BATCH))
+		return p->static_prio;
 	return SCHED_PRIO(find_first_zero_bit(
 		prio_matrix[USER_PRIO(p->static_prio)], PRIO_RANGE));
 }
 
-/* Is a dynamic_prio part of the allocated slots for this static_prio */
-static inline int entitled_slot(int static_prio, int dynamic_prio)
-{
-	return !test_bit(USER_PRIO(dynamic_prio),
-		prio_matrix[USER_PRIO(static_prio)]);
-}
-
 /*
  * Find the first unused slot by this task that is also in its prio_matrix
- * level. Ensure that the prio_level is not unnecessarily low by checking
- * that best_static_prio this major rotation was not a niced task.
- * SCHED_BATCH tasks do not perform this check so they do not induce
- * latencies in tasks of any nice level.
+ * level. SCHED_BATCH tasks do not use the priority matrix. They only take
+ * priority slots from their static_prio and above.
  */
 static inline int next_entitled_slot(struct task_struct *p, struct rq *rq)
 {
 	DECLARE_BITMAP(tmp, PRIO_RANGE);
 	int search_prio;
 
-	if (p->static_prio < rq->best_static_prio && p->policy != SCHED_BATCH)
+	if (p->static_prio < rq->best_static_prio)
 		search_prio = MAX_RT_PRIO;
 	else
 		search_prio = rq->prio_level;
+	if (unlikely(p->policy == SCHED_BATCH)) {
+		search_prio = max(search_prio, p->static_prio);
+		return SCHED_PRIO(find_next_zero_bit(p->bitmap, PRIO_RANGE,
+				  USER_PRIO(search_prio)));
+	}
 	bitmap_or(tmp, p->bitmap, prio_matrix[USER_PRIO(p->static_prio)],
 		  PRIO_RANGE);
 	return SCHED_PRIO(find_next_zero_bit(tmp, PRIO_RANGE,
@@ -706,30 +699,58 @@ static void queue_expired(struct task_st
 	task_new_array(p, rq);
 	p->prio = p->normal_prio = first_prio_slot(p);
 	p->time_slice = p->quota;
+	p->rotation = rq->prio_rotation;
 }
 
-#define rq_quota(rq, prio)	((rq)->prio_quota[USER_PRIO(prio)])
+#ifdef CONFIG_SMP
+/*
+ * If we're waking up a task that was previously on a different runqueue,
+ * update its data appropriately. Note we may be reading data from src_rq->
+ * outside of lock, but the occasional inaccurate result should be harmless.
+ */
+ static inline void update_if_moved(struct task_struct *p, struct rq *rq)
+{
+	struct rq *src_rq = p->array->rq;
+
+	if (src_rq == rq)
+		return;
+	if (p->rotation == src_rq->prio_rotation)
+		p->rotation = rq->prio_rotation;
+	else
+		p->rotation = 0;
+	if (p->array == src_rq->expired)
+		p->array = rq->expired;
+	else
+		p->array = rq->active;
+}
+#else
+static inline void update_if_moved(struct task_struct *p, struct rq *rq)
+{
+}
+#endif
 
 /*
- * recalc_task_prio determines what prio a non rt_task will be
+ * recalc_task_prio determines what priority a non rt_task will be
  * queued at. If the task has already been running during this runqueue's
  * major rotation (rq->prio_rotation) then it continues at the same
  * priority if it has tick entitlement left. If it does not have entitlement
  * left, it finds the next priority slot according to its nice value that it
  * has not extracted quota from. If it has not run during this major
- * rotation, it starts at its static priority and has its bitmap quota
+ * rotation, it starts at the next_entitled_slot and has its bitmap quota
  * cleared. If it does not have any slots left it has all its slots reset and
- * is queued on the expired at its static priority.
+ * is queued on the expired array at its first_prio_slot.
  */
 static void recalc_task_prio(struct task_struct *p, struct rq *rq)
 {
 	struct prio_array *array = rq->active;
 	int queue_prio;
 
+	update_if_moved(p, rq);
 	if (p->rotation == rq->prio_rotation) {
 		if (p->array == array) {
-			if (p->time_slice && rq_quota(rq, p->prio))
+			if (p->time_slice > 0)
 				return;
+			p->time_slice = p->quota;
 		} else if (p->array == rq->expired) {
 			queue_expired(p, rq);
 			return;
@@ -743,17 +764,14 @@ static void recalc_task_prio(struct task
 		queue_expired(p, rq);
 		return;
 	}
-	rq_quota(rq, queue_prio) += p->quota;
 	p->prio = p->normal_prio = queue_prio;
 	p->array = array;
-	set_task_entitlement(p);
+	__set_bit(USER_PRIO(p->prio), p->bitmap);
 }
 
 /*
  * Adding to a runqueue. The dynamic priority queue that it is added to is
- * determined by the priority rotation of the runqueue it is being added to
- * and the quota still available in the task in p->bitmap and p->time_slice
- * (see recalc_task_prio above).
+ * determined by recalc_task_prio() above.
  */
 static inline void __enqueue_task(struct task_struct *p, struct rq *rq)
 {
@@ -808,13 +826,14 @@ static void requeue_task(struct task_str
  * task_timeslice - the total duration a task can run during one major
  * rotation.
  */
-static inline unsigned int task_timeslice(struct task_struct *p)
+static inline int task_timeslice(struct task_struct *p)
 {
-	unsigned int slice, rr;
+	int slice, rr;
 
 	slice = rr = p->quota;
 	if (!rt_task(p))
 		slice += (PRIO_RANGE - 1 - TASK_USER_PRIO(p)) * rr;
+	slice = NS_TO_JIFFIES(slice) ? : 1;
 	return slice;
 }
 
@@ -921,22 +940,24 @@ static int effective_prio(struct task_st
 }
 
 /*
- * All tasks have quotas based on rr_interval. From nice 0 to 19 they are
- * all equal to it and below zero they get exponentially larger making their
- * effective quota significantly larger. rt tasks all get rr_interval.
- * ie nice -6..19 = rr_interval. nice -10 = 2.5 * rr_interval
- * nice -20 = 10 * rr_interval. This makes the ratios between -20 and 0
- * similar to the ratios between 0 and +19.
+ * All tasks have quotas based on rr_interval. RT tasks all get rr_interval.
+ * From nice 1 to 19 they get half of it, provided that half is still at least
+ * one tick. Below nice -6 they get progressively larger.
+ * ie nice -6..0 = rr_interval. nice -10 = 2.5 * rr_interval
+ * nice -20 = 10 * rr_interval. nice 1-19 = rr_interval / 2.
  */
 static unsigned int rr_quota(struct task_struct *p)
 {
 	int nice = TASK_NICE(p), rr = rr_interval;
 
-	if (nice < -6 && !rt_task(p)) {
-		rr *= nice * nice;
-		rr /= 40;
+	if (!rt_task(p)) {
+		if (nice < -6) {
+			rr *= nice * nice;
+			rr /= 40;
+		} else if (nice > 0 && (rr * HZ / 1000 / 2) > 0)
+			rr /= 2;
 	}
-	return rr;
+	return MS_TO_NS(rr);
 }
 
 /*
@@ -1550,7 +1571,6 @@ int fastcall wake_up_state(struct task_s
 	return try_to_wake_up(p, state, 0);
 }
 
-static void task_running_tick(struct rq *rq, struct task_struct *p, int tick);
 /*
  * Perform scheduler related setup for a newly forked process p.
  * p is forked by current.
@@ -1596,8 +1616,9 @@ void fastcall sched_fork(struct task_str
 	 * total amount of pending timeslices in the system doesn't change,
 	 * resulting in more scheduling fairness.
 	 */
-	local_irq_disable();
-	p->time_slice = (current->time_slice + 1) >> 1;
+	if (unlikely(p->time_slice < 2))
+		p->time_slice = 2;
+	p->time_slice = current->time_slice >> 1;
 	/*
 	 * The remainder of the first timeslice might be recovered by
 	 * the parent if the child exits early enough.
@@ -1605,16 +1626,6 @@ void fastcall sched_fork(struct task_str
 	p->first_time_slice = 1;
 	current->time_slice >>= 1;
 	p->timestamp = sched_clock();
-	if (!current->time_slice) {
-		/*
-		 * This case happens when the parent has only a single jiffy
-		 * left from its timeslice. Taking the runqueue lock is not
-		 * a problem.
-		 */
-		current->time_slice = 1;
-		task_running_tick(cpu_rq(cpu), current, 0);
-	}
-	local_irq_enable();
 out:
 	put_cpu();
 }
@@ -2026,43 +2037,6 @@ void sched_exec(void)
 }
 
 /*
- * This is a unique version of enqueue_task for the SMP case where a task
- * has just been moved across runqueues. It uses the information from the
- * old runqueue to help it make a decision much like recalc_task_prio. As
- * the new runqueue is almost certainly at a different prio_level than the
- * src_rq it is cheapest just to pick the next entitled slot.
- */
-static inline void enqueue_pulled_task(struct rq *src_rq, struct rq *rq,
-				       struct task_struct *p)
-{
-	int queue_prio;
-
-	p->array = rq->active;
-	if (!rt_task(p)) {
-		if (p->rotation == src_rq->prio_rotation) {
-			if (p->array == src_rq->expired) {
-				queue_expired(p, rq);
-				goto out_queue;
-			}
-		} else
-			task_new_array(p, rq);
-	}
-	queue_prio = next_entitled_slot(p, rq);
-	if (queue_prio >= MAX_PRIO) {
-		queue_expired(p, rq);
-		goto out_queue;
-	}
-	rq_quota(rq, queue_prio) += p->quota;
-	p->prio = queue_prio;
-out_queue:
-	p->normal_prio = p->prio;
-	p->rotation = rq->prio_rotation;
-	sched_info_queued(p);
-	set_dynamic_bit(p, rq);
-	list_add_tail(&p->run_list, p->array->queue + p->prio);
-}
-
-/*
  * pull_task - move a task from a remote runqueue to the local runqueue.
  * Both runqueues must be locked.
  */
@@ -2073,7 +2047,7 @@ static void pull_task(struct rq *src_rq,
 	dec_nr_running(p, src_rq);
 	set_task_cpu(p, this_cpu);
 	inc_nr_running(p, this_rq);
-	enqueue_pulled_task(src_rq, this_rq, p);
+	enqueue_task(p, this_rq);
 	p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
 				+ this_rq->most_recent_timestamp;
 	try_preempt(p, this_rq);
@@ -2998,7 +2972,12 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 static inline void
 update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
 {
-	p->sched_time += now - p->last_ran;
+	cputime64_t time_diff = now - p->last_ran;
+
+	/* cpu scheduler quota accounting is performed here */
+	if (p != rq->idle && p->policy != SCHED_FIFO)
+		p->time_slice -= time_diff;
+	p->sched_time += time_diff;
 	p->last_ran = rq->most_recent_timestamp = now;
 }
 
@@ -3100,7 +3079,6 @@ static void task_expired_entitlement(str
 	struct prio_array *old_array;
 	int old_prio;
 
-	set_tsk_need_resched(p);
 	if (unlikely(p->first_time_slice))
 		p->first_time_slice = 0;
 	if (rt_task(p)) {
@@ -3114,122 +3092,21 @@ static void task_expired_entitlement(str
 	requeue_task(p, rq, old_array, old_prio);
 }
 
-/*
- * A major priority rotation occurs when all priority quotas for this array
- * have been exhausted.
- */
-static inline void major_prio_rotation(struct rq *rq)
-{
-	struct prio_array *new_array = rq->expired;
-
-	rq->expired = rq->active;
-	rq->active = new_array;
-	rq->exp_bitmap = rq->expired->prio_bitmap;
-	rq->dyn_bitmap = rq->active->prio_bitmap;
-	rq->best_static_prio = MAX_PRIO - 1;
-	rq->prio_rotation++;
-}
-
-/*
- * This is the heart of the virtual deadline priority management.
- *
- * We have used up the quota allocated to this priority level so we rotate
- * the prio_level of the runqueue to the next lowest priority. We merge any
- * remaining tasks at this level current_queue with the next priority and
- * reset this level's queue. MAX_PRIO - 1 is a special case where we perform
- * a major rotation.
- */
-static inline void rotate_runqueue_priority(struct rq *rq)
-{
-	int new_prio_level;
-	struct prio_array *array;
-
-	/*
-	 * Make sure we don't have tasks still on the active array that
-	 * haven't run due to not preempting a lower priority task. This can
-	 * happen on list merging or smp balancing.
-	 */
-	if (unlikely(sched_find_first_bit(rq->dyn_bitmap) < rq->prio_level))
-		return;
-
-	array = rq->active;
-	if (rq->prio_level > MAX_PRIO - 2) {
-		/* Major rotation required */
-		struct prio_array *new_queue = rq->expired;
-
-		/*
-		 * On a major rotation we move everything remaining to best
-		 * priority on the new array. The priority matrix bitmap will
-		 * ensure tasks only get the slots each static priority
-		 * deserves.
-		 */
-		new_prio_level = MAX_RT_PRIO;
-		if (!list_empty(array->queue + rq->prio_level)) {
-			list_splice_tail_init(array->queue + rq->prio_level,
-					 new_queue->queue + new_prio_level);
-		}
-		memset(rq->prio_quota, 0, ARRAY_SIZE(rq->prio_quota));
-		major_prio_rotation(rq);
-	} else {
-		/* Minor rotation */
-		new_prio_level = rq->prio_level + 1;
-		__clear_bit(rq->prio_level, rq->dyn_bitmap);
-		if (!list_empty(array->queue + rq->prio_level)) {
-			list_splice_tail_init(array->queue + rq->prio_level,
-					 array->queue + new_prio_level);
-			__set_bit(new_prio_level, rq->dyn_bitmap);
-		}
-		rq_quota(rq, rq->prio_level) = 0;
-	}
-	rq->prio_level = new_prio_level;
-	/*
-	 * As we are merging to a prio_level that may not have anything in
-	 * its quota we add 1 to ensure the tasks get to run in schedule() to
-	 * add their quota to it.
-	 */
-	rq_quota(rq, new_prio_level) += 1;
-}
-
-static void task_running_tick(struct rq *rq, struct task_struct *p, int tick)
+/* This manages tasks that have run out of timeslice during a scheduler_tick */
+static void task_running_tick(struct rq *rq, struct task_struct *p)
 {
-	if (unlikely(!task_queued(p))) {
-		/* Task has expired but was not scheduled yet */
-		set_tsk_need_resched(p);
-		return;
-	}
 	/* SCHED_FIFO tasks never run out of timeslice. */
-	if (unlikely(p->policy == SCHED_FIFO))
+	if (p->time_slice > 0 || p->policy == SCHED_FIFO)
 		return;
-
 	spin_lock(&rq->lock);
-	/*
-	 * Accounting is performed by both the task and the runqueue. This
-	 * allows frequently sleeping tasks to get their proper quota of
-	 * cpu as the runqueue will have their quota still available at
-	 * the appropriate priority level. It also means frequently waking
-	 * tasks that might miss the scheduler_tick() will get forced down
-	 * priority regardless.
-	 */
-	if (!--p->time_slice)
-		task_expired_entitlement(rq, p);
-	/*
-	 * If we're actually calling this function not in a scheduler_tick
-	 * we are doing so to fix accounting across fork and should not be
-	 * deducting anything from rq_quota.
-	 */
-	if (!tick)
-		goto out_unlock;
-	/*
-	 * We only employ the deadline mechanism if we run over the quota.
-	 * It allows aliasing problems around the scheduler_tick to be
-	 * less harmful.
-	 */
-	if (!rt_task(p) && --rq_quota(rq, rq->prio_level) < 0) {
-		if (unlikely(p->first_time_slice))
-			p->first_time_slice = 0;
-		rotate_runqueue_priority(rq);
+	if (unlikely(!task_queued(p))) {
+		/* Task has expired but was not scheduled off yet */
 		set_tsk_need_resched(p);
+		goto out_unlock;
 	}
+	/* p->time_slice <= 0 */
+	task_expired_entitlement(rq, p);
+	set_tsk_need_resched(p);
 out_unlock:
 	spin_unlock(&rq->lock);
 }
@@ -3248,7 +3125,7 @@ void scheduler_tick(void)
 	update_cpu_clock(p, rq, now);
 
 	if (p != rq->idle)
-		task_running_tick(rq, p, 1);
+		task_running_tick(rq, p);
 #ifdef CONFIG_SMP
 	update_load(rq);
 	if (time_after_eq(jiffies, rq->next_balance))
@@ -3295,81 +3172,42 @@ EXPORT_SYMBOL(sub_preempt_count);
 #endif
 
 /*
- * If a task is queued at a priority that isn't from its bitmap we exchange
- * by setting one of the entitlement bits.
- */
-static inline void exchange_slot(struct task_struct *p, struct rq *rq)
-{
-	int slot = next_entitled_slot(p, rq);
-
-	if (slot < MAX_PRIO)
-		__set_bit(USER_PRIO(slot), p->bitmap);
-}
-
-/*
- * next_dynamic_task finds the next suitable dynamic task. As the dyn_bitmap
- * contains all the active and expired dynamic tasks sequentially we only
- * need to do one bitmap lookup.
+ * next_dynamic_task finds the next suitable dynamic task.
  */
 static inline struct task_struct *next_dynamic_task(struct rq *rq, int idx)
 {
 	struct task_struct *next;
 	struct list_head *queue;
 	struct prio_array *array = rq->active;
-	int expirations = 0;
 
 retry:
 	if (idx >= MAX_PRIO) {
-		BUG_ON(++expirations > 1);
-		/*
-		 * We have selected a bit from the expired range so there are
-		 * no more tasks in the active array.
-		 */
-		major_prio_rotation(rq);
-		array = rq->active;
+		/* There are no more tasks in the active array. Swap arrays */
+		array = rq->expired;
+		rq->expired = rq->active;
+		rq->active = array;
+		rq->exp_bitmap = rq->expired->prio_bitmap;
+		rq->dyn_bitmap = rq->active->prio_bitmap;
+		rq->best_static_prio = MAX_PRIO - 1;
+		rq->prio_rotation++;
 		idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO);
 	}
-	if (unlikely(list_empty(array->queue + idx))) {
+	queue = array->queue + idx;
+	next = list_entry(queue->next, struct task_struct, run_list);
+	if (unlikely(next->time_slice < 0)) {
 		/*
-		 * This can happen because they are not always cleared on
-		 * dequeue_task since they may have been dequeued while
-		 * waiting on a runqueue and a rotation has occurred in the
-		 * interim. A very rare occurrence.
+		 * This task was unlucky enough to run out of time_slice
+		 * before it hit a scheduler_tick. Reassess its priority
+		 * now, then retry the selection, which may pick another
+		 * task or possibly the same one.
 		 */
-		__clear_bit(idx, rq->dyn_bitmap);
-		idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, idx + 1);
+		task_expired_entitlement(rq, next);
+		idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO);
 		goto retry;
 	}
-	queue = array->queue + idx;
-	next = list_entry(queue->next, struct task_struct, run_list);
 	rq->prio_level = idx;
-	/*
-	 * When the task is chosen it is checked to see if its quota has been
-	 * added to this runqueue level which is only performed once per
-	 * level per major rotation for each running task.
-	 */
-	if (next->rotation != rq->prio_rotation) {
-			/* Task has moved during major rotation */
-			task_new_array(next, rq);
-			if (!entitled_slot(next->static_prio, idx))
-				exchange_slot(next, rq);
-			set_task_entitlement(next);
-			rq_quota(rq, idx) += next->quota;
-	} else if (!test_bit(USER_PRIO(idx), next->bitmap)) {
-			/* Task has moved during minor rotation */
-			if (!entitled_slot(next->static_prio, idx))
-				exchange_slot(next, rq);
-			set_task_entitlement(next);
-			rq_quota(rq, idx) += next->quota;
-	}
-	/*
-	 * next needs to have its prio and array reset here in case the
-	 * values are wrong due to priority rotation.
-	 */
-	next->prio = idx;
-	next->array = array;
-	if (next->static_prio < rq->best_static_prio &&
-	    next->policy != SCHED_BATCH)
+	next->rotation = rq->prio_rotation;
+	if (next->static_prio < rq->best_static_prio)
 		rq->best_static_prio = next->static_prio;
 	return next;
 }
@@ -4043,7 +3881,7 @@ asmlinkage long sys_nice(int increment)
  *
  * This is the priority value as seen by users in /proc.
  * RT tasks are offset by -200. Normal tasks are centered
- * around 0, value goes from 0 to +19.
+ * around 0, value goes from 0 to +39.
  */
 int task_prio(const struct task_struct *p)
 {
@@ -4832,9 +4670,9 @@ void __cpuinit init_idle(struct task_str
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 
-	bitmap_zero(idle->bitmap, PRIO_RANGE + 1);
-	idle->timestamp = sched_clock();
-	idle->array = NULL;
+	bitmap_zero(idle->bitmap, PRIO_RANGE);
+	idle->timestamp = idle->last_ran = sched_clock();
+	idle->array = rq->active;
 	idle->prio = idle->normal_prio = NICE_TO_PRIO(0);
 	idle->state = TASK_RUNNING;
 	idle->cpus_allowed = cpumask_of_cpu(cpu);
@@ -6824,6 +6662,8 @@ void __init sched_init(void)
 		rq->exp_bitmap = rq->expired->prio_bitmap;
 
 #ifdef CONFIG_SMP
+		rq->active->rq = rq;
+		rq->expired->rq = rq;
 		rq->sd = NULL;
 		for (j = 1; j < 3; j++)
 			rq->cpu_load[j] = 0;
@@ -6844,14 +6684,12 @@ void __init sched_init(void)
 			/* delimiter for bitsearch */
 			__set_bit(MAX_PRIO, array->prio_bitmap);
 		}
-		for (k = 0; k < PRIO_RANGE; k++)
-			rq->prio_quota[k] = 0;
 
 		/* Every added cpu increases the rr_interval */
 		rr_us += rr_inc;
 		rr_inc /= 2;
 	}
-	rr_interval = rr_us / 1000 ? : 1;
+	rr_interval = rr_us / 1000;
 
 	set_load_weight(&init_task);
 
