--- Documentation/sched-design.txt | 167 +++++++++-------- include/asm-generic/bitops/sched.h | 10 - include/asm-s390/bitops.h | 12 + kernel/sched.c | 354 +++++++++++++++++++------------------ 4 files changed, 288 insertions(+), 255 deletions(-) Index: linux-2.6.20.3-rsdl/Documentation/sched-design.txt =================================================================== --- linux-2.6.20.3-rsdl.orig/Documentation/sched-design.txt 2007-03-17 00:00:28.000000000 +1100 +++ linux-2.6.20.3-rsdl/Documentation/sched-design.txt 2007-03-17 00:01:15.000000000 +1100 @@ -196,17 +196,16 @@ constraints of strict fairness. Design description ================== -RSDL works off the principle of providing each task a quota of runtime that -it is allowed to run at each priority level equal to its static priority -(ie. its nice level) and every priority below that. When each task is queued, -the cpu that it is queued onto also keeps a record of that quota. If the -task uses up its quota it is decremented one priority level. Also, if the cpu -notices a quota full has been used for that priority level, it pushes -everything remaining at that priority level to the next lowest priority -level. Once every runtime quota has been consumed of every priority level, -a task is queued on the "expired" array. When no other tasks exist with -quota, the expired array is activated and fresh quotas are handed out. This -is all done in O(1). +RSDL works off the principle of providing each task a quota of runtime that it +is allowed to run at a number of priority levels determined by its static +priority (ie. its nice level). When each task is queued, the cpu that it is +queued onto also keeps a record of that quota. If the task uses up its quota it +has its priority decremented to the next level. Also, if the cpu notices a quota +full has been used for that priority level, it pushes everything remaining at +that priority level to the next lowest priority level. 
Once every runtime quota +has been consumed of every priority level, a task is queued on the "expired" +array. When no other tasks exist with quota, the expired array is activated and +fresh quotas are handed out. This is all done in O(1). Design details @@ -227,22 +226,25 @@ on in p->rotation. It also keeps a recor already been allocated quota from during this epoch in a bitmap p->bitmap. The only tunable that determines all other details is the RR_INTERVAL. This -is set to 6ms (minimum on 1000HZ, higher at different HZ values). +is set to 8ms (minimum on 1000HZ, higher at different HZ values), and is +scaled gently upwards with more cpus. All tasks are initially given a quota based on RR_INTERVAL. This is equal to -RR_INTERVAL between nice values of 0 and 19, and progressively larger for -nice values from -1 to -20. This is assigned to p->quota and only changes -with changes in nice level. - -As a task is first queued, it checks in recalc_task_prio to see if it has -run at this runqueue's current priority rotation. If it has not, it will -have its p->prio level set to equal its p->static_prio (nice level) and will -be given a p->time_slice equal to the p->quota, and has its allocation -bitmap bit set in p->bitmap for its static priority (nice value). This -quota is then also added to the current runqueue's rq->prio_quota[p->prio]. -It is then queued on the current active priority array. +RR_INTERVAL between nice values of 0 and 19, and progressively larger for nice +values from -1 to -20. This is to maintain a relationship of nice 19 having +approximately 1/20th of the cpu of nice 0, and nice 0 having 1/20th the cpu of +nice -20. This is assigned to p->quota and only changes with changes in nice +level. + +As a task is first queued, it checks in recalc_task_prio to see if it has run at +this runqueue's current priority rotation. 
If it has not, it will have its +p->prio level set according to the first slot in a "priority matrix" and will be +given a p->time_slice equal to the p->quota, and has its allocation bitmap bit +set in p->bitmap for this prio level. This quota is then also added to the +current runqueue's rq->prio_quota[p->prio]. It is then queued on the current +active priority array. -If a task has already been running during this major epoch, if it has +If a task has already been running during this major epoch, and it has p->time_slice left and the rq->prio_quota for the task's p->prio still has quota, it will be placed back on the active array, but no more quota will be added to either the task or the runqueue quota. @@ -260,12 +262,7 @@ any entitlement left in p->bitmap and no bitmap cleared, and be queued at its p->static_prio again, but on the expired priority array. No quota will be allocated until this task is scheduled. -When a task is queued, it has its static_prio bit set in the current -runqueue's rq->static_bitmap, and the relevant bit in the rq->dyn_bitmap. -In order to minimise the number of bitmap lookups, the bitmap of queued -tasks on the expired array is at the end of the same bitmap as the active -array. The number of tasks queued at the current static_prio is kept in -rq->prio_queued[]. +When a task is queued, it has its relevant bit set in the array->prio_bitmap. During a scheduler_tick where a task is running, the p->time_slice is decremented, and if it reaches zero then the recalc_task_prio is readjusted @@ -280,7 +277,7 @@ A minor rotation takes the remaining tas merges them with a list_splice_tail with the queue from the next lowest priority level. At this time, any tasks that have been merged will now have invalid values in p->prio so this must be considered when dequeueing -the task, and for testing for preemption. +and scheduling the task. 
A major rotation takes the remaining tasks at this priority level queue and merges them with a list_splice_tail with the best priority task running on @@ -293,16 +290,43 @@ When a task is dequeued, the dyn_bitmap that the relevant queue is actually empty since p->prio may be inaccurate and no hard accounting of the number of tasks at that level is possible. -When selecting a new task for scheduling, after the first dynamic bit is -found on the dyn_bitmap, it is checked to see that a task is really queued -at that priority or if it is a false positive due to the task being -dequeued at a time when its p->prio does not match which queue it is on -after some form of priority rotation. This is a rare occurrence as it tends -to only occur if a task that is already waiting on a runqueue gets dequeued. -If the bitmap value is in the expired array range, a major priority rotation -is performed. If the chosen task has not been running during this major or -minor rotation it has new quota allocated at this time, and added to the -runqueue's quota. +When selecting a new task for scheduling, after the first dynamic bit is found +on the dyn_bitmap, it is checked to see that a task is really queued at that +priority or if it is a false positive due to the task being dequeued at a time +when its p->prio does not match which queue it is on after some form of priority +rotation. This is a rare occurrence as it tends to only occur if a task that is +already waiting on a runqueue gets dequeued. If no tasks remain on the active +array, a major priority rotation is performed. If the chosen task has not been +running during this major or minor rotation it has new quota allocated at this +time, and added to the runqueue's quota. + +If a task finds itself merged at a priority level that it does not normally +receive quota at (due to list merging) it will remove one of its normal +priority slots to compensate. 
+ + +Priority Matrix +=============== + +In order to minimise the latencies between tasks of different nice levels +running concurrently, the dynamic priority slots where different nice levels +are queued are dithered instead of being sequential. What this means is that +there are 40 priority slots where a task may run during one major rotation, +and the allocation of slots is dependent on nice level. In the +following table, a zero represents a slot where the task may run. + +nice -20 0000000000000000000000000000000000000000 +nice -10 1001000100100010001001000100010010001000 +nice 0 0101010101010101010101010101010101010101 +nice 5 1101011010110101101011010110101101011011 +nice 10 0110111011011101110110111011101101110111 +nice 15 0111110111111011111101111101111110111111 +nice 19 1111111111111111111011111111111111111111 + +As can be seen, a nice -20 task runs in every priority slot whereas a nice 19 +task only runs one slot per major rotation. This dithered table allows for the +smallest possible maximum latencies between tasks of varying nice levels, thus +allowing vastly different nice levels to be used. Modelling deadline behaviour @@ -315,16 +339,16 @@ conditions. This is a virtual deadline m runqueue epochs, and not by trying to keep complicated accounting of each task. -The maximum duration a task can run during one major epoch is determined -by its nice value. Nice 0 tasks can run at 19 different priority levels -for RR_INTERVAL duration during each epoch (the equivalent of nice 0 to nice -19). Nice 10 tasks can run at 9 priority levels for each epoch, and so on. +The maximum duration a task can run during one major epoch is determined by its +nice value. Nice 0 tasks can run at 19 different priority levels for RR_INTERVAL +duration during each epoch. Nice 10 tasks can run at 9 priority levels for each +epoch, and so on. The table in the priority matrix above demonstrates how this +is enforced. 
Therefore the maximum duration a runqueue epoch can take is determined by the number of tasks running, and their nice level. After that, the maximum duration it can take before a task can wait before it get scheduled is -determined by the difference between its nice value and the nice value of -the highest priority task queued. +determined by the position of its first slot on the matrix. In the following examples, these are _worst case scenarios_ and would rarely occur, but can be modelled nonetheless to determine the maximum possible @@ -335,41 +359,21 @@ another is activated for the first time runqueue rotation, the first task will wait: nr_tasks * max_duration + nice_difference * rr_interval -1 * 19 * RR_INTERVAL + 0 = 114ms +1 * 19 * RR_INTERVAL + 0 = 152ms In the presence of a nice 10 task, a nice 0 task would wait a maximum of -1 * 10 * RR_INTERVAL + 0 = 60ms +1 * 10 * RR_INTERVAL + 0 = 80ms In the presence of a nice 0 task, a nice 10 task would wait a maximum of -1 * 19 * RR_INTERVAL + 9 * RR_INTERVAL = 168ms +1 * 19 * RR_INTERVAL + 1 * RR_INTERVAL = 160ms -Using a more complicated example, if there are 4 tasks running fully cpu -bound, one each at nice -20, nice 0, nice 10 and nice 19, we can calculate -the maximum latency possible for the nice 10 task. Note that -20 tasks are -heavily biased for so this will be a long time, but can be modelled. - -The nice -20 task has quota = RR_INTERVAL + 20*RR_INTERVAL = 21*RR_INTERVAL. -It can run at 39 priority levels so its maximum duration = -39 * 21 * RR_INTERVAL. -The nice 0 task works out to -19 * RR_INTERVAL -The nice 19 task works out to -RR_INTERVAL. - -So major epoch can take up a maximum of -39 * 21 * RR_INTERVAL + 19 * RR_INTERVAL + RR_INTERVAL = 1229 * RR_INTERVAL; - -Then before the nice 10 task will run, the nice -20 and nice 0 task will -run for 28 * 21 * RR_INTERVAL and 9 * RR_INTERVAL respectively for a total -of 597 * RR_INTERVAL. 
- -This means the maximum duration a nice 10 task can wait in the presence of -these other tasks is 1826*RR_INTERVAL. This is a long time of course and is -heavily penalised by the presence of nice -20 tasks which would not be part -of a normal environment. - -While this section describes the maximum latency a task can have, this size -latencies will only be seen by fully cpu bound tasks. +More useful than these values, though, are the average latencies which are +a matter of determining the average distance between priority slots of +different nice values and multiplying them by the tasks' quota. For example +in the presence of a nice -10 task, a nice 0 task will wait either one or +two slots. Given that nice -10 tasks have a quota 2.5 times the RR_INTERVAL, +this means the latencies will alternate between 2.5 and 5 RR_INTERVALs or +20 and 40ms respectively (on uniprocessor at 1000HZ). Achieving interactivity @@ -421,6 +425,13 @@ current task if it is not of a sleeping low latency for interactive tasks, and the lowest latencies for the least cpu bound tasks. +One of the potential disadvantages of a strict fairness design is that users +may prefer a degree of unfairness towards certain tasks (such as a gui) and +will notice the relative slowdown that occurs under load. As the dithered +matrix minimises the latencies when differential nice levels are used, this +can be countered by running a gui at a negative nice value such as -10 without +causing adversely large latencies in nice 0 tasks. + -Wed, 28 Feb 2007 +Fri, 16 Mar 2007 Con Kolivas Index: linux-2.6.20.3-rsdl/include/asm-generic/bitops/sched.h =================================================================== --- linux-2.6.20.3-rsdl.orig/include/asm-generic/bitops/sched.h 2007-03-17 00:00:28.000000000 +1100 +++ linux-2.6.20.3-rsdl/include/asm-generic/bitops/sched.h 2007-03-17 00:01:15.000000000 +1100 @@ -6,8 +6,8 @@ /* * Every architecture must define this function. 
It's the fastest - * way of searching a 180-bit bitmap where the first 100 bits are - * unlikely to be set. It's guaranteed that at least one of the 180 + * way of searching a 140-bit bitmap where the first 100 bits are + * unlikely to be set. It's guaranteed that at least one of the 140 * bits is cleared. */ static inline int sched_find_first_bit(const unsigned long *b) @@ -15,7 +15,7 @@ static inline int sched_find_first_bit(c #if BITS_PER_LONG == 64 if (unlikely(b[0])) return __ffs(b[0]); - if (b[1]) + if (likely(b[1])) return __ffs(b[1]) + 64; return __ffs(b[2]) + 128; #elif BITS_PER_LONG == 32 @@ -27,9 +27,7 @@ static inline int sched_find_first_bit(c return __ffs(b[2]) + 64; if (b[3]) return __ffs(b[3]) + 96; - if (b[4]) - return __ffs(b[4]) + 128; - return __ffs(b[5]) + 160; + return __ffs(b[4]) + 128; #else #error BITS_PER_LONG not defined #endif Index: linux-2.6.20.3-rsdl/include/asm-s390/bitops.h =================================================================== --- linux-2.6.20.3-rsdl.orig/include/asm-s390/bitops.h 2007-03-17 00:00:28.000000000 +1100 +++ linux-2.6.20.3-rsdl/include/asm-s390/bitops.h 2007-03-17 00:01:15.000000000 +1100 @@ -729,7 +729,17 @@ find_next_bit (const unsigned long * add return offset + find_first_bit(p, size); } -#include +/* + * Every architecture must define this function. It's the fastest + * way of searching a 140-bit bitmap where the first 100 bits are + * unlikely to be set. It's guaranteed that at least one of the 140 + * bits is cleared. + */ +static inline int sched_find_first_bit(unsigned long *b) +{ + return find_first_bit(b, 140); +} + #include #include Index: linux-2.6.20.3-rsdl/kernel/sched.c =================================================================== --- linux-2.6.20.3-rsdl.orig/kernel/sched.c 2007-03-17 00:00:28.000000000 +1100 +++ linux-2.6.20.3-rsdl/kernel/sched.c 2007-03-17 00:03:17.000000000 +1100 @@ -17,7 +17,7 @@ * 2003-09-03 Interactivity tuning by Con Kolivas. 
* 2004-04-02 Scheduler domains code by Nick Piggin * 2007-03-02 Rotating Staircase deadline scheduling policy by Con Kolivas - * RSDL v0.30 + * RSDL v0.31 */ #include @@ -76,29 +76,44 @@ #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) #define SCHED_PRIO(p) ((p)+MAX_RT_PRIO) -#define MAX_DYN_PRIO (MAX_PRIO + PRIO_RANGE) -/* - * Preemption needs to take into account that a low priority task can be - * at a higher prio due to list merging. Its priority is artificially - * elevated and it should be preempted if anything higher priority wakes up - * provided it is not a realtime comparison. - */ -#define TASK_PREEMPTS_CURR(p, curr) \ - (((p)->prio < (curr)->prio) || (!rt_task(p) && \ - ((p)->static_prio < (curr)->static_prio && \ - ((curr)->static_prio > (curr)->prio)))) +#define TASK_PREEMPTS_CURR(p, curr) ((p)->prio < (curr)->prio) /* * This is the time all tasks within the same priority round robin. - * Set to a minimum of 6ms. + * Set to a minimum of 8ms. Scales with number of cpus and rounds with HZ. + */ +static unsigned int rr_interval __read_mostly; +#define RR_INTERVAL 8 +#define DEF_TIMESLICE (rr_interval * 20) + +/* + * This contains a bitmap for each dynamic priority level with empty slots + * for the valid priorities each different nice level can have. It allows + * us to stagger the slots where differing priorities run in a way that + * keeps latency differences between different nice levels at a minimum. 
+ * ie, where 0 means a slot for that priority, priority running from left to + * right: + * nice -20 0000000000000000000000000000000000000000 + * nice -10 1001000100100010001001000100010010001000 + * nice 0 0101010101010101010101010101010101010101 + * nice 5 1101011010110101101011010110101101011011 + * nice 10 0110111011011101110110111011101101110111 + * nice 15 0111110111111011111101111101111110111111 + * nice 19 1111111111111111111011111111111111111111 */ -#define RR_INTERVAL ((6 * HZ / 1001) + 1) -#define DEF_TIMESLICE (RR_INTERVAL * 20) +static unsigned long prio_matrix[PRIO_RANGE][BITS_TO_LONGS(PRIO_RANGE)] + __read_mostly; struct prio_array { struct list_head queue[MAX_PRIO]; /* Tasks queued at each priority */ + + DECLARE_BITMAP(prio_bitmap, MAX_PRIO + 1); + /* + * The bitmap of priorities queued; The dynamic bits can have + * false positives. Include 1 bit for delimiter. + */ }; /* @@ -136,20 +151,6 @@ struct rq { unsigned long next_balance; struct mm_struct *prev_mm; - DECLARE_BITMAP(dyn_bitmap, MAX_DYN_PRIO + 1); - /* - * The bitmap of priorities queued; The extra PRIO_RANGE at the end - * is for a bitmap of expired tasks queued. This minimises the number - * of bit lookups over prio_array swaps. The dynamic bits can have - * false positives. Include 1 bit for delimiter. 
- */ - - DECLARE_BITMAP(static_bitmap, MAX_PRIO); - /* The bitmap of all static priorities queued */ - - unsigned long prio_queued[MAX_PRIO]; - /* The number of tasks at each static priority */ - long prio_quota[PRIO_RANGE]; /* * The quota of ticks the runqueue runs at each dynamic priority @@ -157,6 +158,7 @@ struct rq { */ struct prio_array *active, *expired, arrays[2]; + unsigned long *dyn_bitmap, *exp_bitmap; int prio_level; /* The current dynamic priority level this runqueue is at */ @@ -609,34 +611,16 @@ static inline int task_queued(struct tas static inline void set_task_entitlement(struct task_struct *p) { __set_bit(USER_PRIO(p->prio), p->bitmap); - - /* - * In the case this task has been part of a merged list that has - * made it to higher priority than it should be, we remove the - * quota from its own priority since it will get a quota at this - * priority. - */ - if (p->normal_prio < p->static_prio) - __set_bit(USER_PRIO(p->static_prio), p->bitmap); p->time_slice = p->quota; } /* - * Only the static_bitmap has hard accounting. The dynamic bits can have + * There is no specific hard accounting. The dynamic bits can have * false positives. rt_tasks can only be on the active queue. 
*/ static inline void set_dynamic_bit(struct task_struct *p, struct rq *rq) { - if (p->array == rq->active) - __set_bit(p->prio, rq->dyn_bitmap); - else - __set_bit(p->prio + PRIO_RANGE, rq->dyn_bitmap); -} - -static inline void set_queue_bits(struct rq *rq, struct task_struct *p) -{ - __set_bit(p->static_prio, rq->static_bitmap); - set_dynamic_bit(p, rq); + __set_bit(p->prio, p->array->prio_bitmap); } /* @@ -648,15 +632,8 @@ static inline void set_queue_bits(struct static void dequeue_task(struct task_struct *p, struct rq *rq) { list_del_init(&p->run_list); - if (!--rq->prio_queued[p->static_prio]) - __clear_bit(p->static_prio, rq->static_bitmap); - if (list_empty(p->array->queue + p->prio)) { - int bitmap_prio = p->prio; - - if (p->array == rq->expired) - bitmap_prio += PRIO_RANGE; - __clear_bit(bitmap_prio, rq->dyn_bitmap); - } + if (list_empty(p->array->queue + p->prio)) + __clear_bit(p->prio, p->array->prio_bitmap); } /* @@ -669,16 +646,31 @@ static inline void task_new_array(struct p->rotation = rq->prio_rotation; } -static inline void queue_expired(struct task_struct *p, struct rq *rq) +static inline int first_prio_slot(struct task_struct *p) +{ + return SCHED_PRIO(find_first_zero_bit( + prio_matrix[USER_PRIO(p->static_prio)], PRIO_RANGE)); +} + +static inline int next_prio_slot(struct task_struct *p, int prio) +{ + DECLARE_BITMAP(tmp, PRIO_RANGE); + bitmap_or(tmp, p->bitmap, prio_matrix[USER_PRIO(p->static_prio)], + PRIO_RANGE); + return SCHED_PRIO(find_next_zero_bit(tmp, PRIO_RANGE, + USER_PRIO(prio))); +} + +static void queue_expired(struct task_struct *p, struct rq *rq) { - p->prio = p->normal_prio = p->static_prio; p->array = rq->expired; - bitmap_zero(p->bitmap, PRIO_RANGE); - p->rotation = rq->prio_rotation; + task_new_array(p, rq); + p->prio = p->normal_prio = first_prio_slot(p); p->time_slice = p->quota; } #define rq_quota(rq, prio) ((rq)->prio_quota[USER_PRIO(prio)]) + /* * recalc_task_prio determines what prio a non rt_task will be * queued at. 
If the task has already been running during this runqueue's @@ -693,12 +685,23 @@ static inline void queue_expired(struct static void recalc_task_prio(struct task_struct *p, struct rq *rq) { struct prio_array *array = rq->active; - int queue_prio, search_prio; + int queue_prio, search_prio = MAX_RT_PRIO; + + /* + * SCHED_BATCH tasks never start at better priority than any other + * task that is already running since they are flagged as latency + * insensitive. This means they never cause greater latencies in other + * non SCHED_BATCH tasks of the same nice level, but they still will + * not be exposed to high latencies themselves. + */ + if (unlikely(p->policy == SCHED_BATCH)) + search_prio = rq->prio_level; if (p->rotation == rq->prio_rotation) { if (p->array == array) { if (p->time_slice && rq_quota(rq, p->prio)) return; + search_prio = p->prio; } else if (p->array == rq->expired) { queue_expired(p, rq); return; @@ -706,19 +709,9 @@ static void recalc_task_prio(struct task task_new_array(p, rq); } else task_new_array(p, rq); - search_prio = p->static_prio; - /* - * SCHED_BATCH tasks never start at better priority than any other - * task that is already running since they are flagged as latency - * insensitive. This means they never cause greater latencies in other - * non SCHED_BATCH tasks of the same nice level. - */ - if (unlikely(p->policy == SCHED_BATCH)) - search_prio = max(p->static_prio, rq->prio_level); - queue_prio = SCHED_PRIO(find_next_zero_bit(p->bitmap, PRIO_RANGE, - USER_PRIO(search_prio))); - if (queue_prio == MAX_PRIO) { + queue_prio = next_prio_slot(p, search_prio); + if (queue_prio >= MAX_PRIO) { queue_expired(p, rq); return; } @@ -732,9 +725,7 @@ static void recalc_task_prio(struct task * Adding to a runqueue. The dynamic priority queue that it is added to is * determined by the priority rotation of the runqueue it is being added to * and the quota still available in the task in p->bitmap and p->time_slice - * (see recalc_task_prio above). 
The rq static_bitmap stores a list of - * the static priorities, and prio_queued the number of tasks stored at each - * p->static_prio level. + * (see recalc_task_prio above). */ static inline void __enqueue_task(struct task_struct *p, struct rq *rq) { @@ -742,10 +733,9 @@ static inline void __enqueue_task(struct p->array = rq->active; else recalc_task_prio(p, rq); - rq->prio_queued[p->static_prio]++; sched_info_queued(p); - set_queue_bits(rq, p); + set_dynamic_bit(p, rq); } static void enqueue_task(struct task_struct *p, struct rq *rq) @@ -767,13 +757,12 @@ static inline void enqueue_task_head(str static void requeue_task(struct task_struct *p, struct rq *rq, struct prio_array *old_array, int old_prio) { + if (p->array == rq->expired) + queue_expired(p, rq); list_move_tail(&p->run_list, p->array->queue + p->prio); if (!rt_task(p)) { - if (list_empty(old_array->queue + old_prio)) { - if (old_array == rq->expired) - old_prio += PRIO_RANGE; - __clear_bit(old_prio, rq->dyn_bitmap); - } + if (list_empty(old_array->queue + old_prio)) + __clear_bit(old_prio, old_array->prio_bitmap); set_dynamic_bit(p, rq); } } @@ -796,7 +785,7 @@ static inline unsigned int task_timeslic unsigned int slice, rr; slice = rr = p->quota; - if (likely(!rt_task(p))) + if (!rt_task(p)) slice += (PRIO_RANGE - 1 - TASK_USER_PRIO(p)) * rr; return slice; } @@ -811,7 +800,7 @@ static inline unsigned int task_timeslic (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) #define TASK_LOAD_WEIGHT(p) LOAD_WEIGHT(task_timeslice(p)) #define RTPRIO_TO_LOAD_WEIGHT(rp) \ - (LOAD_WEIGHT((RR_INTERVAL + 20 + (rp)))) + (LOAD_WEIGHT((rr_interval + 20 + (rp)))) static void set_load_weight(struct task_struct *p) { @@ -878,7 +867,10 @@ static inline int normal_prio(struct tas if (has_rt_policy(p)) return MAX_RT_PRIO-1 - p->rt_priority; /* Other tasks all have normal_prio set in recalc_task_prio */ - return p->static_prio; + if (likely(p->prio >= MAX_RT_PRIO)) + return p->prio; + else + return p->static_prio; } /* @@ 
-901,17 +893,22 @@ static int effective_prio(struct task_st } /* - * All tasks have quotas based on RR_INTERVAL. From nice 0 to 19 they are - * all equal to it and below zero they get progressively larger making their - * effective quota significantly larger. rt tasks all get RR_INTERVAL. - */ -static unsigned int rr_interval(struct task_struct *p) -{ - int nice = TASK_NICE(p); - - if (nice < 0 && !rt_task(p)) - return RR_INTERVAL * (20 - nice) / 20; - return RR_INTERVAL; + * All tasks have quotas based on rr_interval. From nice 0 to 19 they are + * all equal to it and below zero they get exponentially larger making their + * effective quota significantly larger. rt tasks all get rr_interval. + * ie nice -6..19 = rr_interval. nice -10 = 2.5 * rr_interval + * nice -20 = 10 * rr_interval. This makes the ratios between -20 and 0 + * similar to the ratios between 0 and +19. + */ +static unsigned int rr_quota(struct task_struct *p) +{ + int neg_nice = -TASK_NICE(p), rr = rr_interval; + + if (neg_nice > 6 && !rt_task(p)) { + rr *= neg_nice * neg_nice; + rr /= 40; + } + return rr; } /* @@ -941,7 +938,7 @@ static void activate_task(struct task_st (now - p->timestamp) >> 20); } - p->quota = rr_interval(p); + p->quota = rr_quota(p); p->prio = effective_prio(p); p->timestamp = now; __activate_task(p, rq); @@ -2056,6 +2053,17 @@ int can_migrate_task(struct task_struct return 1; } +static inline int rq_best_prio(struct rq *rq) +{ + int best_prio, exp_prio; + + best_prio = sched_find_first_bit(rq->dyn_bitmap); + exp_prio = find_next_bit(rq->exp_bitmap, MAX_PRIO, MAX_RT_PRIO); + if (unlikely(best_prio > exp_prio)) + best_prio = exp_prio; + return best_prio; +} + /* * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted * load from busiest to this_rq, as part of a balancing operation within @@ -2068,7 +2076,7 @@ static int move_tasks(struct rq *this_rq struct sched_domain *sd, enum idle_type idle, int *all_pinned) { - int idx, test_idx, pulled = 0, pinned = 
0, this_best_prio, best_prio, + int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, best_prio_seen, skip_for_load; struct prio_array *array; struct list_head *head, *curr; @@ -2080,8 +2088,8 @@ static int move_tasks(struct rq *this_rq rem_load_move = max_load_move; pinned = 1; - this_best_prio = this_rq->curr->prio; - best_prio = busiest->curr->prio; + this_best_prio = rq_best_prio(this_rq); + best_prio = rq_best_prio(busiest); /* * Enable handling of the case where there is more than one task * with the best priority. If the current running task is one @@ -2095,33 +2103,27 @@ static int move_tasks(struct rq *this_rq * We first consider expired tasks. Those will likely not be * executed in the near future, and they are most likely to * be cache-cold, thus switching CPUs has the least effect - * on them. This is done by starting the search at priority - * MAX_PRIO since expired bits are MAX_PRIO...MAX_DYN_PRIO-1 + * on them. */ array = busiest->expired; - test_idx = MAX_PRIO; +new_array: + /* Start searching at priority 0: */ + idx = 0; skip_bitmap: - if (!test_idx) - idx = sched_find_first_bit(busiest->dyn_bitmap); + if (!idx) + idx = sched_find_first_bit(array->prio_bitmap); else - idx = find_next_bit(busiest->dyn_bitmap, MAX_DYN_PRIO, - test_idx); - if (idx >= MAX_DYN_PRIO) { + idx = find_next_bit(array->prio_bitmap, MAX_PRIO, idx); + if (idx >= MAX_PRIO) { if (array == busiest->expired) { array = busiest->active; - test_idx = 0; - goto skip_bitmap; + goto new_array; } goto out; } - test_idx = idx; - if (idx >= MAX_PRIO) { - if (array == busiest->active) - goto out; - idx -= PRIO_RANGE; - } - if (list_empty(array->queue + idx)) { - __clear_bit(test_idx, busiest->dyn_bitmap); + + if (unlikely(list_empty(array->queue + idx))) { + __clear_bit(idx, array->prio_bitmap); goto skip_bitmap; } @@ -2146,7 +2148,7 @@ skip_queue: best_prio_seen |= idx == best_prio; if (curr != head) goto skip_queue; - test_idx++; + idx++; goto skip_bitmap; } @@ -2163,7 +2165,7 @@ 
skip_queue: this_best_prio = idx; if (curr != head) goto skip_queue; - test_idx++; + idx++; goto skip_bitmap; } out: @@ -3071,10 +3073,9 @@ static inline void major_prio_rotation(s rq->expired = rq->active; rq->active = new_array; + rq->exp_bitmap = rq->expired->prio_bitmap; + rq->dyn_bitmap = rq->active->prio_bitmap; rq->prio_rotation++; - bitmap_zero(rq->dyn_bitmap, MAX_DYN_PRIO); - bitmap_copy(rq->dyn_bitmap, rq->static_bitmap, MAX_PRIO); - __set_bit(MAX_DYN_PRIO, rq->dyn_bitmap); } /* @@ -3105,14 +3106,12 @@ static inline void rotate_runqueue_prior struct prio_array *new_queue = rq->expired; /* - * The static_bitmap gives us the highest p->static prio task - * that is queued. This value is used as the prio after - * the major rotation and all tasks remaining on this - * active queue are moved there. This means tasks can end - * up a p->prio better than their p->static_prio. + * On a major rotation we move everything remaining to best + * priority on the new array. The priority matrix bitmap will + * ensure tasks only get the slots each static priority + * deserves. */ - new_prio_level = find_next_bit(rq->static_bitmap, MAX_PRIO, - MAX_RT_PRIO); + new_prio_level = MAX_RT_PRIO; if (!list_empty(array->queue + rq->prio_level)) { list_splice_tail_init(array->queue + rq->prio_level, new_queue->queue + new_prio_level); @@ -3374,39 +3373,23 @@ EXPORT_SYMBOL(sub_preempt_count); #endif +/* Is a dynamic_prio part of the allocated slots for this static_prio */ +static inline int entitled_slot(int static_prio, int dynamic_prio) +{ + return !test_bit(USER_PRIO(dynamic_prio), + prio_matrix[USER_PRIO(static_prio)]); +} + /* - * Leave this debugging in until we are certain all bitmap manipulations are - * working as desired since we can safely get out of this situation. + * If a task is queued at a priority that isn't from its bitmap we exchange + * by setting one of the entitlement bits. 
*/ -static noinline int rq_bitmap_error(struct rq *rq) +static inline void exchange_slot(struct task_struct *p, int prio) { - static int bitmap_error = 0; - struct prio_array *array; - struct list_head *queue; - int idx, test_idx; + int slot = next_prio_slot(p, prio); - printk(KERN_ERR - "SCHEDULER BITMAP ERROR %d - attempting to reconstruct...\n", - ++bitmap_error); - for (test_idx = MAX_RT_PRIO ; test_idx < MAX_DYN_PRIO ; test_idx++) { - if (test_idx < MAX_PRIO) { - idx = test_idx; - array = rq->active; - } else { - idx = test_idx - PRIO_RANGE; - array = rq->expired; - } - queue = array->queue + idx; - if (!list_empty(queue)) { - if (!test_bit(test_idx, rq->dyn_bitmap)) { - __set_bit(test_idx, rq->dyn_bitmap); - } - } - } - idx = find_next_bit(rq->dyn_bitmap, MAX_DYN_PRIO, MAX_RT_PRIO); - /* We hit a real bug. There is no way out of this */ - BUG_ON(idx == MAX_DYN_PRIO); - return idx; + if (slot < MAX_PRIO) + __set_bit(USER_PRIO(slot), p->bitmap); } /* @@ -3419,18 +3402,18 @@ static inline struct task_struct *next_d struct task_struct *next; struct list_head *queue; struct prio_array *array = rq->active; + int expirations = 0; retry: - if (unlikely(idx == MAX_DYN_PRIO)) - idx = rq_bitmap_error(rq); if (idx >= MAX_PRIO) { + BUG_ON(++expirations > 1); /* * We have selected a bit from the expired range so there are * no more tasks in the active array. */ major_prio_rotation(rq); array = rq->active; - idx -= PRIO_RANGE; + idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO); } if (unlikely(list_empty(array->queue + idx))) { /* @@ -3440,7 +3423,7 @@ retry: * interim. A very rare occurrence. 
*/ __clear_bit(idx, rq->dyn_bitmap); - idx = find_next_bit(rq->dyn_bitmap, MAX_DYN_PRIO, idx + 1); + idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, idx + 1); goto retry; } queue = array->queue + idx; @@ -3453,10 +3436,14 @@ retry: if (next->rotation != rq->prio_rotation) { /* Task has moved during major rotation */ task_new_array(next, rq); + if (!entitled_slot(next->static_prio, idx)) + exchange_slot(next, idx); set_task_entitlement(next); rq_quota(rq, idx) += next->quota; } else if (!test_bit(USER_PRIO(idx), next->bitmap)) { /* Task has moved during minor rotation */ + if (!entitled_slot(next->static_prio, idx)) + exchange_slot(next, idx); set_task_entitlement(next); rq_quota(rq, idx) += next->quota; } @@ -4072,6 +4059,7 @@ void set_user_nice(struct task_struct *p resched_task(rq->curr); } out_unlock: + p->quota = rr_quota(p); task_rq_unlock(rq, &flags); } EXPORT_SYMBOL(set_user_nice); @@ -6874,6 +6862,26 @@ int in_sched_functions(unsigned long add void __init sched_init(void) { int i, j, k; + unsigned int rr_us = 0, rr_inc = RR_INTERVAL * 1000; + + /* Generate the priority matrix */ + for (i = 0; i < PRIO_RANGE; i++) { + if (i < 20) { + bitmap_zero(prio_matrix[i] , PRIO_RANGE); + j = PRIO_RANGE * PRIO_RANGE / (i + 1); + for (k = j; k < PRIO_RANGE * PRIO_RANGE; k += j) + __set_bit(k / PRIO_RANGE, prio_matrix[i]); + } else if (i == 20) { + bitmap_fill(prio_matrix[i], PRIO_RANGE); + for (k = 1; k < PRIO_RANGE; k += 2) + __clear_bit(k, prio_matrix[i]); + } else { + bitmap_fill(prio_matrix[i], PRIO_RANGE); + j = PRIO_RANGE * PRIO_RANGE / (PRIO_RANGE - i + 1); + for (k = j; k < PRIO_RANGE * PRIO_RANGE; k += j) + __clear_bit(k / PRIO_RANGE, prio_matrix[i]); + } + } for_each_possible_cpu(i) { struct prio_array *array; @@ -6887,6 +6895,8 @@ void __init sched_init(void) rq->prio_level = MAX_RT_PRIO; rq->active = rq->arrays; rq->expired = rq->arrays + 1; + rq->dyn_bitmap = rq->active->prio_bitmap; + rq->exp_bitmap = rq->expired->prio_bitmap; #ifdef CONFIG_SMP rq->sd = NULL; 
@@ -6905,14 +6915,18 @@ void __init sched_init(void) array = rq->arrays + j; for (k = 0; k < MAX_PRIO; k++) INIT_LIST_HEAD(array->queue + k); + bitmap_zero(array->prio_bitmap, MAX_PRIO); + /* delimiter for bitsearch */ + __set_bit(MAX_PRIO, array->prio_bitmap); } for (k = 0; k < PRIO_RANGE; k++) rq->prio_quota[k] = 0; - bitmap_zero(rq->dyn_bitmap, MAX_DYN_PRIO); - bitmap_zero(rq->static_bitmap, MAX_PRIO); - /* delimiter for bitsearch */ - __set_bit(MAX_DYN_PRIO, rq->dyn_bitmap); + + /* Every added cpu increases the rr_interval */ + rr_us += rr_inc; + rr_inc /= 2; } + rr_interval = rr_us / 1000 ? : 1; set_load_weight(&init_task);