--- Documentation/sched-design.txt | 128 ++++-------- include/linux/init_task.h | 4 include/linux/list.h | 42 ---- include/linux/sched.h | 14 - kernel/sched.c | 416 ++++++++++++----------------------------- 5 files changed, 177 insertions(+), 427 deletions(-) Index: linux-2.6.20.4-rsdl/Documentation/sched-design.txt =================================================================== --- linux-2.6.20.4-rsdl.orig/Documentation/sched-design.txt 2007-03-27 23:28:25.000000000 +1000 +++ linux-2.6.20.4-rsdl/Documentation/sched-design.txt 2007-03-27 23:28:27.000000000 +1000 @@ -1,14 +1,14 @@ Goals, Design and Implementation of the ultra-scalable O(1) scheduler by - Ingo Molnar and the Rotating Staircase Deadline cpu scheduler policy - designed by Con Kolivas. + Ingo Molnar and the Staircase Deadline cpu scheduler policy designed by + Con Kolivas. This was originally an edited version of an email Ingo Molnar sent to lkml on 4 Jan 2002. It describes the goals, design, and implementation of Ingo's ultra-scalable O(1) scheduler. It now contains a description - of the Rotating Staircase Deadline priority scheduler that was built on - this design. - Last Updated: Sun Feb 25 2007 + of the Staircase Deadline priority scheduler that was built on this + design. + Last Updated: Tue Mar 27 2007 Goal @@ -168,15 +168,15 @@ code is smaller than the old one. Ingo -Rotating Staircase Deadline cpu scheduler policy +Staircase Deadline cpu scheduler policy ================================================ Design summary ============== A novel design which incorporates a foreground-background descending priority -system (the staircase) with runqueue managed minor and major epochs (rotation -and deadline). +system (the staircase) via a bandwidth allocation matrix according to nice +level. Features @@ -196,113 +196,66 @@ constraints of strict fairness. 
Design description ================== -RSDL works off the principle of providing each task a quota of runtime that it -is allowed to run at a number of priority levels determined by its static -priority (ie. its nice level). When each task is queued, the cpu that it is -queued onto also keeps a record of that quota. If the task uses up its quota it -has its priority decremented to the next level. Also, if the cpu notices a quota -full has been used for that priority level, it pushes everything remaining at -that priority level to the next lowest priority level. Once every runtime quota -has been consumed of every priority level, a task is queued on the "expired" -array. When no other tasks exist with quota, the expired array is activated and -fresh quotas are handed out. This is all done in O(1). - +SD works off the principle of providing each task a quota of runtime that it is +allowed to run at a number of priority levels determined by its static priority +(ie. its nice level). If the task uses up its quota it has its priority +decremented to the next level determined by a priority matrix. Once every +runtime quota has been consumed of every priority level, a task is queued on the +"expired" array. When no other tasks exist with quota, the expired array is +activated and fresh quotas are handed out. This is all done in O(1). Design details ============== -Each cpu has its own runqueue which micromanages its own epochs, and each -task keeps a record of its own entitlement of cpu time. Most of the rest -of these details apply to non-realtime tasks as rt task management is -straight forward. +Each task keeps a record of its own entitlement of cpu time. Most of the rest of +these details apply to non-realtime tasks as rt task management is straight +forward. Each runqueue keeps a record of what major epoch it is up to in the rq->prio_rotation field which is incremented on each major epoch. 
It also -keeps a record of quota available to each priority value valid for that -major epoch in rq->prio_quota[]. +keeps a record of the current prio_level for each static priority task. Each task keeps a record of what major runqueue epoch it was last running on in p->rotation. It also keeps a record of what priority levels it has already been allocated quota from during this epoch in a bitmap p->bitmap. The only tunable that determines all other details is the RR_INTERVAL. This -is set to 8ms (minimum on 1000HZ, higher at different HZ values), and is -scaled gently upwards with more cpus. +is set to 8ms, and is scaled gently upwards with more cpus. This value is +tunable via a /proc interface. All tasks are initially given a quota based on RR_INTERVAL. This is equal to -RR_INTERVAL between nice values of 0 and 19, and progressively larger for nice -values from -1 to -20. This is to maintain a relationship of nice 19 having -approximately 1/20th of the cpu of nice 0, and nice 0 having 1/20th the cpu of -nice -20. This is assigned to p->quota and only changes with changes in nice -level. +RR_INTERVAL between nice values of -6 and 0, half that size above nice 0, and +progressively larger for nice values from -1 to -20. This is assigned to +p->quota and only changes with changes in nice level. As a task is first queued, it checks in recalc_task_prio to see if it has run at this runqueue's current priority rotation. If it has not, it will have its p->prio level set according to the first slot in a "priority matrix" and will be given a p->time_slice equal to the p->quota, and has its allocation bitmap bit -set in p->bitmap for this prio level. This quota is then also added to the -current runqueue's rq->prio_quota[p->prio]. It is then queued on the current -active priority array. +set in p->bitmap for this prio level. It is then queued on the current active +priority array. 
If a task has already been running during this major epoch, and it has p->time_slice left and the rq->prio_quota for the task's p->prio still has quota, it will be placed back on the active array, but no more quota -will be added to either the task or the runqueue quota. +will be added. If a task has been running during this major epoch, but does not have -p->time_slice left or the runqueue's prio_quota for this task's p->prio -does not have quota, it will find the next lowest priority in its bitmap -that it has not been allocated quota from. It then gets the a full quota -in p->time_slice and adds that to the quota value for the relevant priority -rq->prio_quota. It is then queued on the current active priority array at -the newly determined lower priority. +p->time_slice left, it will find the next lowest priority in its bitmap that it +has not been allocated quota from. It then gets a full quota in +p->time_slice. It is then queued on the current active priority array at the +newly determined lower priority. If a task has been running during this major epoch, and does not have any entitlement left in p->bitmap and no time_slice left, it will have its -bitmap cleared, and be queued at its p->static_prio again, but on the expired -priority array. No quota will be allocated until this task is scheduled. +bitmap cleared, and be queued at its best prio again, but on the expired +priority array. When a task is queued, it has its relevant bit set in the array->prio_bitmap. -During a scheduler_tick where a task is running, the p->time_slice is -decremented, and if it reaches zero then the recalc_task_prio is readjusted -and the task rescheduled. - -During a task running tick, the runqueue prio_quota is also decremented. If -it empties then a priority rotation occurs (a major or minor epoch). If the -current runqueue's priority level is better than that of nice 19 tasks, a -minor rotation is performed, otherwise a major rotation will occur. 
- -A minor rotation takes the remaining tasks at this priority level queue and -merges them with a list_splice_tail with the queue from the next lowest -priority level. At this time, any tasks that have been merged will now -have invalid values in p->prio so this must be considered when dequeueing -and scheduling the task. - -A major rotation takes the remaining tasks at this priority level queue and -merges them with a list_splice_tail with the best priority task running on -the expired array, and swaps the priority arrays. The priority quotas are -reset at this time. Any tasks that have been merged will now have invalid -values in p->array and possibly p->prio so this must be considered. The -rq->prio_rotation is incremented at this time. - -When a task is dequeued, the dyn_bitmap bit is unset only after testing -that the relevant queue is actually empty since p->prio may be inaccurate -and no hard accounting of the number of tasks at that level is possible. - -When selecting a new task for scheduling, after the first dynamic bit is found -on the dyn_bitmap, it is checked to see that a task is really queued at that -priority or if it is a false positive due to the task being dequeued at a time -when its p->prio does not match which queue it is on after some form of priority -rotation. This is a rare occurrence as it tends to only occur if a task that is -already waiting on a runqueue gets dequeued. If no tasks remain on the active -array, a major priority rotation is performed. If the chosen task has not been -running during this major or minor rotation it has new quota allocated at this -time, and added to the runqueue's quota. - -If a task finds itself merged at a priority level that it does not normally -receive quota at (due to list merging) it will remove one of its normal -priority slots to compensate. +p->time_slice is stored in nanoseconds and is updated via update_cpu_clock on +schedule() and scheduler_tick. 
If p->time_slice is below zero then the +recalc_task_prio is readjusted and the task rescheduled. Priority Matrix @@ -328,6 +281,10 @@ task only runs one slot per major rotati smallest possible maximum latencies between tasks of varying nice levels, thus allowing vastly different nice levels to be used. +SCHED_BATCH tasks are managed slightly differently, receiving only the top +slots from their priority bitmap, giving them the same cpu as SCHED_NORMAL but +slightly higher latencies. + Modelling deadline behaviour ============================ @@ -336,8 +293,7 @@ As the accounting in this design is hard calculations or interactivity modifiers, it is possible to accurately predict the maximum latency that a task may experience under different conditions. This is a virtual deadline mechanism enforced by mandatory -runqueue epochs, and not by trying to keep complicated accounting of each -task. +timeslice expiration and not outside bandwidth measurement. The maximum duration a task can run during one major epoch is determined by its nice value. Nice 0 tasks can run at 19 different priority levels for RR_INTERVAL @@ -433,5 +389,5 @@ can be countered by running a gui at a n causing adversely large latencies in nice 0 tasks. 
-Fri, 16 Mar 2007 +Tue Mar 27 2007 Con Kolivas Index: linux-2.6.20.4-rsdl/include/linux/init_task.h =================================================================== --- linux-2.6.20.4-rsdl.orig/include/linux/init_task.h 2007-03-27 23:28:25.000000000 +1000 +++ linux-2.6.20.4-rsdl/include/linux/init_task.h 2007-03-27 23:28:27.000000000 +1000 @@ -109,8 +109,8 @@ extern struct group_info init_groups; .active_mm = &init_mm, \ .run_list = LIST_HEAD_INIT(tsk.run_list), \ .ioprio = 0, \ - .time_slice = HZ, \ - .quota = HZ, \ + .time_slice = 1000000000, \ + .quota = 1000000000, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ Index: linux-2.6.20.4-rsdl/include/linux/list.h =================================================================== --- linux-2.6.20.4-rsdl.orig/include/linux/list.h 2007-03-27 23:28:25.000000000 +1000 +++ linux-2.6.20.4-rsdl/include/linux/list.h 2007-03-27 23:28:27.000000000 +1000 @@ -332,20 +332,6 @@ static inline void __list_splice(struct at->prev = last; } -static inline void __list_splice_tail(struct list_head *list, - struct list_head *head) -{ - struct list_head *first = list->next; - struct list_head *last = list->prev; - struct list_head *at = head->prev; - - first->prev = at; - at->next = first; - - last->next = head; - head->prev = last; -} - /** * list_splice - join two lists * @list: the new list to add. @@ -358,18 +344,6 @@ static inline void list_splice(struct li } /** - * list_splice_tail - join two lists at one's tail - * @list: the new list to add. - * @head: the place to add it in the first list. - */ -static inline void list_splice_tail(struct list_head *list, - struct list_head *head) -{ - if (!list_empty(list)) - __list_splice_tail(list, head); -} - -/** * list_splice_init - join two lists and reinitialise the emptied list. * @list: the new list to add. * @head: the place to add it in the first list. 
@@ -386,22 +360,6 @@ static inline void list_splice_init(stru } /** - * list_splice_tail_init - join 2 lists at one's tail & reinitialise emptied - * @list: the new list to add. - * @head: the place to add it in the first list. - * - * The list at @list is reinitialised - */ -static inline void list_splice_tail_init(struct list_head *list, - struct list_head *head) -{ - if (!list_empty(list)) { - __list_splice_tail(list, head); - INIT_LIST_HEAD(list); - } -} - -/** * list_entry - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in. Index: linux-2.6.20.4-rsdl/include/linux/sched.h =================================================================== --- linux-2.6.20.4-rsdl.orig/include/linux/sched.h 2007-03-27 23:28:25.000000000 +1000 +++ linux-2.6.20.4-rsdl/include/linux/sched.h 2007-03-27 23:28:27.000000000 +1000 @@ -822,17 +822,15 @@ struct task_struct { unsigned long policy; cpumask_t cpus_allowed; - unsigned int time_slice, first_time_slice; + int time_slice; /* * How much this task is entitled to run at the current priority - * before being requeued at a lower priority, and is this the very - * first time_slice this task has ever run. - */ - unsigned int quota; - /* - * How much this task contributes to the current priority queue - * length + * before being requeued at a lower priority. */ + int quota; + /* How much this task receives at each priority level */ + unsigned int first_time_slice; + /* Is this the very first time_slice this task has ever run. */ #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; Index: linux-2.6.20.4-rsdl/kernel/sched.c =================================================================== --- linux-2.6.20.4-rsdl.orig/kernel/sched.c 2007-03-27 23:28:25.000000000 +1000 +++ linux-2.6.20.4-rsdl/kernel/sched.c 2007-03-27 23:28:27.000000000 +1000 @@ -16,8 +16,7 @@ * by Davide Libenzi, preemptible kernel bits by Robert Love. 
* 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin - * 2007-03-02 Rotating Staircase deadline scheduling policy by Con Kolivas - * RSDL v0.33 + * 2007-03-02 Staircase deadline scheduling policy by Con Kolivas */ #include @@ -77,11 +76,16 @@ #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) #define SCHED_PRIO(p) ((p)+MAX_RT_PRIO) +/* Some helpers for converting to/from nanosecond timing */ +#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) +#define NS_TO_MS(TIME) ((TIME) / 1000000) +#define MS_TO_NS(TIME) ((TIME) * 1000000) + #define TASK_PREEMPTS_CURR(p, curr) ((p)->prio < (curr)->prio) /* * This is the time all tasks within the same priority round robin. - * Set to a minimum of 8ms. Scales with number of cpus and rounds with HZ. + * Value is in ms and set to a minimum of 8ms. Scales with number of cpus. * Tunable via /proc interface. */ int rr_interval __read_mostly; @@ -107,15 +111,24 @@ int rr_interval __read_mostly; static unsigned long prio_matrix[PRIO_RANGE][BITS_TO_LONGS(PRIO_RANGE)] __read_mostly; +struct rq; + struct prio_array { struct list_head queue[MAX_PRIO]; /* Tasks queued at each priority */ DECLARE_BITMAP(prio_bitmap, MAX_PRIO + 1); /* - * The bitmap of priorities queued; The dynamic bits can have - * false positives. Include 1 bit for delimiter. + * The bitmap of priorities queued for this array. While the expired + * array will never have realtime tasks on it, it is simpler to have + * equal sized bitmaps for a cheap array swap. Include 1 bit for + * delimiter. */ + +#ifdef CONFIG_SMP + struct rq *rq; + /* For convenience looks back at rq */ +#endif }; /* @@ -153,12 +166,6 @@ struct rq { unsigned long next_balance; struct mm_struct *prev_mm; - long prio_quota[PRIO_RANGE]; - /* - * The quota of ticks the runqueue runs at each dynamic priority - * before cycling to the next priority. 
- */ - struct prio_array *active, *expired, arrays[2]; unsigned long *dyn_bitmap, *exp_bitmap; @@ -613,26 +620,13 @@ static inline int task_queued(struct tas return !list_empty(&task->run_list); } -static inline void set_task_entitlement(struct task_struct *p) -{ - __set_bit(USER_PRIO(p->prio), p->bitmap); - p->time_slice = p->quota; -} - -/* - * There is no specific hard accounting. The dynamic bits can have - * false positives. rt_tasks can only be on the active queue. - */ static inline void set_dynamic_bit(struct task_struct *p, struct rq *rq) { __set_bit(p->prio, p->array->prio_bitmap); } /* - * Removing from a runqueue. While we don't know with absolute certainty - * where this task really is, the p->array and p->prio are very likely - * so we check that queue to see if we can clear that bit to take some - * load off finding false positives in next_dynamic_task(). + * Removing from a runqueue. */ static void dequeue_task(struct task_struct *p, struct rq *rq) { @@ -649,38 +643,37 @@ static inline void task_new_array(struct { bitmap_zero(p->bitmap, PRIO_RANGE); p->rotation = rq->prio_rotation; + p->time_slice = p->quota; } /* Find the first slot from the relevant prio_matrix entry */ static inline int first_prio_slot(struct task_struct *p) { + if (unlikely(p->policy == SCHED_BATCH)) + return p->static_prio; return SCHED_PRIO(find_first_zero_bit( prio_matrix[USER_PRIO(p->static_prio)], PRIO_RANGE)); } -/* Is a dynamic_prio part of the allocated slots for this static_prio */ -static inline int entitled_slot(int static_prio, int dynamic_prio) -{ - return !test_bit(USER_PRIO(dynamic_prio), - prio_matrix[USER_PRIO(static_prio)]); -} - /* * Find the first unused slot by this task that is also in its prio_matrix - * level. Ensure that the prio_level is not unnecessarily low by checking - * that best_static_prio this major rotation was not a niced task. - * SCHED_BATCH tasks do not perform this check so they do not induce - * latencies in tasks of any nice level. 
+ * level. SCHED_BATCH tasks do not use the priority matrix. They only take + * priority slots from their static_prio and above. */ static inline int next_entitled_slot(struct task_struct *p, struct rq *rq) { DECLARE_BITMAP(tmp, PRIO_RANGE); int search_prio; - if (p->static_prio < rq->best_static_prio && p->policy != SCHED_BATCH) + if (p->static_prio < rq->best_static_prio) search_prio = MAX_RT_PRIO; else search_prio = rq->prio_level; + if (unlikely(p->policy == SCHED_BATCH)) { + search_prio = max(search_prio, p->static_prio); + return SCHED_PRIO(find_next_zero_bit(p->bitmap, PRIO_RANGE, + USER_PRIO(search_prio))); + } bitmap_or(tmp, p->bitmap, prio_matrix[USER_PRIO(p->static_prio)], PRIO_RANGE); return SCHED_PRIO(find_next_zero_bit(tmp, PRIO_RANGE, @@ -693,30 +686,58 @@ static void queue_expired(struct task_st task_new_array(p, rq); p->prio = p->normal_prio = first_prio_slot(p); p->time_slice = p->quota; + p->rotation = rq->prio_rotation; } -#define rq_quota(rq, prio) ((rq)->prio_quota[USER_PRIO(prio)]) +#ifdef CONFIG_SMP +/* + * If we're waking up a task that was previously on a different runqueue, + * update its data appropriately. Note we may be reading data from src_rq-> + * outside of lock, but the occasional inaccurate result should be harmless. + */ + static inline void update_if_moved(struct task_struct *p, struct rq *rq) +{ + struct rq *src_rq = p->array->rq; + + if (src_rq == rq) + return; + if (p->rotation == src_rq->prio_rotation) + p->rotation = rq->prio_rotation; + else + p->rotation = 0; + if (p->array == src_rq->expired) + p->array = rq->expired; + else + p->array = rq->active; +} +#else +static inline void update_if_moved(struct task_struct *p, struct rq *rq) +{ +} +#endif /* - * recalc_task_prio determines what prio a non rt_task will be + * recalc_task_prio determines what priority a non rt_task will be * queued at. 
If the task has already been running during this runqueue's * major rotation (rq->prio_rotation) then it continues at the same * priority if it has tick entitlement left. If it does not have entitlement * left, it finds the next priority slot according to its nice value that it * has not extracted quota from. If it has not run during this major - * rotation, it starts at its static priority and has its bitmap quota + * rotation, it starts at the next_entitled_slot and has its bitmap quota * cleared. If it does not have any slots left it has all its slots reset and - * is queued on the expired at its static priority. + * is queued on the expired at its first_prio_slot. */ static void recalc_task_prio(struct task_struct *p, struct rq *rq) { struct prio_array *array = rq->active; int queue_prio; + update_if_moved(p, rq); if (p->rotation == rq->prio_rotation) { if (p->array == array) { - if (p->time_slice && rq_quota(rq, p->prio)) + if (p->time_slice > 0) return; + p->time_slice = p->quota; } else if (p->array == rq->expired) { queue_expired(p, rq); return; @@ -730,17 +751,14 @@ static void recalc_task_prio(struct task queue_expired(p, rq); return; } - rq_quota(rq, queue_prio) += p->quota; p->prio = p->normal_prio = queue_prio; p->array = array; - set_task_entitlement(p); + __set_bit(USER_PRIO(p->prio), p->bitmap); } /* * Adding to a runqueue. The dynamic priority queue that it is added to is - * determined by the priority rotation of the runqueue it is being added to - * and the quota still available in the task in p->bitmap and p->time_slice - * (see recalc_task_prio above). + * determined by recalc_task_prio() above. */ static inline void __enqueue_task(struct task_struct *p, struct rq *rq) { @@ -795,13 +813,14 @@ static void requeue_task(struct task_str * task_timeslice - the total duration a task can run during one major * rotation. 
*/ -static inline unsigned int task_timeslice(struct task_struct *p) +static inline int task_timeslice(struct task_struct *p) { - unsigned int slice, rr; + int slice, rr; slice = rr = p->quota; if (!rt_task(p)) slice += (PRIO_RANGE - 1 - TASK_USER_PRIO(p)) * rr; + slice = NS_TO_JIFFIES(slice) ? : 1; return slice; } @@ -908,22 +927,24 @@ static int effective_prio(struct task_st } /* - * All tasks have quotas based on rr_interval. From nice 0 to 19 they are - * all equal to it and below zero they get exponentially larger making their - * effective quota significantly larger. rt tasks all get rr_interval. - * ie nice -6..19 = rr_interval. nice -10 = 2.5 * rr_interval - * nice -20 = 10 * rr_interval. This makes the ratios between -20 and 0 - * similar to the ratios between 0 and +19. + * All tasks have quotas based on rr_interval. RT tasks all get rr_interval. + * From nice 1 to 19 they are smaller than it only if they are at least one + * tick still. Below nice 0 they get progressively larger. + * ie nice -6..0 = rr_interval. nice -10 = 2.5 * rr_interval + * nice -20 = 10 * rr_interval. nice 1-19 = rr_interval / 2. */ static unsigned int rr_quota(struct task_struct *p) { int nice = TASK_NICE(p), rr = rr_interval; - if (nice < -6 && !rt_task(p)) { - rr *= nice * nice; - rr /= 40; + if (!rt_task(p)) { + if (nice < -6) { + rr *= nice * nice; + rr /= 40; + } else if (nice > 0 && (rr * HZ / 1000 / 2) > 0) + rr /= 2; } - return rr; + return MS_TO_NS(rr); } /* @@ -1537,7 +1558,6 @@ int fastcall wake_up_state(struct task_s return try_to_wake_up(p, state, 0); } -static void task_running_tick(struct rq *rq, struct task_struct *p, int tick); /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. @@ -1583,8 +1603,9 @@ void fastcall sched_fork(struct task_str * total amount of pending timeslices in the system doesn't change, * resulting in more scheduling fairness. 
*/ - local_irq_disable(); - p->time_slice = (current->time_slice + 1) >> 1; + if (unlikely(p->time_slice < 2)) + p->time_slice = 2; + p->time_slice = current->time_slice >> 1; /* * The remainder of the first timeslice might be recovered by * the parent if the child exits early enough. @@ -1592,16 +1613,6 @@ void fastcall sched_fork(struct task_str p->first_time_slice = 1; current->time_slice >>= 1; p->timestamp = sched_clock(); - if (!current->time_slice) { - /* - * This case happens when the parent has only a single jiffy - * left from its timeslice. Taking the runqueue lock is not - * a problem. - */ - current->time_slice = 1; - task_running_tick(cpu_rq(cpu), current, 0); - } - local_irq_enable(); out: put_cpu(); } @@ -2006,43 +2017,6 @@ void sched_exec(void) } /* - * This is a unique version of enqueue_task for the SMP case where a task - * has just been moved across runqueues. It uses the information from the - * old runqueue to help it make a decision much like recalc_task_prio. As - * the new runqueue is almost certainly at a different prio_level than the - * src_rq it is cheapest just to pick the next entitled slot. - */ -static inline void enqueue_pulled_task(struct rq *src_rq, struct rq *rq, - struct task_struct *p) -{ - int queue_prio; - - p->array = rq->active; - if (!rt_task(p)) { - if (p->rotation == src_rq->prio_rotation) { - if (p->array == src_rq->expired) { - queue_expired(p, rq); - goto out_queue; - } - } else - task_new_array(p, rq); - } - queue_prio = next_entitled_slot(p, rq); - if (queue_prio >= MAX_PRIO) { - queue_expired(p, rq); - goto out_queue; - } - rq_quota(rq, queue_prio) += p->quota; - p->prio = queue_prio; -out_queue: - p->normal_prio = p->prio; - p->rotation = rq->prio_rotation; - sched_info_queued(p); - set_dynamic_bit(p, rq); - list_add_tail(&p->run_list, p->array->queue + p->prio); -} - -/* * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. 
*/ @@ -2053,7 +2027,7 @@ static void pull_task(struct rq *src_rq, dec_nr_running(p, src_rq); set_task_cpu(p, this_cpu); inc_nr_running(p, this_rq); - enqueue_pulled_task(src_rq, this_rq, p); + enqueue_task(p, this_rq); p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) + this_rq->most_recent_timestamp; try_preempt(p, this_rq); @@ -2993,7 +2967,12 @@ EXPORT_PER_CPU_SYMBOL(kstat); static inline void update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) { - p->sched_time += now - p->last_ran; + cputime64_t time_diff = now - p->last_ran; + + /* cpu scheduler quota accounting is performed here */ + if (p != rq->idle && p->policy != SCHED_FIFO) + p->time_slice -= time_diff; + p->sched_time += time_diff; p->last_ran = rq->most_recent_timestamp = now; } @@ -3095,7 +3074,6 @@ static void task_expired_entitlement(str struct prio_array *old_array; int old_prio; - set_tsk_need_resched(p); if (unlikely(p->first_time_slice)) p->first_time_slice = 0; if (rt_task(p)) { @@ -3109,122 +3087,21 @@ static void task_expired_entitlement(str requeue_task(p, rq, old_array, old_prio); } -/* - * A major priority rotation occurs when all priority quotas for this array - * have been exhausted. - */ -static inline void major_prio_rotation(struct rq *rq) -{ - struct prio_array *new_array = rq->expired; - - rq->expired = rq->active; - rq->active = new_array; - rq->exp_bitmap = rq->expired->prio_bitmap; - rq->dyn_bitmap = rq->active->prio_bitmap; - rq->best_static_prio = MAX_PRIO - 1; - rq->prio_rotation++; -} - -/* - * This is the heart of the virtual deadline priority management. - * - * We have used up the quota allocated to this priority level so we rotate - * the prio_level of the runqueue to the next lowest priority. We merge any - * remaining tasks at this level current_queue with the next priority and - * reset this level's queue. MAX_PRIO - 1 is a special case where we perform - * a major rotation. 
- */ -static inline void rotate_runqueue_priority(struct rq *rq) +/* This manages tasks that have run out of timeslice during a scheduler_tick */ +static void task_running_tick(struct rq *rq, struct task_struct *p) { - int new_prio_level; - struct prio_array *array; - - /* - * Make sure we don't have tasks still on the active array that - * haven't run due to not preempting a lower priority task. This can - * happen on list merging or smp balancing. - */ - if (unlikely(sched_find_first_bit(rq->dyn_bitmap) < rq->prio_level)) - return; - - array = rq->active; - if (rq->prio_level > MAX_PRIO - 2) { - /* Major rotation required */ - struct prio_array *new_queue = rq->expired; - - /* - * On a major rotation we move everything remaining to best - * priority on the new array. The priority matrix bitmap will - * ensure tasks only get the slots each static priority - * deserves. - */ - new_prio_level = MAX_RT_PRIO; - if (!list_empty(array->queue + rq->prio_level)) { - list_splice_tail_init(array->queue + rq->prio_level, - new_queue->queue + new_prio_level); - } - memset(rq->prio_quota, 0, ARRAY_SIZE(rq->prio_quota)); - major_prio_rotation(rq); - } else { - /* Minor rotation */ - new_prio_level = rq->prio_level + 1; - __clear_bit(rq->prio_level, rq->dyn_bitmap); - if (!list_empty(array->queue + rq->prio_level)) { - list_splice_tail_init(array->queue + rq->prio_level, - array->queue + new_prio_level); - __set_bit(new_prio_level, rq->dyn_bitmap); - } - rq_quota(rq, rq->prio_level) = 0; - } - rq->prio_level = new_prio_level; - /* - * As we are merging to a prio_level that may not have anything in - * its quota we add 1 to ensure the tasks get to run in schedule() to - * add their quota to it. 
- */ - rq_quota(rq, new_prio_level) += 1; -} - -static void task_running_tick(struct rq *rq, struct task_struct *p, int tick) -{ - if (unlikely(!task_queued(p))) { - /* Task has expired but was not scheduled yet */ - set_tsk_need_resched(p); - return; - } /* SCHED_FIFO tasks never run out of timeslice. */ - if (unlikely(p->policy == SCHED_FIFO)) + if (p->time_slice > 0 || p->policy == SCHED_FIFO) return; - spin_lock(&rq->lock); - /* - * Accounting is performed by both the task and the runqueue. This - * allows frequently sleeping tasks to get their proper quota of - * cpu as the runqueue will have their quota still available at - * the appropriate priority level. It also means frequently waking - * tasks that might miss the scheduler_tick() will get forced down - * priority regardless. - */ - if (!--p->time_slice) - task_expired_entitlement(rq, p); - /* - * If we're actually calling this function not in a scheduler_tick - * we are doing so to fix accounting across fork and should not be - * deducting anything from rq_quota. - */ - if (!tick) - goto out_unlock; - /* - * We only employ the deadline mechanism if we run over the quota. - * It allows aliasing problems around the scheduler_tick to be - * less harmful. 
- */ - if (!rt_task(p) && --rq_quota(rq, rq->prio_level) < 0) { - if (unlikely(p->first_time_slice)) - p->first_time_slice = 0; - rotate_runqueue_priority(rq); + if (unlikely(!task_queued(p))) { + /* Task has expired but was not scheduled off yet */ set_tsk_need_resched(p); + goto out_unlock; } + /* p->time_slice <= 0 */ + task_expired_entitlement(rq, p); + set_tsk_need_resched(p); out_unlock: spin_unlock(&rq->lock); } @@ -3246,7 +3123,7 @@ void scheduler_tick(void) /* Task on the idle queue */ wake_priority_sleeper(rq); else - task_running_tick(rq, p, 1); + task_running_tick(rq, p); #ifdef CONFIG_SMP update_load(rq); if (time_after_eq(jiffies, rq->next_balance)) @@ -3429,81 +3306,42 @@ EXPORT_SYMBOL(sub_preempt_count); #endif /* - * If a task is queued at a priority that isn't from its bitmap we exchange - * by setting one of the entitlement bits. - */ -static inline void exchange_slot(struct task_struct *p, struct rq *rq) -{ - int slot = next_entitled_slot(p, rq); - - if (slot < MAX_PRIO) - __set_bit(USER_PRIO(slot), p->bitmap); -} - -/* - * next_dynamic_task finds the next suitable dynamic task. As the dyn_bitmap - * contains all the active and expired dynamic tasks sequentially we only - * need to do one bitmap lookup. + * next_dynamic_task finds the next suitable dynamic task. */ static inline struct task_struct *next_dynamic_task(struct rq *rq, int idx) { struct task_struct *next; struct list_head *queue; struct prio_array *array = rq->active; - int expirations = 0; retry: if (idx >= MAX_PRIO) { - BUG_ON(++expirations > 1); - /* - * We have selected a bit from the expired range so there are - * no more tasks in the active array. - */ - major_prio_rotation(rq); - array = rq->active; + /* There are no more tasks in the active array. 
Swap arrays */ + array = rq->expired; + rq->expired = rq->active; + rq->active = array; + rq->exp_bitmap = rq->expired->prio_bitmap; + rq->dyn_bitmap = rq->active->prio_bitmap; + rq->best_static_prio = MAX_PRIO - 1; + rq->prio_rotation++; idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO); } - if (unlikely(list_empty(array->queue + idx))) { + queue = array->queue + idx; + next = list_entry(queue->next, struct task_struct, run_list); + if (unlikely(next->time_slice < 0)) { /* - * This can happen because they are not always cleared on - * dequeue_task since they may have been dequeued while - * waiting on a runqueue and a rotation has occurred in the - * interim. A very rare occurrence. + * Unlucky enough that this task ran out of time_slice + * before it hit a scheduler_tick so it should have its + * priority reassessed and choose another task (possibly + * the same one) */ - __clear_bit(idx, rq->dyn_bitmap); - idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, idx + 1); + task_expired_entitlement(rq, next); + idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO); goto retry; } - queue = array->queue + idx; - next = list_entry(queue->next, struct task_struct, run_list); rq->prio_level = idx; - /* - * When the task is chosen it is checked to see if its quota has been - * added to this runqueue level which is only performed once per - * level per major rotation for each running task. 
- */ - if (next->rotation != rq->prio_rotation) { - /* Task has moved during major rotation */ - task_new_array(next, rq); - if (!entitled_slot(next->static_prio, idx)) - exchange_slot(next, rq); - set_task_entitlement(next); - rq_quota(rq, idx) += next->quota; - } else if (!test_bit(USER_PRIO(idx), next->bitmap)) { - /* Task has moved during minor rotation */ - if (!entitled_slot(next->static_prio, idx)) - exchange_slot(next, rq); - set_task_entitlement(next); - rq_quota(rq, idx) += next->quota; - } - /* - * next needs to have its prio and array reset here in case the - * values are wrong due to priority rotation. - */ - next->prio = idx; - next->array = array; - if (next->static_prio < rq->best_static_prio && - next->policy != SCHED_BATCH) + next->rotation = rq->prio_rotation; + if (next->static_prio < rq->best_static_prio) rq->best_static_prio = next->static_prio; return next; } @@ -4181,7 +4019,7 @@ asmlinkage long sys_nice(int increment) * * This is the priority value as seen by users in /proc. * RT tasks are offset by -200. Normal tasks are centered - * around 0, value goes from 0 to +19. + * around 0, value goes from 0 to +39. 
*/ int task_prio(const struct task_struct *p) { @@ -4971,9 +4809,9 @@ void __cpuinit init_idle(struct task_str struct rq *rq = cpu_rq(cpu); unsigned long flags; - bitmap_zero(idle->bitmap, PRIO_RANGE + 1); - idle->timestamp = sched_clock(); - idle->array = NULL; + bitmap_zero(idle->bitmap, PRIO_RANGE); + idle->timestamp = idle->last_ran = sched_clock(); + idle->array = rq->active; idle->prio = idle->normal_prio = NICE_TO_PRIO(0); idle->state = TASK_RUNNING; idle->cpus_allowed = cpumask_of_cpu(cpu); @@ -6963,6 +6801,8 @@ void __init sched_init(void) rq->exp_bitmap = rq->expired->prio_bitmap; #ifdef CONFIG_SMP + rq->active->rq = rq; + rq->expired->rq = rq; rq->sd = NULL; for (j = 1; j < 3; j++) rq->cpu_load[j] = 0; @@ -6983,14 +6823,12 @@ void __init sched_init(void) /* delimiter for bitsearch */ __set_bit(MAX_PRIO, array->prio_bitmap); } - for (k = 0; k < PRIO_RANGE; k++) - rq->prio_quota[k] = 0; /* Every added cpu increases the rr_interval */ rr_us += rr_inc; rr_inc /= 2; } - rr_interval = rr_us / 1000 ? : 1; + rr_interval = rr_us / 1000; set_load_weight(&init_task);