--- include/linux/init_task.h | 4 include/linux/list.h | 42 ------ include/linux/sched.h | 14 -- kernel/sched.c | 317 +++++++++++----------------------------------- 4 files changed, 85 insertions(+), 292 deletions(-) Index: linux-2.6.20.4-rsdl/include/linux/sched.h =================================================================== --- linux-2.6.20.4-rsdl.orig/include/linux/sched.h 2007-03-26 10:01:32.000000000 +1000 +++ linux-2.6.20.4-rsdl/include/linux/sched.h 2007-03-26 10:02:52.000000000 +1000 @@ -822,17 +822,15 @@ struct task_struct { unsigned long policy; cpumask_t cpus_allowed; - unsigned int time_slice, first_time_slice; + int time_slice; /* * How much this task is entitled to run at the current priority - * before being requeued at a lower priority, and is this the very - * first time_slice this task has ever run. - */ - unsigned int quota; - /* - * How much this task contributes to the current priority queue - * length + * before being requeued at a lower priority. */ + int quota; + /* How much this task receives at each priority level */ + unsigned int first_time_slice; + /* Is this the very first time_slice this task has ever run. */ #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; Index: linux-2.6.20.4-rsdl/kernel/sched.c =================================================================== --- linux-2.6.20.4-rsdl.orig/kernel/sched.c 2007-03-26 10:01:05.000000000 +1000 +++ linux-2.6.20.4-rsdl/kernel/sched.c 2007-03-26 10:04:32.000000000 +1000 @@ -16,8 +16,8 @@ * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin - * 2007-03-02 Rotating Staircase deadline scheduling policy by Con Kolivas - * RSDL v0.33 + * 2007-03-02 Staircase deadline scheduling policy by Con Kolivas + * SD v0.34 */ #include @@ -83,12 +83,14 @@ #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) #define JIFFY_NS JIFFIES_TO_NS(1) +#define NS_TO_MS(TIME) ((TIME) / 1000000) +#define MS_TO_NS(TIME) ((TIME) * 1000000) #define TASK_PREEMPTS_CURR(p, curr) ((p)->prio < (curr)->prio) /* * This is the time all tasks within the same priority round robin. - * Set to a minimum of 8ms. Scales with number of cpus and rounds with HZ. + * Value is in ms and set to a minimum of 8ms. Scales with number of cpus. * Tunable via /proc interface. */ int rr_interval __read_mostly; @@ -120,8 +122,10 @@ struct prio_array { DECLARE_BITMAP(prio_bitmap, MAX_PRIO + 1); /* - * The bitmap of priorities queued; The dynamic bits can have - * false positives. Include 1 bit for delimiter. + * The bitmap of priorities queued for this array. While the expired + * array will never have realtime tasks on it, it is simpler to have + * equal sized bitmaps for a cheap array swap. Include 1 bit for + * delimiter. */ }; @@ -160,12 +164,6 @@ struct rq { unsigned long next_balance; struct mm_struct *prev_mm; - long prio_quota[PRIO_RANGE]; - /* - * The quota of ticks the runqueue runs at each dynamic priority - * before cycling to the next priority. - */ - struct prio_array *active, *expired, arrays[2]; unsigned long *dyn_bitmap, *exp_bitmap; @@ -620,26 +618,13 @@ static inline int task_queued(struct tas return !list_empty(&task->run_list); } -static inline void set_task_entitlement(struct task_struct *p) -{ - __set_bit(USER_PRIO(p->prio), p->bitmap); - p->time_slice = p->quota; -} - -/* - * There is no specific hard accounting. The dynamic bits can have - * false positives. rt_tasks can only be on the active queue. - */ static inline void set_dynamic_bit(struct task_struct *p, struct rq *rq) { __set_bit(p->prio, p->array->prio_bitmap); } /* - * Removing from a runqueue. While we don't know with absolute certainty - * where this task really is, the p->array and p->prio are very likely - * so we check that queue to see if we can clear that bit to take some - * load off finding false positives in next_dynamic_task(). + * Removing from a runqueue. */ static void dequeue_task(struct task_struct *p, struct rq *rq) { @@ -656,6 +641,7 @@ static inline void task_new_array(struct { bitmap_zero(p->bitmap, PRIO_RANGE); p->rotation = rq->prio_rotation; + p->time_slice = p->quota; } /* Find the first slot from the relevant prio_matrix entry */ @@ -665,13 +651,6 @@ static inline int first_prio_slot(struct prio_matrix[USER_PRIO(p->static_prio)], PRIO_RANGE)); } -/* Is a dynamic_prio part of the allocated slots for this static_prio */ -static inline int entitled_slot(int static_prio, int dynamic_prio) -{ - return !test_bit(USER_PRIO(dynamic_prio), - prio_matrix[USER_PRIO(static_prio)]); -} - /* * Find the first unused slot by this task that is also in its prio_matrix * level. Ensure that the prio_level is not unnecessarily low by checking @@ -700,20 +679,19 @@ static void queue_expired(struct task_st task_new_array(p, rq); p->prio = p->normal_prio = first_prio_slot(p); p->time_slice = p->quota; + p->rotation = rq->prio_rotation; } -#define rq_quota(rq, prio) ((rq)->prio_quota[USER_PRIO(prio)]) - /* - * recalc_task_prio determines what prio a non rt_task will be + * recalc_task_prio determines what priority a non rt_task will be * queued at. If the task has already been running during this runqueue's * major rotation (rq->prio_rotation) then it continues at the same * priority if it has tick entitlement left. If it does not have entitlement * left, it finds the next priority slot according to its nice value that it * has not extracted quota from. If it has not run during this major - * rotation, it starts at its static priority and has its bitmap quota + * rotation, it starts at the next_entitled_slot and has its bitmap quota * cleared. If it does not have any slots left it has all its slots reset and - * is queued on the expired at its static priority. + * is queued on the expired at its first_prio_slot. */ static void recalc_task_prio(struct task_struct *p, struct rq *rq) { @@ -722,8 +700,9 @@ static void recalc_task_prio(struct task if (p->rotation == rq->prio_rotation) { if (p->array == array) { - if (p->time_slice && rq_quota(rq, p->prio)) + if (p->time_slice > 0) return; + p->time_slice = p->quota; } else if (p->array == rq->expired) { queue_expired(p, rq); return; @@ -737,17 +716,14 @@ static void recalc_task_prio(struct task queue_expired(p, rq); return; } - rq_quota(rq, queue_prio) += p->quota; p->prio = p->normal_prio = queue_prio; p->array = array; - set_task_entitlement(p); + __set_bit(USER_PRIO(p->prio), p->bitmap); } /* * Adding to a runqueue. The dynamic priority queue that it is added to is - * determined by the priority rotation of the runqueue it is being added to - * and the quota still available in the task in p->bitmap and p->time_slice - * (see recalc_task_prio above). + * determined by recalc_task_prio() above. */ static inline void __enqueue_task(struct task_struct *p, struct rq *rq) { @@ -802,13 +778,14 @@ static void requeue_task(struct task_str * task_timeslice - the total duration a task can run during one major * rotation. */ -static inline unsigned int task_timeslice(struct task_struct *p) +static inline int task_timeslice(struct task_struct *p) { - unsigned int slice, rr; + int slice, rr; slice = rr = p->quota; if (!rt_task(p)) slice += (PRIO_RANGE - 1 - TASK_USER_PRIO(p)) * rr; + slice = NS_TO_JIFFIES(slice) ? : 1; return slice; } @@ -915,22 +892,24 @@ static int effective_prio(struct task_st } /* - * All tasks have quotas based on rr_interval. From nice 0 to 19 they are - * all equal to it and below zero they get exponentially larger making their - * effective quota significantly larger. rt tasks all get rr_interval. - * ie nice -6..19 = rr_interval. nice -10 = 2.5 * rr_interval - * nice -20 = 10 * rr_interval. This makes the ratios between -20 and 0 - * similar to the ratios between 0 and +19. + * All tasks have quotas based on rr_interval. RT tasks all get rr_interval. + * From nice 1 to 19 they are smaller than it only if they are at least one + * tick still. Below nice 0 they get progressively larger. + * ie nice -6..0 = rr_interval. nice -10 = 2.5 * rr_interval + * nice -20 = 10 * rr_interval. nice 1-19 = rr_interval / 2. */ static unsigned int rr_quota(struct task_struct *p) { int nice = TASK_NICE(p), rr = rr_interval; - if (nice < -6 && !rt_task(p)) { - rr *= nice * nice; - rr /= 40; + if (!rt_task(p)) { + if (nice < -6) { + rr *= nice * nice; + rr /= 40; + } else if (nice > 0 && (rr * HZ / 1000 / 2) > 0) + rr /= 2; } - return rr; + return MS_TO_NS(rr); } /* @@ -1544,7 +1523,7 @@ int fastcall wake_up_state(struct task_s return try_to_wake_up(p, state, 0); } -static void task_running_tick(struct rq *rq, struct task_struct *p, int tick); +static void task_running_tick(struct rq *rq, struct task_struct *p); /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. @@ -1591,7 +1570,9 @@ void fastcall sched_fork(struct task_str * resulting in more scheduling fairness. */ local_irq_disable(); - p->time_slice = (current->time_slice + 1) >> 1; + if (unlikely(p->time_slice < 2)) + p->time_slice = 2; + p->time_slice = current->time_slice >> 1; /* * The remainder of the first timeslice might be recovered by * the parent if the child exits early enough. @@ -1599,15 +1580,6 @@ void fastcall sched_fork(struct task_str p->first_time_slice = 1; current->time_slice >>= 1; p->timestamp = sched_clock(); - if (!current->time_slice) { - /* - * This case happens when the parent has only a single jiffy - * left from its timeslice. Taking the runqueue lock is not - * a problem. - */ - current->time_slice = 1; - task_running_tick(cpu_rq(cpu), current, 0); - } local_irq_enable(); out: put_cpu(); @@ -2031,6 +2003,8 @@ static inline void enqueue_pulled_task(s queue_expired(p, rq); goto out_queue; } + if (p->time_slice < 0) + task_new_array(p, rq); } else task_new_array(p, rq); } @@ -2039,7 +2013,6 @@ static inline void enqueue_pulled_task(s queue_expired(p, rq); goto out_queue; } - rq_quota(rq, queue_prio) += p->quota; p->prio = queue_prio; out_queue: p->normal_prio = p->prio; @@ -3039,6 +3012,10 @@ update_cpu_clock(struct task_struct *p, p->utime = cputime_add(p->utime, jiffies_to_cputime(1)); } + /* cpu scheduler quota accounting is performed here */ + if (p->policy != SCHED_FIFO) + p->time_slice -= time_diff; + } else { cpustat->idle_ns = cputime64_add(cpustat->idle_ns, time_diff); if (cpustat->idle_ns > JIFFY_NS) { @@ -3151,7 +3128,6 @@ static void task_expired_entitlement(str struct prio_array *old_array; int old_prio; - set_tsk_need_resched(p); if (unlikely(p->first_time_slice)) p->first_time_slice = 0; if (rt_task(p)) { @@ -3165,122 +3141,23 @@ static void task_expired_entitlement(str requeue_task(p, rq, old_array, old_prio); } -/* - * A major priority rotation occurs when all priority quotas for this array - * have been exhausted. - */ -static inline void major_prio_rotation(struct rq *rq) -{ - struct prio_array *new_array = rq->expired; - - rq->expired = rq->active; - rq->active = new_array; - rq->exp_bitmap = rq->expired->prio_bitmap; - rq->dyn_bitmap = rq->active->prio_bitmap; - rq->best_static_prio = MAX_PRIO - 1; - rq->prio_rotation++; -} - -/* - * This is the heart of the virtual deadline priority management. - * - * We have used up the quota allocated to this priority level so we rotate - * the prio_level of the runqueue to the next lowest priority. We merge any - * remaining tasks at this level current_queue with the next priority and - * reset this level's queue. MAX_PRIO - 1 is a special case where we perform - * a major rotation. - */ -static inline void rotate_runqueue_priority(struct rq *rq) -{ - int new_prio_level; - struct prio_array *array; - - /* - * Make sure we don't have tasks still on the active array that - * haven't run due to not preempting a lower priority task. This can - * happen on list merging or smp balancing. - */ - if (unlikely(sched_find_first_bit(rq->dyn_bitmap) < rq->prio_level)) - return; - - array = rq->active; - if (rq->prio_level > MAX_PRIO - 2) { - /* Major rotation required */ - struct prio_array *new_queue = rq->expired; - - /* - * On a major rotation we move everything remaining to best - * priority on the new array. The priority matrix bitmap will - * ensure tasks only get the slots each static priority - * deserves. - */ - new_prio_level = MAX_RT_PRIO; - if (!list_empty(array->queue + rq->prio_level)) { - list_splice_tail_init(array->queue + rq->prio_level, - new_queue->queue + new_prio_level); - } - memset(rq->prio_quota, 0, ARRAY_SIZE(rq->prio_quota)); - major_prio_rotation(rq); - } else { - /* Minor rotation */ - new_prio_level = rq->prio_level + 1; - __clear_bit(rq->prio_level, rq->dyn_bitmap); - if (!list_empty(array->queue + rq->prio_level)) { - list_splice_tail_init(array->queue + rq->prio_level, - array->queue + new_prio_level); - __set_bit(new_prio_level, rq->dyn_bitmap); - } - rq_quota(rq, rq->prio_level) = 0; - } - rq->prio_level = new_prio_level; - /* - * As we are merging to a prio_level that may not have anything in - * its quota we add 1 to ensure the tasks get to run in schedule() to - * add their quota to it. - */ - rq_quota(rq, new_prio_level) += 1; -} - -static void task_running_tick(struct rq *rq, struct task_struct *p, int tick) +static void task_running_tick(struct rq *rq, struct task_struct *p) { - if (unlikely(!task_queued(p))) { - /* Task has expired but was not scheduled yet */ - set_tsk_need_resched(p); - return; - } /* SCHED_FIFO tasks never run out of timeslice. */ if (unlikely(p->policy == SCHED_FIFO)) return; + if (p->time_slice > 0) + return; spin_lock(&rq->lock); - /* - * Accounting is performed by both the task and the runqueue. This - * allows frequently sleeping tasks to get their proper quota of - * cpu as the runqueue will have their quota still available at - * the appropriate priority level. It also means frequently waking - * tasks that might miss the scheduler_tick() will get forced down - * priority regardless. - */ - if (!--p->time_slice) - task_expired_entitlement(rq, p); - /* - * If we're actually calling this function not in a scheduler_tick - * we are doing so to fix accounting across fork and should not be - * deducting anything from rq_quota. - */ - if (!tick) - goto out_unlock; - /* - * We only employ the deadline mechanism if we run over the quota. - * It allows aliasing problems around the scheduler_tick to be - * less harmful. - */ - if (!rt_task(p) && --rq_quota(rq, rq->prio_level) < 0) { - if (unlikely(p->first_time_slice)) - p->first_time_slice = 0; - rotate_runqueue_priority(rq); + if (unlikely(!task_queued(p))) { + /* Task has expired but was not scheduled off yet */ set_tsk_need_resched(p); + goto out_unlock; } + /* p->time_slice <= 0 */ + task_expired_entitlement(rq, p); + set_tsk_need_resched(p); out_unlock: spin_unlock(&rq->lock); } @@ -3302,7 +3179,7 @@ void scheduler_tick(void) /* Task on the idle queue */ wake_priority_sleeper(rq); else - task_running_tick(rq, p, 1); + task_running_tick(rq, p); #ifdef CONFIG_SMP update_load(rq); if (time_after_eq(jiffies, rq->next_balance)) @@ -3485,79 +3362,41 @@ EXPORT_SYMBOL(sub_preempt_count); #endif /* - * If a task is queued at a priority that isn't from its bitmap we exchange - * by setting one of the entitlement bits. - */ -static inline void exchange_slot(struct task_struct *p, struct rq *rq) -{ - int slot = next_entitled_slot(p, rq); - - if (slot < MAX_PRIO) - __set_bit(USER_PRIO(slot), p->bitmap); -} - -/* - * next_dynamic_task finds the next suitable dynamic task. As the dyn_bitmap - * contains all the active and expired dynamic tasks sequentially we only - * need to do one bitmap lookup. + * next_dynamic_task finds the next suitable dynamic task. */ static inline struct task_struct *next_dynamic_task(struct rq *rq, int idx) { struct task_struct *next; struct list_head *queue; struct prio_array *array = rq->active; - int expirations = 0; retry: if (idx >= MAX_PRIO) { - BUG_ON(++expirations > 1); - /* - * We have selected a bit from the expired range so there are - * no more tasks in the active array. - */ - major_prio_rotation(rq); - array = rq->active; + /* There are no more tasks in the active array. Swap arrays */ + array = rq->expired; + rq->expired = rq->active; + rq->active = array; + rq->exp_bitmap = rq->expired->prio_bitmap; + rq->dyn_bitmap = rq->active->prio_bitmap; + rq->best_static_prio = MAX_PRIO - 1; + rq->prio_rotation++; idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO); } - if (unlikely(list_empty(array->queue + idx))) { + queue = array->queue + idx; + next = list_entry(queue->next, struct task_struct, run_list); + if (unlikely(next->time_slice < 0)) { /* - * This can happen because they are not always cleared on - * dequeue_task since they may have been dequeued while - * waiting on a runqueue and a rotation has occurred in the - * interim. A very rare occurrence. + * Unlucky enough that this task ran out of time_slice + * before it hit a scheduler_tick so it should have its + * priority reassessed and choose another task (possibly + * the same one) */ - __clear_bit(idx, rq->dyn_bitmap); - idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, idx + 1); + task_expired_entitlement(rq, next); + idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO); goto retry; } - queue = array->queue + idx; - next = list_entry(queue->next, struct task_struct, run_list); rq->prio_level = idx; - /* - * When the task is chosen it is checked to see if its quota has been - * added to this runqueue level which is only performed once per - * level per major rotation for each running task. - */ - if (next->rotation != rq->prio_rotation) { - /* Task has moved during major rotation */ - task_new_array(next, rq); - if (!entitled_slot(next->static_prio, idx)) - exchange_slot(next, rq); - set_task_entitlement(next); - rq_quota(rq, idx) += next->quota; - } else if (!test_bit(USER_PRIO(idx), next->bitmap)) { - /* Task has moved during minor rotation */ - if (!entitled_slot(next->static_prio, idx)) - exchange_slot(next, rq); - set_task_entitlement(next); - rq_quota(rq, idx) += next->quota; - } - /* - * next needs to have its prio and array reset here in case the - * values are wrong due to priority rotation. - */ - next->prio = idx; - next->array = array; + next->rotation = rq->prio_rotation; if (next->static_prio < rq->best_static_prio && next->policy != SCHED_BATCH) rq->best_static_prio = next->static_prio; @@ -5027,9 +4866,9 @@ void __cpuinit init_idle(struct task_str struct rq *rq = cpu_rq(cpu); unsigned long flags; - bitmap_zero(idle->bitmap, PRIO_RANGE + 1); - idle->timestamp = sched_clock(); - idle->array = NULL; + bitmap_zero(idle->bitmap, PRIO_RANGE); + idle->timestamp = idle->last_ran = sched_clock(); + idle->array = rq->active; idle->prio = idle->normal_prio = NICE_TO_PRIO(0); idle->state = TASK_RUNNING; idle->cpus_allowed = cpumask_of_cpu(cpu); @@ -7039,14 +6878,12 @@ void __init sched_init(void) /* delimiter for bitsearch */ __set_bit(MAX_PRIO, array->prio_bitmap); } - for (k = 0; k < PRIO_RANGE; k++) - rq->prio_quota[k] = 0; /* Every added cpu increases the rr_interval */ rr_us += rr_inc; rr_inc /= 2; } - rr_interval = rr_us / 1000 ? : 1; + rr_interval = rr_us / 1000; set_load_weight(&init_task); Index: linux-2.6.20.4-rsdl/include/linux/init_task.h =================================================================== --- linux-2.6.20.4-rsdl.orig/include/linux/init_task.h 2007-03-26 10:00:33.000000000 +1000 +++ linux-2.6.20.4-rsdl/include/linux/init_task.h 2007-03-26 10:05:08.000000000 +1000 @@ -109,8 +109,8 @@ extern struct group_info init_groups; .active_mm = &init_mm, \ .run_list = LIST_HEAD_INIT(tsk.run_list), \ .ioprio = 0, \ - .time_slice = HZ, \ - .quota = HZ, \ + .time_slice = 1000000000, \ + .quota = 1000000000, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ Index: linux-2.6.20.4-rsdl/include/linux/list.h =================================================================== --- linux-2.6.20.4-rsdl.orig/include/linux/list.h 2007-03-26 10:11:14.000000000 +1000 +++ linux-2.6.20.4-rsdl/include/linux/list.h 2007-03-26 10:11:20.000000000 +1000 @@ -332,20 +332,6 @@ static inline void __list_splice(struct at->prev = last; } -static inline void __list_splice_tail(struct list_head *list, - struct list_head *head) -{ - struct list_head *first = list->next; - struct list_head *last = list->prev; - struct list_head *at = head->prev; - - first->prev = at; - at->next = first; - - last->next = head; - head->prev = last; -} - /** * list_splice - join two lists * @list: the new list to add. @@ -358,18 +344,6 @@ static inline void list_splice(struct li } /** - * list_splice_tail - join two lists at one's tail - * @list: the new list to add. - * @head: the place to add it in the first list. - */ -static inline void list_splice_tail(struct list_head *list, - struct list_head *head) -{ - if (!list_empty(list)) - __list_splice_tail(list, head); -} - -/** * list_splice_init - join two lists and reinitialise the emptied list. * @list: the new list to add. * @head: the place to add it in the first list. @@ -386,22 +360,6 @@ static inline void list_splice_init(stru } /** - * list_splice_tail_init - join 2 lists at one's tail & reinitialise emptied - * @list: the new list to add. - * @head: the place to add it in the first list. - * - * The list at @list is reinitialised - */ -static inline void list_splice_tail_init(struct list_head *list, - struct list_head *head) -{ - if (!list_empty(list)) { - __list_splice_tail(list, head); - INIT_LIST_HEAD(list); - } -} - -/** * list_entry - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in.