Index: linux-2.6.21-rc4-mm1/Documentation/sysctl/kernel.txt =================================================================== --- linux-2.6.21-rc4-mm1.orig/Documentation/sysctl/kernel.txt 2007-03-21 20:53:50.000000000 +1100 +++ linux-2.6.21-rc4-mm1/Documentation/sysctl/kernel.txt 2007-03-22 11:41:54.000000000 +1100 @@ -43,6 +43,7 @@ show up in /proc/sys/kernel: - printk - real-root-dev ==> Documentation/initrd.txt - reboot-cmd [ SPARC only ] +- rr_interval - rtsig-max - rtsig-nr - sem @@ -288,6 +289,17 @@ rebooting. ??? ============================================================== +rr_interval: + +This is the smallest duration that any cpu process scheduling unit +will run for. Increasing this value can increase throughput of cpu +bound tasks substantially but at the expense of increased latencies +overall. This value is in milliseconds and the default value chosen depends +on the number of cpus available at scheduler initialisation. Valid +values are from 1-100. + +============================================================== + rtsig-max & rtsig-nr: The file rtsig-max can be used to tune the maximum number Index: linux-2.6.21-rc4-mm1/kernel/sched.c =================================================================== --- linux-2.6.21-rc4-mm1.orig/kernel/sched.c 2007-03-21 20:53:50.000000000 +1100 +++ linux-2.6.21-rc4-mm1/kernel/sched.c 2007-03-26 09:46:40.000000000 +1000 @@ -16,7 +16,7 @@ * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. 
* 2004-04-02 Scheduler domains code by Nick Piggin - * 2007-03-02 Rotating Staircase deadline scheduling policy by Con Kolivas + * 2007-03-02 Staircase deadline scheduling policy by Con Kolivas */ #include @@ -88,13 +88,24 @@ unsigned long long __attribute__((weak)) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) #define SCHED_PRIO(p) ((p)+MAX_RT_PRIO) +/* + * Some helpers for converting nanosecond timing to jiffy resolution + */ +#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) +#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +#define JIFFY_NS JIFFIES_TO_NS(1) +#define NS_TO_MS(TIME) ((TIME) / 1000000) +#define MS_TO_NS(TIME) ((TIME) * 1000000) + #define TASK_PREEMPTS_CURR(p, curr) ((p)->prio < (curr)->prio) /* * This is the time all tasks within the same priority round robin. - * Set to a minimum of 8ms. Scales with number of cpus and rounds with HZ. + * Value is in ms and set to a minimum of 8ms. Scales with number of cpus. + * Tunable via /proc interface. */ -static unsigned int rr_interval __read_mostly; +int rr_interval __read_mostly; + #define RR_INTERVAL 8 #define DEF_TIMESLICE (rr_interval * 20) @@ -146,8 +157,10 @@ struct prio_array { DECLARE_BITMAP(prio_bitmap, MAX_PRIO + 1); /* - * The bitmap of priorities queued; The dynamic bits can have - * false positives. Include 1 bit for delimiter. + * The bitmap of priorities queued for this array. While the expired + * array will never have realtime tasks on it, it is simpler to have + * equal sized bitmaps for a cheap array swap. Include 1 bit for + * delimiter. */ }; @@ -190,17 +203,14 @@ struct rq { unsigned long next_balance; struct mm_struct *prev_mm; - long prio_quota[PRIO_RANGE]; - /* - * The quota of ticks the runqueue runs at each dynamic priority - * before cycling to the next priority. 
- */ - struct prio_array *active, *expired, arrays[2]; unsigned long *dyn_bitmap, *exp_bitmap; - int prio_level; - /* The current dynamic priority level this runqueue is at */ + int prio_level, best_static_prio; + /* + * The current dynamic priority level this runqueue is at, and the + * best static priority queued this major rotation. + */ unsigned long prio_rotation; /* How many times we have rotated the priority queue */ @@ -648,26 +658,13 @@ static inline int task_queued(struct tas return !list_empty(&task->run_list); } -static inline void set_task_entitlement(struct task_struct *p) -{ - __set_bit(USER_PRIO(p->prio), p->bitmap); - p->time_slice = p->quota; -} - -/* - * There is no specific hard accounting. The dynamic bits can have - * false positives. rt_tasks can only be on the active queue. - */ static inline void set_dynamic_bit(struct task_struct *p, struct rq *rq) { __set_bit(p->prio, p->array->prio_bitmap); } /* - * Removing from a runqueue. While we don't know with absolute certainty - * where this task really is, the p->array and p->prio are very likely - * so we check that queue to see if we can clear that bit to take some - * load off finding false positives in next_dynamic_task(). + * Removing from a runqueue. */ static void dequeue_task(struct task_struct *p, struct rq *rq) { @@ -684,21 +681,36 @@ static inline void task_new_array(struct { bitmap_zero(p->bitmap, PRIO_RANGE); p->rotation = rq->prio_rotation; + p->time_slice = p->quota; } +/* Find the first slot from the relevant prio_matrix entry */ static inline int first_prio_slot(struct task_struct *p) { return SCHED_PRIO(find_first_zero_bit( prio_matrix[USER_PRIO(p->static_prio)], PRIO_RANGE)); } -static inline int next_prio_slot(struct task_struct *p, int prio) +/* + * Find the first unused slot by this task that is also in its prio_matrix + * level. Ensure that the prio_level is not unnecessarily low by checking + * that best_static_prio this major rotation was not a niced task. 
+ * SCHED_BATCH tasks do not perform this check so they do not induce + * latencies in tasks of any nice level. + */ +static inline int next_entitled_slot(struct task_struct *p, struct rq *rq) { DECLARE_BITMAP(tmp, PRIO_RANGE); + int search_prio; + + if (p->static_prio < rq->best_static_prio && p->policy != SCHED_BATCH) + search_prio = MAX_RT_PRIO; + else + search_prio = rq->prio_level; bitmap_or(tmp, p->bitmap, prio_matrix[USER_PRIO(p->static_prio)], PRIO_RANGE); return SCHED_PRIO(find_next_zero_bit(tmp, PRIO_RANGE, - USER_PRIO(prio))); + USER_PRIO(search_prio))); } static void queue_expired(struct task_struct *p, struct rq *rq) @@ -707,41 +719,30 @@ static void queue_expired(struct task_st task_new_array(p, rq); p->prio = p->normal_prio = first_prio_slot(p); p->time_slice = p->quota; + p->rotation = rq->prio_rotation; } -#define rq_quota(rq, prio) ((rq)->prio_quota[USER_PRIO(prio)]) - /* - * recalc_task_prio determines what prio a non rt_task will be + * recalc_task_prio determines what priority a non rt_task will be * queued at. If the task has already been running during this runqueue's * major rotation (rq->prio_rotation) then it continues at the same * priority if it has tick entitlement left. If it does not have entitlement * left, it finds the next priority slot according to its nice value that it * has not extracted quota from. If it has not run during this major - * rotation, it starts at its static priority and has its bitmap quota + * rotation, it starts at the next_entitled_slot and has its bitmap quota * cleared. If it does not have any slots left it has all its slots reset and - * is queued on the expired at its static priority. + * is queued on the expired at its first_prio_slot. 
*/ static void recalc_task_prio(struct task_struct *p, struct rq *rq) { struct prio_array *array = rq->active; - int queue_prio, search_prio = MAX_RT_PRIO; - - /* - * SCHED_BATCH tasks never start at better priority than any other - * task that is already running since they are flagged as latency - * insensitive. This means they never cause greater latencies in other - * non SCHED_BATCH tasks of the same nice level, but they still will - * not be exposed to high latencies themselves. - */ - if (unlikely(p->policy == SCHED_BATCH)) - search_prio = rq->prio_level; + int queue_prio; if (p->rotation == rq->prio_rotation) { if (p->array == array) { - if (p->time_slice && rq_quota(rq, p->prio)) + if (p->time_slice > 0) return; - search_prio = p->prio; + p->time_slice = p->quota; } else if (p->array == rq->expired) { queue_expired(p, rq); return; @@ -750,22 +751,19 @@ static void recalc_task_prio(struct task } else task_new_array(p, rq); - queue_prio = next_prio_slot(p, search_prio); + queue_prio = next_entitled_slot(p, rq); if (queue_prio >= MAX_PRIO) { queue_expired(p, rq); return; } - rq_quota(rq, queue_prio) += p->quota; p->prio = p->normal_prio = queue_prio; p->array = array; - set_task_entitlement(p); + __set_bit(USER_PRIO(p->prio), p->bitmap); } /* * Adding to a runqueue. The dynamic priority queue that it is added to is - * determined by the priority rotation of the runqueue it is being added to - * and the quota still available in the task in p->bitmap and p->time_slice - * (see recalc_task_prio above). + * determined by recalc_task_prio() above. 
*/ static inline void __enqueue_task(struct task_struct *p, struct rq *rq) { @@ -802,7 +800,7 @@ static void requeue_task(struct task_str list_move_tail(&p->run_list, p->array->queue + p->prio); if (!rt_task(p)) { if (list_empty(old_array->queue + old_prio)) - __clear_bit(old_prio, p->array->prio_bitmap); + __clear_bit(old_prio, old_array->prio_bitmap); set_dynamic_bit(p, rq); } } @@ -820,13 +818,14 @@ static void requeue_task(struct task_str * task_timeslice - the total duration a task can run during one major * rotation. */ -static inline unsigned int task_timeslice(struct task_struct *p) +static inline int task_timeslice(struct task_struct *p) { - unsigned int slice, rr; + int slice, rr; slice = rr = p->quota; if (!rt_task(p)) slice += (PRIO_RANGE - 1 - TASK_USER_PRIO(p)) * rr; + slice = NS_TO_JIFFIES(slice) ? : 1; return slice; } @@ -907,7 +906,7 @@ static inline int normal_prio(struct tas if (has_rt_policy(p)) return MAX_RT_PRIO-1 - p->rt_priority; /* Other tasks all have normal_prio set in recalc_task_prio */ - if (likely(p->prio >= MAX_RT_PRIO)) + if (likely(p->prio >= MAX_RT_PRIO && p->prio < MAX_PRIO)) return p->prio; else return p->static_prio; @@ -933,22 +932,24 @@ static int effective_prio(struct task_st } /* - * All tasks have quotas based on rr_interval. From nice 0 to 19 they are - * all equal to it and below zero they get exponentially larger making their - * effective quota significantly larger. rt tasks all get rr_interval. - * ie nice -6..19 = rr_interval. nice -10 = 2.5 * rr_interval - * nice -20 = 10 * rr_interval. This makes the ratios between -20 and 0 - * similar to the ratios between 0 and +19. + * All tasks have quotas based on rr_interval. RT tasks all get rr_interval. + * From nice 1 to 19 they are smaller than it only if they are at least one + * tick still. Below nice 0 they get progressively larger. + * ie nice -6..0 = rr_interval. nice -10 = 2.5 * rr_interval + * nice -20 = 10 * rr_interval. nice 1-19 = rr_interval / 2. 
*/ static unsigned int rr_quota(struct task_struct *p) { - int neg_nice = -TASK_NICE(p), rr = rr_interval; + int nice = TASK_NICE(p), rr = rr_interval; - if (neg_nice > 6 && !rt_task(p)) { - rr *= neg_nice * neg_nice; - rr /= 40; + if (!rt_task(p)) { + if (nice < -6) { + rr *= nice * nice; + rr /= 40; + } else if (nice > 0 && (rr * HZ / 1000 / 2) > 0) + rr /= 2; } - return rr; + return MS_TO_NS(rr); } /* @@ -1630,7 +1631,9 @@ void fastcall sched_fork(struct task_str * resulting in more scheduling fairness. */ local_irq_disable(); - p->time_slice = (current->time_slice + 1) >> 1; + if (unlikely(current->time_slice < 2)) + current->time_slice = 2; /* clamp so parent and child each keep > 0 */ + p->time_slice = current->time_slice >> 1; /* * The remainder of the first timeslice might be recovered by * the parent if the child exits early enough. @@ -1638,15 +1641,6 @@ void fastcall sched_fork(struct task_str p->first_time_slice = 1; current->time_slice >>= 1; p->timestamp = sched_clock(); - if (!current->time_slice) { - /* - * This case happens when the parent has only a single jiffy - * left from its timeslice. Taking the runqueue lock is not - * a problem. 
- */ - current->time_slice = 1; - task_running_tick(cpu_rq(cpu), current); - } local_irq_enable(); out: put_cpu(); @@ -1720,14 +1714,16 @@ void fastcall wake_up_new_task(struct ta */ void fastcall sched_exit(struct task_struct *p) { + struct task_struct *parent; unsigned long flags; struct rq *rq; - rq = task_rq_lock(p->parent, &flags); - if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { - p->parent->time_slice += p->time_slice; - if (unlikely(p->parent->time_slice > p->quota)) - p->parent->time_slice = p->quota; + parent = p->parent; + rq = task_rq_lock(parent, &flags); + if (p->first_time_slice && task_cpu(p) == task_cpu(parent)) { + parent->time_slice += p->time_slice; + if (unlikely(parent->time_slice > parent->quota)) + parent->time_slice = parent->quota; } task_rq_unlock(rq, &flags); } @@ -2057,25 +2053,55 @@ void sched_exec(void) } /* + * This is a unique version of enqueue_task for the SMP case where a task + * has just been moved across runqueues. It uses the information from the + * old runqueue to help it make a decision much like recalc_task_prio. As + * the new runqueue is almost certainly at a different prio_level than the + * src_rq it is cheapest just to pick the next entitled slot. 
+ */ +static inline void enqueue_pulled_task(struct rq *src_rq, struct rq *rq, + struct task_struct *p) +{ + int queue_prio; + + p->array = rq->active; + if (!rt_task(p)) { + if (p->rotation == src_rq->prio_rotation) { + if (p->array == src_rq->expired) { + queue_expired(p, rq); + goto out_queue; + } + if (p->time_slice < 0) + task_new_array(p, rq); + } else + task_new_array(p, rq); + } + queue_prio = next_entitled_slot(p, rq); + if (queue_prio >= MAX_PRIO) { + queue_expired(p, rq); + goto out_queue; + } + p->prio = queue_prio; +out_queue: + p->normal_prio = p->prio; + p->rotation = rq->prio_rotation; + sched_info_queued(p); + set_dynamic_bit(p, rq); + list_add_tail(&p->run_list, p->array->queue + p->prio); +} + +/* * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. */ -static void pull_task(struct rq *src_rq, struct prio_array *src_array, - struct task_struct *p, struct rq *this_rq, - int this_cpu) +static void pull_task(struct rq *src_rq, struct task_struct *p, + struct rq *this_rq, int this_cpu) { dequeue_task(p, src_rq); dec_nr_running(p, src_rq); set_task_cpu(p, this_cpu); inc_nr_running(p, this_rq); - - /* - * If this task has already been running on src_rq this priority - * cycle, make the new runqueue think it has been on its cycle - */ - if (p->rotation == src_rq->prio_rotation) - p->rotation = this_rq->prio_rotation; - enqueue_task(p, this_rq); + enqueue_pulled_task(src_rq, this_rq, p); p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) + this_rq->most_recent_timestamp; try_preempt(p, this_rq); @@ -2220,7 +2246,7 @@ skip_queue: goto skip_bitmap; } - pull_task(busiest, array, tmp, this_rq, this_cpu); + pull_task(busiest, tmp, this_rq, this_cpu); pulled++; rem_load_move -= tmp->load_weight; @@ -3181,8 +3207,63 @@ EXPORT_PER_CPU_SYMBOL(kstat); static inline void update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) { - p->sched_time += now - p->last_ran; - p->last_ran = 
rq->most_recent_timestamp = now; + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t time_diff; + + /* Sanity check. It should never go backwards or ruin accounting */ + if (unlikely(now < p->last_ran)) + goto out_set; + /* All the userspace visible cpu accounting is done here */ + time_diff = now - p->last_ran; + p->sched_time += time_diff; + if (p != rq->idle) { + cputime_t utime_diff = time_diff; + + if (TASK_NICE(p) > 0) { + cpustat->nice_ns = cputime64_add(cpustat->nice_ns, + time_diff); + if (cpustat->nice_ns > JIFFY_NS) { + cpustat->nice_ns = + cputime64_sub(cpustat->nice_ns, + JIFFY_NS); + cpustat->nice = + cputime64_add(cpustat->nice, 1); + } + } else { + cpustat->user_ns = cputime64_add(cpustat->user_ns, + time_diff); + if (cpustat->user_ns > JIFFY_NS) { + cpustat->user_ns = + cputime64_sub(cpustat->user_ns, + JIFFY_NS); + cpustat ->user = + cputime64_add(cpustat->user, 1); + } + } + p->utime_ns = cputime_add(p->utime_ns, utime_diff); + if (p->utime_ns > JIFFY_NS) { + p->utime_ns = cputime_sub(p->utime_ns, JIFFY_NS); + p->utime = cputime_add(p->utime, + jiffies_to_cputime(1)); + } + /* cpu scheduler quota accounting is performed here */ + if (p->policy != SCHED_FIFO) + p->time_slice -= time_diff; + + } else { + cpustat->idle_ns = cputime64_add(cpustat->idle_ns, time_diff); + if (cpustat->idle_ns > JIFFY_NS) { + cpustat->idle_ns = cputime64_sub(cpustat->idle_ns, + JIFFY_NS); + cpustat->idle = cputime64_add(cpustat->idle, 1); + } + } +out_set: + /* + * We still need to set these values even if the clock appeared to + * go backwards in case _this_ is the correct timestamp. 
+ */ + rq->most_recent_timestamp = p->last_ran = now; } /* @@ -3247,8 +3328,6 @@ void account_system_time(struct task_str cpustat->system = cputime64_add(cpustat->system, tmp); else if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait = cputime64_add(cpustat->iowait, tmp); - else - cpustat->idle = cputime64_add(cpustat->idle, tmp); /* Account for system time used */ acct_update_integrals(p); } @@ -3283,7 +3362,6 @@ static void task_expired_entitlement(str struct prio_array *old_array; int old_prio; - set_tsk_need_resched(p); if (unlikely(p->first_time_slice)) p->first_time_slice = 0; if (rt_task(p)) { @@ -3297,114 +3375,24 @@ static void task_expired_entitlement(str requeue_task(p, rq, old_array, old_prio); } -/* - * A major priority rotation occurs when all priority quotas for this array - * have been exhausted. - */ -static inline void major_prio_rotation(struct rq *rq) -{ - struct prio_array *new_array = rq->expired; - - rq->expired = rq->active; - rq->active = new_array; - rq->exp_bitmap = rq->expired->prio_bitmap; - rq->dyn_bitmap = rq->active->prio_bitmap; - rq->prio_rotation++; -} - -/* - * This is the heart of the virtual deadline priority management. - * - * We have used up the quota allocated to this priority level so we rotate - * the prio_level of the runqueue to the next lowest priority. We merge any - * remaining tasks at this level current_queue with the next priority and - * reset this level's queue. MAX_PRIO - 1 is a special case where we perform - * a major rotation. - */ -static inline void rotate_runqueue_priority(struct rq *rq) -{ - int new_prio_level; - struct prio_array *array; - - /* - * Make sure we don't have tasks still on the active array that - * haven't run due to not preempting a lower priority task. This can - * happen on list merging or smp balancing. 
- */ - if (unlikely(sched_find_first_bit(rq->dyn_bitmap) < rq->prio_level)) - return; - - array = rq->active; - if (rq->prio_level > MAX_PRIO - 2) { - /* Major rotation required */ - struct prio_array *new_queue = rq->expired; - - /* - * On a major rotation we move everything remaining to best - * priority on the new array. The priority matrix bitmap will - * ensure tasks only get the slots each static priority - * deserves. - */ - new_prio_level = MAX_RT_PRIO; - if (!list_empty(array->queue + rq->prio_level)) { - list_splice_tail_init(array->queue + rq->prio_level, - new_queue->queue + new_prio_level); - } - memset(rq->prio_quota, 0, ARRAY_SIZE(rq->prio_quota)); - major_prio_rotation(rq); - } else { - /* Minor rotation */ - new_prio_level = rq->prio_level + 1; - __clear_bit(rq->prio_level, rq->dyn_bitmap); - if (!list_empty(array->queue + rq->prio_level)) { - list_splice_tail_init(array->queue + rq->prio_level, - array->queue + new_prio_level); - __set_bit(new_prio_level, rq->dyn_bitmap); - } - rq_quota(rq, rq->prio_level) = 0; - } - rq->prio_level = new_prio_level; - /* - * As we are merging to a prio_level that may not have anything in - * its quota we add 1 to ensure the tasks get to run in schedule() to - * add their quota to it. - */ - rq_quota(rq, new_prio_level) += 1; -} - static void task_running_tick(struct rq *rq, struct task_struct *p) { - if (unlikely(!task_queued(p))) { - /* Task has expired but was not scheduled yet */ - set_tsk_need_resched(p); - return; - } /* SCHED_FIFO tasks never run out of timeslice. */ if (unlikely(p->policy == SCHED_FIFO)) return; + if (p->time_slice > 0) + return; spin_lock(&rq->lock); - /* - * Accounting is performed by both the task and the runqueue. This - * allows frequently sleeping tasks to get their proper quota of - * cpu as the runqueue will have their quota still available at - * the appropriate priority level. 
It also means frequently waking - * tasks that might miss the scheduler_tick() will get forced down - * priority regardless. - */ - if (!--p->time_slice) - task_expired_entitlement(rq, p); - /* - * We only employ the deadline mechanism if we run over the quota. - * It allows aliasing problems around the scheduler_tick to be - * less harmful. - */ - if (!rt_task(p) && --rq_quota(rq, rq->prio_level) < 0) { - if (unlikely(p->first_time_slice)) - p->first_time_slice = 0; - rotate_runqueue_priority(rq); + if (unlikely(!task_queued(p))) { + /* Task has expired but was not scheduled off yet */ set_tsk_need_resched(p); + goto out_unlock; } + /* p->time_slice <= 0 */ + task_expired_entitlement(rq, p); + set_tsk_need_resched(p); +out_unlock: spin_unlock(&rq->lock); } @@ -3469,87 +3457,45 @@ EXPORT_SYMBOL(sub_preempt_count); #endif -/* Is a dynamic_prio part of the allocated slots for this static_prio */ -static inline int entitled_slot(int static_prio, int dynamic_prio) -{ - return !test_bit(USER_PRIO(dynamic_prio), - prio_matrix[USER_PRIO(static_prio)]); -} - /* - * If a task is queued at a priority that isn't from its bitmap we exchange - * by setting one of the entitlement bits. - */ -static inline void exchange_slot(struct task_struct *p, int prio) -{ - int slot = next_prio_slot(p, prio); - - if (slot < MAX_PRIO) - __set_bit(USER_PRIO(slot), p->bitmap); -} - -/* - * next_dynamic_task finds the next suitable dynamic task. As the dyn_bitmap - * contains all the active and expired dynamic tasks sequentially we only - * need to do one bitmap lookup. + * next_dynamic_task finds the next suitable dynamic task. */ static inline struct task_struct *next_dynamic_task(struct rq *rq, int idx) { struct task_struct *next; struct list_head *queue; struct prio_array *array = rq->active; - int expirations = 0; retry: if (idx >= MAX_PRIO) { - BUG_ON(++expirations > 1); - /* - * We have selected a bit from the expired range so there are - * no more tasks in the active array. 
- */ - major_prio_rotation(rq); - array = rq->active; + /* There are no more tasks in the active array. Swap arrays */ + array = rq->expired; + rq->expired = rq->active; + rq->active = array; + rq->exp_bitmap = rq->expired->prio_bitmap; + rq->dyn_bitmap = rq->active->prio_bitmap; + rq->best_static_prio = MAX_PRIO - 1; + rq->prio_rotation++; idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO); } - if (unlikely(list_empty(array->queue + idx))) { + queue = array->queue + idx; + next = list_entry(queue->next, struct task_struct, run_list); + if (unlikely(next->time_slice < 0)) { /* - * This can happen because they are not always cleared on - * dequeue_task since they may have been dequeued while - * waiting on a runqueue and a rotation has occurred in the - * interim. A very rare occurrence. + * Unlucky enough that this task ran out of time_slice + * before it hit a scheduler_tick so it should have its + * priority reassessed and choose another task (possibly + * the same one) */ - __clear_bit(idx, rq->dyn_bitmap); - idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, idx + 1); + task_expired_entitlement(rq, next); + idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO); goto retry; } - queue = array->queue + idx; - next = list_entry(queue->next, struct task_struct, run_list); - /* - * When the task is chosen it is checked to see if its quota has been - * added to this runqueue level which is only performed once per - * level per major rotation for each running task. 
- */ - if (next->rotation != rq->prio_rotation) { - /* Task has moved during major rotation */ - task_new_array(next, rq); - if (!entitled_slot(next->static_prio, idx)) - exchange_slot(next, idx); - set_task_entitlement(next); - rq_quota(rq, idx) += next->quota; - } else if (!test_bit(USER_PRIO(idx), next->bitmap)) { - /* Task has moved during minor rotation */ - if (!entitled_slot(next->static_prio, idx)) - exchange_slot(next, idx); - set_task_entitlement(next); - rq_quota(rq, idx) += next->quota; - } rq->prio_level = idx; - /* - * next needs to have its prio and array reset here in case the - * values are wrong due to priority rotation. - */ - next->prio = idx; - next->array = array; + next->rotation = rq->prio_rotation; + if (next->static_prio < rq->best_static_prio && + next->policy != SCHED_BATCH) + rq->best_static_prio = next->static_prio; return next; } @@ -3632,8 +3578,12 @@ need_resched_nonpreemptible: next = list_entry(queue->next, struct task_struct, run_list); } switch_tasks: - if (next == rq->idle) + if (next == rq->idle) { + rq->best_static_prio = MAX_PRIO - 1; + rq->prio_level = MAX_RT_PRIO; + rq->prio_rotation++; schedstat_inc(rq, sched_goidle); + } prefetch(next); prefetch_stack(next); clear_tsk_need_resched(prev); @@ -4648,8 +4598,9 @@ asmlinkage long sys_sched_getaffinity(pi * sys_sched_yield - yield the current processor to other threads. * * This function yields the current CPU by moving the calling thread - * to the end of its current priority queue. If there are no other - * threads running on this cpu this function will return. + * to the expired array if SCHED_NORMAL or the end of its current priority + * queue if a realtime task. If there are no other threads running on this + * cpu this function will return. 
*/ asmlinkage long sys_sched_yield(void) { @@ -4659,8 +4610,15 @@ asmlinkage long sys_sched_yield(void) schedstat_inc(rq, yld_cnt); if (rq->nr_running == 1) schedstat_inc(rq, yld_both_empty); - else - list_move_tail(&p->run_list, p->array->queue + p->prio); + else { + struct prio_array *old_array = p->array; + int old_prio = p->prio; + + /* p->prio will be updated in requeue_task via queue_expired */ + if (!rt_task(p)) + p->array = rq->expired; + requeue_task(p, rq, old_array, old_prio); + } /* * Since we are going to call schedule() anyway, there's @@ -4999,9 +4957,9 @@ void __cpuinit init_idle(struct task_str struct rq *rq = cpu_rq(cpu); unsigned long flags; - bitmap_zero(idle->bitmap, PRIO_RANGE + 1); - idle->timestamp = sched_clock(); - idle->array = NULL; + bitmap_zero(idle->bitmap, PRIO_RANGE); + idle->timestamp = idle->last_ran = sched_clock(); + idle->array = rq->active; idle->prio = idle->normal_prio = NICE_TO_PRIO(0); idle->state = TASK_RUNNING; idle->cpus_allowed = cpumask_of_cpu(cpu); @@ -7083,6 +7041,7 @@ void __init sched_init(void) lockdep_set_class(&rq->lock, &rq->rq_lock_key); rq->nr_running = 0; rq->prio_rotation = 0; + rq->best_static_prio = MAX_PRIO - 1; rq->prio_level = MAX_RT_PRIO; rq->active = rq->arrays; rq->expired = rq->arrays + 1; @@ -7110,15 +7069,13 @@ void __init sched_init(void) /* delimiter for bitsearch */ __set_bit(MAX_PRIO, array->prio_bitmap); } - for (k = 0; k < PRIO_RANGE; k++) - rq->prio_quota[k] = 0; highest_cpu = i; /* Every added cpu increases the rr_interval */ rr_us += rr_inc; rr_inc /= 2; } - rr_interval = rr_us / 1000 ? 
: 1; + rr_interval = rr_us / 1000; set_load_weight(&init_task); Index: linux-2.6.21-rc4-mm1/kernel/sysctl.c =================================================================== --- linux-2.6.21-rc4-mm1.orig/kernel/sysctl.c 2007-03-21 20:53:50.000000000 +1100 +++ linux-2.6.21-rc4-mm1/kernel/sysctl.c 2007-03-22 11:41:54.000000000 +1100 @@ -79,6 +79,7 @@ extern int percpu_pagelist_fraction; extern int compat_log; extern int maps_protect; extern int print_fatal_signals; +extern int rr_interval; #if defined(CONFIG_ADAPTIVE_READAHEAD) extern int readahead_ratio; @@ -167,6 +168,13 @@ int sysctl_legacy_va_layout; #endif +/* Constants for minimum and maximum testing in kern_table and vm_table. + We use these as one-element integer vectors. */ +static int __read_mostly zero; +static int __read_mostly one = 1; +static int __read_mostly one_hundred = 100; + + /* The default sysctl tables: */ static ctl_table root_table[] = { @@ -515,6 +523,17 @@ static ctl_table kern_table[] = { .mode = 0444, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "rr_interval", + .data = &rr_interval, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &one, + .extra2 = &one_hundred, + }, #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { .ctl_name = KERN_UNKNOWN_NMI_PANIC, @@ -631,12 +650,6 @@ static ctl_table kern_table[] = { { .ctl_name = 0 } }; -/* Constants for minimum and maximum testing in vm_table. - We use these as one-element integer vectors. 
*/ -static int zero; -static int one_hundred = 100; - - static ctl_table vm_table[] = { { .ctl_name = VM_OVERCOMMIT_MEMORY, Index: linux-2.6.21-rc4-mm1/include/linux/kernel_stat.h =================================================================== --- linux-2.6.21-rc4-mm1.orig/include/linux/kernel_stat.h 2007-03-26 09:04:10.000000000 +1000 +++ linux-2.6.21-rc4-mm1/include/linux/kernel_stat.h 2007-03-26 09:04:54.000000000 +1000 @@ -16,11 +16,14 @@ struct cpu_usage_stat { cputime64_t user; + cputime64_t user_ns; cputime64_t nice; + cputime64_t nice_ns; cputime64_t system; cputime64_t softirq; cputime64_t irq; cputime64_t idle; + cputime64_t idle_ns; cputime64_t iowait; cputime64_t steal; }; Index: linux-2.6.21-rc4-mm1/include/linux/sched.h =================================================================== --- linux-2.6.21-rc4-mm1.orig/include/linux/sched.h 2007-03-26 09:04:10.000000000 +1000 +++ linux-2.6.21-rc4-mm1/include/linux/sched.h 2007-03-26 09:36:43.000000000 +1000 @@ -853,18 +853,15 @@ struct task_struct { unsigned int policy; cpumask_t cpus_allowed; - unsigned int time_slice; + int time_slice; /* * How much this task is entitled to run at the current priority * before being requeued at a lower priority. */ + int quota; + /* How much this task receives at each priority level */ unsigned int first_time_slice; /* Is this the very first time_slice this task has ever run. 
*/ - unsigned int quota; - /* - * How much this task contributes to the current priority queue - * length - */ #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; @@ -911,7 +908,7 @@ struct task_struct { int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ unsigned int rt_priority; - cputime_t utime, stime; + cputime_t utime, utime_ns, stime; unsigned long nvcsw, nivcsw; /* context switch counts */ struct timespec start_time; /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ Index: linux-2.6.21-rc4-mm1/kernel/timer.c =================================================================== --- linux-2.6.21-rc4-mm1.orig/kernel/timer.c 2007-03-26 09:04:10.000000000 +1000 +++ linux-2.6.21-rc4-mm1/kernel/timer.c 2007-03-26 09:04:54.000000000 +1000 @@ -1196,10 +1196,9 @@ void update_process_times(int user_tick) int cpu = smp_processor_id(); /* Note: this timer irq context must be accounted for as well. */ - if (user_tick) - account_user_time(p, jiffies_to_cputime(1)); - else + if (!user_tick) account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1)); + /* User time is accounted for in update_cpu_clock in sched.c */ run_local_timers(); if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_tick); Index: linux-2.6.21-rc4-mm1/Documentation/cpu-load.txt =================================================================== --- linux-2.6.21-rc4-mm1.orig/Documentation/cpu-load.txt 2007-03-26 09:04:10.000000000 +1000 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,113 +0,0 @@ -CPU load --------- - -Linux exports various bits of information via `/proc/stat' and -`/proc/uptime' that userland tools, such as top(1), use to calculate -the average time system spent in a particular state, for example: - - $ iostat - Linux 2.6.18.3-exp (linmac) 02/20/2007 - - avg-cpu: %user %nice %system %iowait %steal %idle - 10.01 0.00 2.92 5.44 0.00 81.63 - - ... 
- -Here the system thinks that over the default sampling period the -system spent 10.01% of the time doing work in user space, 2.92% in the -kernel, and was overall 81.63% of the time idle. - -In most cases the `/proc/stat' information reflects the reality quite -closely, however due to the nature of how/when the kernel collects -this data sometimes it can not be trusted at all. - -So how is this information collected? Whenever timer interrupt is -signalled the kernel looks what kind of task was running at this -moment and increments the counter that corresponds to this tasks -kind/state. The problem with this is that the system could have -switched between various states multiple times between two timer -interrupts yet the counter is incremented only for the last state. - - -Example -------- - -If we imagine the system with one task that periodically burns cycles -in the following manner: - - time line between two timer interrupts -|--------------------------------------| - ^ ^ - |_ something begins working | - |_ something goes to sleep - (only to be awaken quite soon) - -In the above situation the system will be 0% loaded according to the -`/proc/stat' (since the timer interrupt will always happen when the -system is executing the idle handler), but in reality the load is -closer to 99%. - -One can imagine many more situations where this behavior of the kernel -will lead to quite erratic information inside `/proc/stat'. 
- - -/* gcc -o hog smallhog.c */ -#include -#include -#include -#include -#define HIST 10 - -static volatile sig_atomic_t stop; - -static void sighandler (int signr) -{ - (void) signr; - stop = 1; -} -static unsigned long hog (unsigned long niters) -{ - stop = 0; - while (!stop && --niters); - return niters; -} -int main (void) -{ - int i; - struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 }, - .it_value = { .tv_sec = 0, .tv_usec = 1 } }; - sigset_t set; - unsigned long v[HIST]; - double tmp = 0.0; - unsigned long n; - signal (SIGALRM, &sighandler); - setitimer (ITIMER_REAL, &it, NULL); - - hog (ULONG_MAX); - for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX); - for (i = 0; i < HIST; ++i) tmp += v[i]; - tmp /= HIST; - n = tmp - (tmp / 3.0); - - sigemptyset (&set); - sigaddset (&set, SIGALRM); - - for (;;) { - hog (n); - sigwait (&set, &i); - } - return 0; -} - - -References ----------- - -http://lkml.org/lkml/2007/2/12/6 -Documentation/filesystems/proc.txt (1.8) - - -Thanks ------- - -Con Kolivas, Pavel Machek Index: linux-2.6.21-rc4-mm1/include/linux/init_task.h =================================================================== --- linux-2.6.21-rc4-mm1.orig/include/linux/init_task.h 2007-03-26 09:30:22.000000000 +1000 +++ linux-2.6.21-rc4-mm1/include/linux/init_task.h 2007-03-26 09:36:43.000000000 +1000 @@ -131,8 +131,8 @@ extern struct group_info init_groups; .active_mm = &init_mm, \ .run_list = LIST_HEAD_INIT(tsk.run_list), \ .ioprio = 0, \ - .time_slice = HZ, \ - .quota = HZ, \ + .time_slice = 1000000000, \ + .quota = 1000000000, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ .parent = &tsk, \ .children = LIST_HEAD_INIT(tsk.children), \