Index: linux-2.6.21-rc4-mm1/Documentation/sysctl/kernel.txt =================================================================== --- linux-2.6.21-rc4-mm1.orig/Documentation/sysctl/kernel.txt 2007-03-21 20:53:50.000000000 +1100 +++ linux-2.6.21-rc4-mm1/Documentation/sysctl/kernel.txt 2007-03-22 11:41:54.000000000 +1100 @@ -43,6 +43,7 @@ show up in /proc/sys/kernel: - printk - real-root-dev ==> Documentation/initrd.txt - reboot-cmd [ SPARC only ] +- rr_interval - rtsig-max - rtsig-nr - sem @@ -288,6 +289,17 @@ rebooting. ??? ============================================================== +rr_interval: + +This is the smallest duration that any cpu process scheduling unit +will run for. Increasing this value can increase throughput of cpu +bound tasks substantially but at the expense of increased latencies +overall. This value is in _ticks_ and the default value chosen depends +on the number of cpus available at scheduler initialisation. Valid +values are from 1-100. + +============================================================== + rtsig-max & rtsig-nr: The file rtsig-max can be used to tune the maximum number Index: linux-2.6.21-rc4-mm1/kernel/sched.c =================================================================== --- linux-2.6.21-rc4-mm1.orig/kernel/sched.c 2007-03-21 20:53:50.000000000 +1100 +++ linux-2.6.21-rc4-mm1/kernel/sched.c 2007-03-23 17:28:19.000000000 +1100 @@ -93,8 +93,10 @@ unsigned long long __attribute__((weak)) /* * This is the time all tasks within the same priority round robin. * Set to a minimum of 8ms. Scales with number of cpus and rounds with HZ. + * Tunable via /proc interface. 
*/ -static unsigned int rr_interval __read_mostly; +int rr_interval __read_mostly; + #define RR_INTERVAL 8 #define DEF_TIMESLICE (rr_interval * 20) @@ -199,8 +201,11 @@ struct rq { struct prio_array *active, *expired, arrays[2]; unsigned long *dyn_bitmap, *exp_bitmap; - int prio_level; - /* The current dynamic priority level this runqueue is at */ + int prio_level, best_static_prio; + /* + * The current dynamic priority level this runqueue is at, and the + * best static priority queued this major rotation. + */ unsigned long prio_rotation; /* How many times we have rotated the priority queue */ @@ -686,19 +691,40 @@ static inline void task_new_array(struct p->rotation = rq->prio_rotation; } +/* Find the first slot from the relevant prio_matrix entry */ static inline int first_prio_slot(struct task_struct *p) { return SCHED_PRIO(find_first_zero_bit( prio_matrix[USER_PRIO(p->static_prio)], PRIO_RANGE)); } -static inline int next_prio_slot(struct task_struct *p, int prio) +/* Is a dynamic_prio part of the allocated slots for this static_prio */ +static inline int entitled_slot(int static_prio, int dynamic_prio) +{ + return !test_bit(USER_PRIO(dynamic_prio), + prio_matrix[USER_PRIO(static_prio)]); +} + +/* + * Find the first unused slot by this task that is also in its prio_matrix + * level. Ensure that the prio_level is not unnecessarily low by checking + * that best_static_prio this major rotation was not a niced task. + * SCHED_BATCH tasks do not perform this check so they do not induce + * latencies in tasks of any nice level. 
+ */ +static inline int next_entitled_slot(struct task_struct *p, struct rq *rq) { DECLARE_BITMAP(tmp, PRIO_RANGE); + int search_prio; + + if (p->static_prio < rq->best_static_prio && p->policy != SCHED_BATCH) + search_prio = MAX_RT_PRIO; + else + search_prio = rq->prio_level; bitmap_or(tmp, p->bitmap, prio_matrix[USER_PRIO(p->static_prio)], PRIO_RANGE); return SCHED_PRIO(find_next_zero_bit(tmp, PRIO_RANGE, - USER_PRIO(prio))); + USER_PRIO(search_prio))); } static void queue_expired(struct task_struct *p, struct rq *rq) @@ -725,23 +751,12 @@ static void queue_expired(struct task_st static void recalc_task_prio(struct task_struct *p, struct rq *rq) { struct prio_array *array = rq->active; - int queue_prio, search_prio = MAX_RT_PRIO; - - /* - * SCHED_BATCH tasks never start at better priority than any other - * task that is already running since they are flagged as latency - * insensitive. This means they never cause greater latencies in other - * non SCHED_BATCH tasks of the same nice level, but they still will - * not be exposed to high latencies themselves. 
- */ - if (unlikely(p->policy == SCHED_BATCH)) - search_prio = rq->prio_level; + int queue_prio; if (p->rotation == rq->prio_rotation) { if (p->array == array) { if (p->time_slice && rq_quota(rq, p->prio)) return; - search_prio = p->prio; } else if (p->array == rq->expired) { queue_expired(p, rq); return; @@ -750,7 +765,7 @@ static void recalc_task_prio(struct task } else task_new_array(p, rq); - queue_prio = next_prio_slot(p, search_prio); + queue_prio = next_entitled_slot(p, rq); if (queue_prio >= MAX_PRIO) { queue_expired(p, rq); return; @@ -802,7 +817,7 @@ static void requeue_task(struct task_str list_move_tail(&p->run_list, p->array->queue + p->prio); if (!rt_task(p)) { if (list_empty(old_array->queue + old_prio)) - __clear_bit(old_prio, p->array->prio_bitmap); + __clear_bit(old_prio, old_array->prio_bitmap); set_dynamic_bit(p, rq); } } @@ -907,7 +922,7 @@ static inline int normal_prio(struct tas if (has_rt_policy(p)) return MAX_RT_PRIO-1 - p->rt_priority; /* Other tasks all have normal_prio set in recalc_task_prio */ - if (likely(p->prio >= MAX_RT_PRIO)) + if (likely(p->prio >= MAX_RT_PRIO && p->prio < MAX_PRIO)) return p->prio; else return p->static_prio; @@ -942,10 +957,10 @@ static int effective_prio(struct task_st */ static unsigned int rr_quota(struct task_struct *p) { - int neg_nice = -TASK_NICE(p), rr = rr_interval; + int nice = TASK_NICE(p), rr = rr_interval; - if (neg_nice > 6 && !rt_task(p)) { - rr *= neg_nice * neg_nice; + if (nice < -6 && !rt_task(p)) { + rr *= nice * nice; rr /= 40; } return rr; @@ -1583,7 +1598,7 @@ int fastcall wake_up_state(struct task_s return try_to_wake_up(p, state, 0); } -static void task_running_tick(struct rq *rq, struct task_struct *p); +static void task_running_tick(struct rq *rq, struct task_struct *p, int tick); /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. @@ -1645,7 +1660,7 @@ void fastcall sched_fork(struct task_str * a problem. 
*/ current->time_slice = 1; - task_running_tick(cpu_rq(cpu), current); + task_running_tick(cpu_rq(cpu), current, 0); } local_irq_enable(); out: @@ -1720,14 +1735,16 @@ void fastcall wake_up_new_task(struct ta */ void fastcall sched_exit(struct task_struct *p) { + struct task_struct *parent; unsigned long flags; struct rq *rq; - rq = task_rq_lock(p->parent, &flags); - if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { - p->parent->time_slice += p->time_slice; - if (unlikely(p->parent->time_slice > p->quota)) - p->parent->time_slice = p->quota; + parent = p->parent; + rq = task_rq_lock(parent, &flags); + if (p->first_time_slice && task_cpu(p) == task_cpu(parent)) { + parent->time_slice += p->time_slice; + if (unlikely(parent->time_slice > parent->quota)) + parent->time_slice = parent->quota; } task_rq_unlock(rq, &flags); } @@ -2057,25 +2074,54 @@ void sched_exec(void) } /* + * This is a unique version of enqueue_task for the SMP case where a task + * has just been moved across runqueues. It uses the information from the + * old runqueue to help it make a decision much like recalc_task_prio. As + * the new runqueue is almost certainly at a different prio_level than the + * src_rq it is cheapest just to pick the next entitled slot. 
+ */ +static inline void enqueue_pulled_task(struct rq *src_rq, struct rq *rq, + struct task_struct *p) +{ + int queue_prio; + + p->array = rq->active; + if (!rt_task(p)) { + if (p->rotation == src_rq->prio_rotation) { + if (p->array == src_rq->expired) { + queue_expired(p, rq); + goto out_queue; + } + } else + task_new_array(p, rq); + } + queue_prio = next_entitled_slot(p, rq); + if (queue_prio >= MAX_PRIO) { + queue_expired(p, rq); + goto out_queue; + } + rq_quota(rq, queue_prio) += p->quota; + p->prio = queue_prio; +out_queue: + p->normal_prio = p->prio; + p->rotation = rq->prio_rotation; + sched_info_queued(p); + set_dynamic_bit(p, rq); + list_add_tail(&p->run_list, p->array->queue + p->prio); +} + +/* * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. */ -static void pull_task(struct rq *src_rq, struct prio_array *src_array, - struct task_struct *p, struct rq *this_rq, - int this_cpu) +static void pull_task(struct rq *src_rq, struct task_struct *p, + struct rq *this_rq, int this_cpu) { dequeue_task(p, src_rq); dec_nr_running(p, src_rq); set_task_cpu(p, this_cpu); inc_nr_running(p, this_rq); - - /* - * If this task has already been running on src_rq this priority - * cycle, make the new runqueue think it has been on its cycle - */ - if (p->rotation == src_rq->prio_rotation) - p->rotation = this_rq->prio_rotation; - enqueue_task(p, this_rq); + enqueue_pulled_task(src_rq, this_rq, p); p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) + this_rq->most_recent_timestamp; try_preempt(p, this_rq); @@ -2220,7 +2266,7 @@ skip_queue: goto skip_bitmap; } - pull_task(busiest, array, tmp, this_rq, this_cpu); + pull_task(busiest, tmp, this_rq, this_cpu); pulled++; rem_load_move -= tmp->load_weight; @@ -3309,6 +3355,7 @@ static inline void major_prio_rotation(s rq->active = new_array; rq->exp_bitmap = rq->expired->prio_bitmap; rq->dyn_bitmap = rq->active->prio_bitmap; + rq->best_static_prio = MAX_PRIO - 1; 
rq->prio_rotation++; } @@ -3372,7 +3419,7 @@ static inline void rotate_runqueue_prior rq_quota(rq, new_prio_level) += 1; } -static void task_running_tick(struct rq *rq, struct task_struct *p) +static void task_running_tick(struct rq *rq, struct task_struct *p, int tick) { if (unlikely(!task_queued(p))) { /* Task has expired but was not scheduled yet */ @@ -3395,6 +3442,13 @@ static void task_running_tick(struct rq if (!--p->time_slice) task_expired_entitlement(rq, p); /* + * If we're actually calling this function not in a scheduler_tick + * we are doing so to fix accounting across fork and should not be + * deducting anything from rq_quota. + */ + if (!tick) + goto out_unlock; + /* * We only employ the deadline mechanism if we run over the quota. * It allows aliasing problems around the scheduler_tick to be * less harmful. @@ -3405,6 +3459,7 @@ static void task_running_tick(struct rq rotate_runqueue_priority(rq); set_tsk_need_resched(p); } +out_unlock: spin_unlock(&rq->lock); } @@ -3423,7 +3478,7 @@ void scheduler_tick(void) update_cpu_clock(p, rq, now); if (!idle_at_tick) - task_running_tick(rq, p); + task_running_tick(rq, p, 1); #ifdef CONFIG_SMP update_load(rq); rq->idle_at_tick = idle_at_tick; @@ -3469,20 +3524,13 @@ EXPORT_SYMBOL(sub_preempt_count); #endif -/* Is a dynamic_prio part of the allocated slots for this static_prio */ -static inline int entitled_slot(int static_prio, int dynamic_prio) -{ - return !test_bit(USER_PRIO(dynamic_prio), - prio_matrix[USER_PRIO(static_prio)]); -} - /* * If a task is queued at a priority that isn't from its bitmap we exchange * by setting one of the entitlement bits. 
*/ -static inline void exchange_slot(struct task_struct *p, int prio) +static inline void exchange_slot(struct task_struct *p, struct rq *rq) { - int slot = next_prio_slot(p, prio); + int slot = next_entitled_slot(p, rq); if (slot < MAX_PRIO) __set_bit(USER_PRIO(slot), p->bitmap); @@ -3524,6 +3572,7 @@ retry: } queue = array->queue + idx; next = list_entry(queue->next, struct task_struct, run_list); + rq->prio_level = idx; /* * When the task is chosen it is checked to see if its quota has been * added to this runqueue level which is only performed once per @@ -3533,23 +3582,25 @@ retry: /* Task has moved during major rotation */ task_new_array(next, rq); if (!entitled_slot(next->static_prio, idx)) - exchange_slot(next, idx); + exchange_slot(next, rq); set_task_entitlement(next); rq_quota(rq, idx) += next->quota; } else if (!test_bit(USER_PRIO(idx), next->bitmap)) { /* Task has moved during minor rotation */ if (!entitled_slot(next->static_prio, idx)) - exchange_slot(next, idx); + exchange_slot(next, rq); set_task_entitlement(next); rq_quota(rq, idx) += next->quota; } - rq->prio_level = idx; /* * next needs to have its prio and array reset here in case the * values are wrong due to priority rotation. */ next->prio = idx; next->array = array; + if (next->static_prio < rq->best_static_prio && + next->policy != SCHED_BATCH) + rq->best_static_prio = next->static_prio; return next; } @@ -3632,8 +3683,12 @@ need_resched_nonpreemptible: next = list_entry(queue->next, struct task_struct, run_list); } switch_tasks: - if (next == rq->idle) + if (next == rq->idle) { + rq->best_static_prio = MAX_PRIO - 1; + rq->prio_level = MAX_RT_PRIO; + rq->prio_rotation++; schedstat_inc(rq, sched_goidle); + } prefetch(next); prefetch_stack(next); clear_tsk_need_resched(prev); @@ -4648,8 +4703,9 @@ asmlinkage long sys_sched_getaffinity(pi * sys_sched_yield - yield the current processor to other threads. 
* * This function yields the current CPU by moving the calling thread - * to the end of its current priority queue. If there are no other - * threads running on this cpu this function will return. + * to the expired array if SCHED_NORMAL or the end of its current priority + * queue if a realtime task. If there are no other threads running on this + * cpu this function will return. */ asmlinkage long sys_sched_yield(void) { @@ -4659,8 +4715,15 @@ asmlinkage long sys_sched_yield(void) schedstat_inc(rq, yld_cnt); if (rq->nr_running == 1) schedstat_inc(rq, yld_both_empty); - else - list_move_tail(&p->run_list, p->array->queue + p->prio); + else { + struct prio_array *old_array = p->array; + int old_prio = p->prio; + + /* p->prio will be updated in requeue_task via queue_expired */ + if (!rt_task(p)) + p->array = rq->expired; + requeue_task(p, rq, old_array, old_prio); + } /* * Since we are going to call schedule() anyway, there's @@ -7083,6 +7146,7 @@ void __init sched_init(void) lockdep_set_class(&rq->lock, &rq->rq_lock_key); rq->nr_running = 0; rq->prio_rotation = 0; + rq->best_static_prio = MAX_PRIO - 1; rq->prio_level = MAX_RT_PRIO; rq->active = rq->arrays; rq->expired = rq->arrays + 1; Index: linux-2.6.21-rc4-mm1/kernel/sysctl.c =================================================================== --- linux-2.6.21-rc4-mm1.orig/kernel/sysctl.c 2007-03-21 20:53:50.000000000 +1100 +++ linux-2.6.21-rc4-mm1/kernel/sysctl.c 2007-03-22 11:41:54.000000000 +1100 @@ -79,6 +79,7 @@ extern int percpu_pagelist_fraction; extern int compat_log; extern int maps_protect; extern int print_fatal_signals; +extern int rr_interval; #if defined(CONFIG_ADAPTIVE_READAHEAD) extern int readahead_ratio; @@ -167,6 +168,13 @@ int sysctl_legacy_va_layout; #endif +/* Constants for minimum and maximum testing in kern_table and vm_table. + We use these as one-element integer vectors. 
*/ +static int __read_mostly zero; +static int __read_mostly one = 1; +static int __read_mostly one_hundred = 100; + + /* The default sysctl tables: */ static ctl_table root_table[] = { @@ -515,6 +523,17 @@ static ctl_table kern_table[] = { .mode = 0444, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "rr_interval", + .data = &rr_interval, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &one, + .extra2 = &one_hundred, + }, #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { .ctl_name = KERN_UNKNOWN_NMI_PANIC, @@ -631,12 +650,6 @@ static ctl_table kern_table[] = { { .ctl_name = 0 } }; -/* Constants for minimum and maximum testing in vm_table. - We use these as one-element integer vectors. */ -static int zero; -static int one_hundred = 100; - - static ctl_table vm_table[] = { { .ctl_name = VM_OVERCOMMIT_MEMORY,