--- Documentation/sysctl/kernel.txt | 12 +++ kernel/sched.c | 123 ++++++++++++++++++++++++---------------- kernel/sysctl.c | 25 ++++++-- 3 files changed, 105 insertions(+), 55 deletions(-) Index: linux-2.6.20.3-rsdl/Documentation/sysctl/kernel.txt =================================================================== --- linux-2.6.20.3-rsdl.orig/Documentation/sysctl/kernel.txt 2007-02-05 22:51:59.000000000 +1100 +++ linux-2.6.20.3-rsdl/Documentation/sysctl/kernel.txt 2007-03-22 13:19:15.000000000 +1100 @@ -43,6 +43,7 @@ show up in /proc/sys/kernel: - printk - real-root-dev ==> Documentation/initrd.txt - reboot-cmd [ SPARC only ] +- rr_interval - rtsig-max - rtsig-nr - sem @@ -288,6 +289,17 @@ rebooting. ??? ============================================================== +rr_interval: + +This is the smallest duration that any cpu process scheduling unit +will run for. Increasing this value can increase throughput of cpu +bound tasks substantially but at the expense of increased latencies +overall. This value is in _ticks_ and the default value chosen depends +on the number of cpus available at scheduler initialisation. Valid +values are from 1-100. + +============================================================== + rtsig-max & rtsig-nr: The file rtsig-max can be used to tune the maximum number Index: linux-2.6.20.3-rsdl/kernel/sched.c =================================================================== --- linux-2.6.20.3-rsdl.orig/kernel/sched.c 2007-03-17 00:03:17.000000000 +1100 +++ linux-2.6.20.3-rsdl/kernel/sched.c 2007-03-22 13:20:17.000000000 +1100 @@ -17,7 +17,7 @@ * 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin * 2007-03-02 Rotating Staircase deadline scheduling policy by Con Kolivas - * RSDL v0.31 + * RSDL v0.32 */ #include @@ -82,8 +82,10 @@ /* * This is the time all tasks within the same priority round robin. * Set to a minimum of 8ms. Scales with number of cpus and rounds with HZ. + * Tunable via /proc interface. */ -static unsigned int rr_interval __read_mostly; +int rr_interval __read_mostly; + #define RR_INTERVAL 8 #define DEF_TIMESLICE (rr_interval * 20) @@ -160,8 +162,11 @@ struct rq { struct prio_array *active, *expired, arrays[2]; unsigned long *dyn_bitmap, *exp_bitmap; - int prio_level; - /* The current dynamic priority level this runqueue is at */ + int prio_level, best_static_prio; + /* + * The current dynamic priority level this runqueue is at, and the + * best static priority queued this major rotation. + */ unsigned long prio_rotation; /* How many times we have rotated the priority queue */ @@ -646,19 +651,40 @@ static inline void task_new_array(struct p->rotation = rq->prio_rotation; } +/* Find the first slot from the relevant prio_matrix entry */ static inline int first_prio_slot(struct task_struct *p) { return SCHED_PRIO(find_first_zero_bit( prio_matrix[USER_PRIO(p->static_prio)], PRIO_RANGE)); } -static inline int next_prio_slot(struct task_struct *p, int prio) +/* Is a dynamic_prio part of the allocated slots for this static_prio */ +static inline int entitled_slot(int static_prio, int dynamic_prio) +{ + return !test_bit(USER_PRIO(dynamic_prio), + prio_matrix[USER_PRIO(static_prio)]); +} + +/* + * Find the first unused slot by this task that is also in its prio_matrix + * level. Ensure that the prio_level is not unnecessarily low by checking + * that best_static_prio this major rotation was not a niced task. + * SCHED_BATCH tasks do not perform this check so they do not induce + * latencies in tasks of any nice level. + */ +static inline int next_entitled_slot(struct task_struct *p, struct rq *rq) { - DECLARE_BITMAP(tmp, PRIO_RANGE); - bitmap_or(tmp, p->bitmap, prio_matrix[USER_PRIO(p->static_prio)], - PRIO_RANGE); - return SCHED_PRIO(find_next_zero_bit(tmp, PRIO_RANGE, - USER_PRIO(prio))); + if (p->static_prio < rq->best_static_prio && p->policy != SCHED_BATCH) + return SCHED_PRIO(find_first_zero_bit(p->bitmap, PRIO_RANGE)); + else { + DECLARE_BITMAP(tmp, PRIO_RANGE); + + bitmap_or(tmp, p->bitmap, + prio_matrix[USER_PRIO(p->static_prio)], + PRIO_RANGE); + return SCHED_PRIO(find_next_zero_bit(tmp, PRIO_RANGE, + USER_PRIO(rq->prio_level))); + } } static void queue_expired(struct task_struct *p, struct rq *rq) @@ -685,23 +711,12 @@ static void queue_expired(struct task_st static void recalc_task_prio(struct task_struct *p, struct rq *rq) { struct prio_array *array = rq->active; - int queue_prio, search_prio = MAX_RT_PRIO; - - /* - * SCHED_BATCH tasks never start at better priority than any other - * task that is already running since they are flagged as latency - * insensitive. This means they never cause greater latencies in other - * non SCHED_BATCH tasks of the same nice level, but they still will - * not be exposed to high latencies themselves. - */ - if (unlikely(p->policy == SCHED_BATCH)) - search_prio = rq->prio_level; + int queue_prio; if (p->rotation == rq->prio_rotation) { if (p->array == array) { if (p->time_slice && rq_quota(rq, p->prio)) return; - search_prio = p->prio; } else if (p->array == rq->expired) { queue_expired(p, rq); return; @@ -710,7 +725,7 @@ static void recalc_task_prio(struct task } else task_new_array(p, rq); - queue_prio = next_prio_slot(p, search_prio); + queue_prio = next_entitled_slot(p, rq); if (queue_prio >= MAX_PRIO) { queue_expired(p, rq); return; @@ -867,7 +882,7 @@ static inline int normal_prio(struct tas if (has_rt_policy(p)) return MAX_RT_PRIO-1 - p->rt_priority; /* Other tasks all have normal_prio set in recalc_task_prio */ - if (likely(p->prio >= MAX_RT_PRIO)) + if (likely(p->prio >= MAX_RT_PRIO && p->prio < MAX_PRIO)) return p->prio; else return p->static_prio; @@ -902,10 +917,10 @@ static int effective_prio(struct task_st */ static unsigned int rr_quota(struct task_struct *p) { - int neg_nice = -TASK_NICE(p), rr = rr_interval; + int nice = TASK_NICE(p), rr = rr_interval; - if (neg_nice > 6 && !rt_task(p)) { - rr *= neg_nice * neg_nice; + if (nice < -6 && !rt_task(p)) { + rr *= nice * nice; rr /= 40; } return rr; @@ -1522,7 +1537,7 @@ int fastcall wake_up_state(struct task_s return try_to_wake_up(p, state, 0); } -static void task_running_tick(struct rq *rq, struct task_struct *p); +static void task_running_tick(struct rq *rq, struct task_struct *p, int tick); /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. @@ -1584,7 +1599,7 @@ void fastcall sched_fork(struct task_str * a problem. */ current->time_slice = 1; - task_running_tick(cpu_rq(cpu), current); + task_running_tick(cpu_rq(cpu), current, 0); } local_irq_enable(); out: @@ -1659,14 +1674,16 @@ void fastcall wake_up_new_task(struct ta */ void fastcall sched_exit(struct task_struct *p) { + struct task_struct *parent; unsigned long flags; struct rq *rq; - rq = task_rq_lock(p->parent, &flags); - if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { - p->parent->time_slice += p->time_slice; - if (unlikely(p->parent->time_slice > p->quota)) - p->parent->time_slice = p->quota; + parent = p->parent; + rq = task_rq_lock(parent, &flags); + if (p->first_time_slice && task_cpu(p) == task_cpu(parent)) { + parent->time_slice += p->time_slice; + if (unlikely(parent->time_slice > parent->quota)) + parent->time_slice = parent->quota; } task_rq_unlock(rq, &flags); } @@ -3075,6 +3092,7 @@ static inline void major_prio_rotation(s rq->active = new_array; rq->exp_bitmap = rq->expired->prio_bitmap; rq->dyn_bitmap = rq->active->prio_bitmap; + rq->best_static_prio = MAX_PRIO; rq->prio_rotation++; } @@ -3138,7 +3156,7 @@ static inline void rotate_runqueue_prior rq_quota(rq, new_prio_level) += 1; } -static void task_running_tick(struct rq *rq, struct task_struct *p) +static void task_running_tick(struct rq *rq, struct task_struct *p, int tick) { if (unlikely(!task_queued(p))) { /* Task has expired but was not scheduled yet */ @@ -3161,6 +3179,13 @@ static void task_running_tick(struct rq if (!--p->time_slice) task_expired_entitlement(rq, p); /* + * If we're actually calling this function not in a scheduler_tick + * we are doing so to fix accounting across fork and should not be + * deducting anything from rq_quota. + */ + if (!tick) + goto out_unlock; + /* * We only employ the deadline mechanism if we run over the quota. * It allows aliasing problems around the scheduler_tick to be * less harmful. @@ -3171,6 +3196,7 @@ static void task_running_tick(struct rq rotate_runqueue_priority(rq); set_tsk_need_resched(p); } +out_unlock: spin_unlock(&rq->lock); } @@ -3191,7 +3217,7 @@ void scheduler_tick(void) /* Task on the idle queue */ wake_priority_sleeper(rq); else - task_running_tick(rq, p); + task_running_tick(rq, p, 1); #ifdef CONFIG_SMP update_load(rq); if (time_after_eq(jiffies, rq->next_balance)) @@ -3373,20 +3399,13 @@ EXPORT_SYMBOL(sub_preempt_count); #endif -/* Is a dynamic_prio part of the allocated slots for this static_prio */ -static inline int entitled_slot(int static_prio, int dynamic_prio) -{ - return !test_bit(USER_PRIO(dynamic_prio), - prio_matrix[USER_PRIO(static_prio)]); -} - /* * If a task is queued at a priority that isn't from its bitmap we exchange * by setting one of the entitlement bits. */ -static inline void exchange_slot(struct task_struct *p, int prio) +static inline void exchange_slot(struct task_struct *p, struct rq *rq) { - int slot = next_prio_slot(p, prio); + int slot = next_entitled_slot(p, rq); if (slot < MAX_PRIO) __set_bit(USER_PRIO(slot), p->bitmap); @@ -3428,6 +3447,7 @@ retry: } queue = array->queue + idx; next = list_entry(queue->next, struct task_struct, run_list); + rq->prio_level = idx; /* * When the task is chosen it is checked to see if its quota has been * added to this runqueue level which is only performed once per @@ -3437,17 +3457,16 @@ retry: /* Task has moved during major rotation */ task_new_array(next, rq); if (!entitled_slot(next->static_prio, idx)) - exchange_slot(next, idx); + exchange_slot(next, rq); set_task_entitlement(next); rq_quota(rq, idx) += next->quota; } else if (!test_bit(USER_PRIO(idx), next->bitmap)) { /* Task has moved during minor rotation */ if (!entitled_slot(next->static_prio, idx)) - exchange_slot(next, idx); + exchange_slot(next, rq); set_task_entitlement(next); rq_quota(rq, idx) += next->quota; } - rq->prio_level = idx; /* * next needs to have its prio and array reset here in case the * values are wrong due to priority rotation. @@ -3540,8 +3559,13 @@ need_resched_nonpreemptible: if (rq->nr_running == 1 && dependent_sleeper(cpu, rq, next)) next = rq->idle; switch_tasks: - if (next == rq->idle) + if (next == rq->idle) { + rq->best_static_prio = MAX_PRIO; + rq->prio_level = MAX_RT_PRIO; + rq->prio_rotation++; schedstat_inc(rq, sched_goidle); + } else if (next->static_prio < rq->best_static_prio) + rq->best_static_prio = next->static_prio; prefetch(next); prefetch_stack(next); clear_tsk_need_resched(prev); @@ -6892,6 +6916,7 @@ void __init sched_init(void) lockdep_set_class(&rq->lock, &rq->rq_lock_key); rq->nr_running = 0; rq->prio_rotation = 0; + rq->best_static_prio = MAX_PRIO; rq->prio_level = MAX_RT_PRIO; rq->active = rq->arrays; rq->expired = rq->arrays + 1; Index: linux-2.6.20.3-rsdl/kernel/sysctl.c =================================================================== --- linux-2.6.20.3-rsdl.orig/kernel/sysctl.c 2007-02-05 22:52:04.000000000 +1100 +++ linux-2.6.20.3-rsdl/kernel/sysctl.c 2007-03-22 13:21:26.000000000 +1100 @@ -76,6 +76,7 @@ extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; +extern int rr_interval; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -228,6 +229,13 @@ static void register_proc_table(ctl_tabl static void unregister_proc_table(ctl_table *, struct proc_dir_entry *); #endif +/* Constants for minimum and maximum testing in vm_table. + We use these as one-element integer vectors. */ +static int __read_mostly zero; +static int __read_mostly one = 1; +static int __read_mostly one_hundred = 100; + + /* The default sysctl tables: */ static ctl_table root_table[] = { @@ -676,6 +684,17 @@ static ctl_table kern_table[] = { .mode = 0444, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "rr_interval", + .data = &rr_interval, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &one, + .extra2 = &one_hundred, + }, #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { .ctl_name = KERN_UNKNOWN_NMI_PANIC, @@ -784,12 +803,6 @@ static ctl_table kern_table[] = { { .ctl_name = 0 } }; -/* Constants for minimum and maximum testing in vm_table. - We use these as one-element integer vectors. */ -static int zero; -static int one_hundred = 100; - - static ctl_table vm_table[] = { { .ctl_name = VM_OVERCOMMIT_MEMORY,