Add barriers before grabbing grq lock. Make enqueue not inlined and optimise idleprio/iso tests. Simplify rq_idle definition now that idle is always PRIO_LIMIT when not scheduled. Fix build failure for group_first_cpu. Make function types consistent. Fix adjust_deadline for when nice changes. Make task_prio clear as to how it gets +80. Allow rr_interval to be set down to 0. Remove voluntary preemption as an option and make full preempt default. Voluntary is of questionable value and breaks on bfs. --- Documentation/sysctl/kernel.txt | 11 ++++--- kernel/Kconfig.preempt | 19 ------------ kernel/sched_bfs.c | 61 ++++++++++++++++------------------------ kernel/sysctl.c | 2 - 4 files changed, 34 insertions(+), 59 deletions(-) Index: linux-2.6.31-bfs/kernel/sched_bfs.c =================================================================== --- linux-2.6.31-bfs.orig/kernel/sched_bfs.c 2009-09-10 12:41:45.859657473 +1000 +++ linux-2.6.31-bfs/kernel/sched_bfs.c 2009-09-10 12:49:03.153657335 +1000 @@ -311,6 +311,7 @@ static inline void grq_lock(void) __acquires(grq.lock) { + smp_mb(); spin_lock(&grq.lock); } @@ -323,6 +324,7 @@ static inline void grq_lock_irq(void) __acquires(grq.lock) { + smp_mb(); spin_lock_irq(&grq.lock); } @@ -343,7 +345,7 @@ __acquires(grq.lock) { local_irq_save(*flags); - spin_lock(&grq.lock); + grq_lock(); } static inline void grq_unlock_irqrestore(unsigned long *flags) @@ -507,17 +509,12 @@ /* * Adding to the global runqueue. Enter with grq locked. 
*/ -static inline void enqueue_task(struct task_struct *p) +static void enqueue_task(struct task_struct *p) { - if (idleprio_task(p) && !rt_task(p)) { - if (idleprio_suitable(p)) - p->prio = p->normal_prio; - else - p->prio = NORMAL_PRIO; - } - - if (iso_task(p) && !rt_task(p)) { - if (isoprio_suitable()) + if (!rt_task(p)) { + /* Check it hasn't gotten rt from PI */ + if ((idleprio_task(p) && idleprio_suitable(p)) || + (iso_task(p) && isoprio_suitable())) p->prio = p->normal_prio; else p->prio = NORMAL_PRIO; @@ -550,7 +547,7 @@ * length. CPU distribution is handled by giving different deadlines to * tasks of different priorities. */ -static int task_timeslice(struct task_struct *p) +static inline int task_timeslice(struct task_struct *p) { return (rr_interval * prio_ratio(p) / 100); } @@ -904,16 +901,7 @@ EXPORT_SYMBOL_GPL(kick_process); #endif -/* - * We need to have a special definition for an idle runqueue when testing - * for preemption on CONFIG_HOTPLUG_CPU as the idle task may be scheduled as - * a realtime task in sched_idle_next. - */ -#ifdef CONFIG_HOTPLUG_CPU -#define rq_idle(rq) ((rq)->curr == (rq)->idle && !rt_task((rq)->curr)) -#else -#define rq_idle(rq) ((rq)->curr == (rq)->idle) -#endif +#define rq_idle(rq) ((rq)->queued_prio == PRIO_LIMIT) /* * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the @@ -1934,7 +1922,7 @@ * task that last woke up the longest ago has the earliest deadline, thus * ensuring that interactive tasks get low latency on wake up. */ -static inline unsigned long prio_deadline_diff(struct task_struct *p) +static inline int prio_deadline_diff(struct task_struct *p) { return (prio_ratio(p) * rr_interval * HZ / 1000 / 100) ? 
: 1; } @@ -2714,7 +2702,8 @@ */ static void adjust_deadline(struct task_struct *p, int new_prio) { - p->deadline += prio_ratios[USER_PRIO(new_prio)] - prio_ratio(p); + p->deadline += (prio_ratios[USER_PRIO(new_prio)] - prio_ratio(p)) * + rr_interval * HZ / 1000 / 100; } void set_user_nice(struct task_struct *p, long nice) @@ -2829,7 +2818,8 @@ * * This is the priority value as seen by users in /proc. * RT tasks are offset by -100. Normal tasks are centered - * around 1, value goes from 0 to +80. + * around 1, value goes from 0 (SCHED_ISO) up to 82 (nice +19 + * SCHED_IDLEPRIO). */ int task_prio(const struct task_struct *p) { @@ -2839,8 +2829,7 @@ if (prio <= 0) goto out; - /* 225 is a fudge to end up giving +80 for lowest possible prio */ - delta = (p->deadline - jiffies) * 225 / prio_ratios[39]; + delta = (p->deadline - jiffies) * 40 / longest_deadline(); if (delta > 0 && delta <= 80) prio += delta; out: @@ -4803,6 +4792,15 @@ return group; } +/** + * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. + * @group: The group whose first cpu is to be returned. + */ +static inline unsigned int group_first_cpu(struct sched_group *group) +{ + return cpumask_first(sched_group_cpus(group)); +} + #ifdef CONFIG_NUMA /* * The init_sched_build_groups can't handle what we want to do with node @@ -4898,15 +4896,6 @@ } #endif /* CONFIG_NUMA */ -/** - * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. - * @group: The group whose first cpu is to be returned. - */ -static inline unsigned int group_first_cpu(struct sched_group *group) -{ - return cpumask_first(sched_group_cpus(group)); -} - /* * Initialize sched groups cpu_power. 
* Index: linux-2.6.31-bfs/Documentation/sysctl/kernel.txt =================================================================== --- linux-2.6.31-bfs.orig/Documentation/sysctl/kernel.txt 2009-09-10 12:41:45.859657473 +1000 +++ linux-2.6.31-bfs/Documentation/sysctl/kernel.txt 2009-09-10 12:42:20.086784685 +1000 @@ -350,11 +350,14 @@ This is the smallest duration that any cpu process scheduling unit will run for. Increasing this value can increase throughput of cpu bound tasks substantially but at the expense of increased latencies -overall. This value is in milliseconds and the default value chosen -depends on the number of cpus available at scheduler initialisation -with a minimum of 6. +overall. Conversely decreasing it will decrease average and maximum +latencies but at the expense of throughput. This value is in +milliseconds and the default value chosen depends on the number of +cpus available at scheduler initialisation with a minimum of 6. The +value can be set to 0 which means no more than one tick (limited +by HZ resolution). -Valid values are from 1-5000. +Valid values are from 0-5000. 
============================================================== Index: linux-2.6.31-bfs/kernel/sysctl.c =================================================================== --- linux-2.6.31-bfs.orig/kernel/sysctl.c 2009-09-10 12:41:45.860658103 +1000 +++ linux-2.6.31-bfs/kernel/sysctl.c 2009-09-10 12:42:20.090782324 +1000 @@ -682,7 +682,7 @@ .mode = 0644, .proc_handler = &proc_dointvec_minmax, .strategy = &sysctl_intvec, - .extra1 = &one, + .extra1 = &zero, .extra2 = &five_thousand, }, { Index: linux-2.6.31-bfs/kernel/Kconfig.preempt =================================================================== --- linux-2.6.31-bfs.orig/kernel/Kconfig.preempt 2009-09-10 12:45:43.700783708 +1000 +++ linux-2.6.31-bfs/kernel/Kconfig.preempt 2009-09-10 12:46:14.724332227 +1000 @@ -1,7 +1,7 @@ choice prompt "Preemption Model" - default PREEMPT_NONE + default PREEMPT config PREEMPT_NONE bool "No Forced Preemption (Server)" @@ -16,23 +16,6 @@ raw processing power of the kernel, irrespective of scheduling latencies. -config PREEMPT_VOLUNTARY - bool "Voluntary Kernel Preemption (Desktop)" - help - This option reduces the latency of the kernel by adding more - "explicit preemption points" to the kernel code. These new - preemption points have been selected to reduce the maximum - latency of rescheduling, providing faster application reactions, - at the cost of slightly lower throughput. - - This allows reaction to interactive events by allowing a - low priority process to voluntarily preempt itself even if it - is in kernel mode executing a system call. This allows - applications to run more 'smoothly' even when the system is - under load. - - Select this if you are building a kernel for a desktop system. - config PREEMPT bool "Preemptible Kernel (Low-Latency Desktop)" help