Make it possible to retain interactivity and responsiveness at very high load levels by offsetting deadlines according to the fork depth from init. This has a similar effect to 'nice'ing loads that are fork heavy. 'make' is a perfect example of this: with fork_depth_penalty enabled, a 'make -j24' will be felt only as much as a plain 'make' normally would be. Note that this drastically affects CPU distribution, and also has the indirect side effect of partitioning CPU entitlement between different users. No assumption as to CPU distribution should be made based on past behaviour. This is achieved by distinguishing forks that create new processes from those that create new threads. When a new process is detected, its fork depth is inherited from its parent across fork() and is then incremented by one. That fork_depth is then used to scale the relative offset of its deadline (the offset is multiplied by fork_depth whenever fork_depth is greater than 1). This feature is enabled by default in this patch and can optionally be disabled. Threads are kept at the same fork_depth as their parent process, and can optionally have their CPU entitlement managed together as though they were one process by enabling the group_thread_accounting feature. This feature is disabled by default in this patch, as many desktop applications such as firefox, amarok, etc. are multithreaded. With group_thread_accounting disabled and fork_depth_penalty enabled (the defaults), CPU is favoured towards desktop applications. Extensive testing is required to ensure this does not cause regressions in common workloads. There are two sysctls to enable/disable these features.
They are in /proc/sys/kernel/ group_thread_accounting - groups CPU accounting by threads fork_depth_penalty - penalises according to depth of forking from init -ck --- include/linux/sched.h | 7 +++ kernel/sched_bfs.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++---- kernel/sysctl.c | 20 +++++++++++ 3 files changed, 108 insertions(+), 7 deletions(-) Index: linux-2.6.36-rc7-ck1/include/linux/sched.h =================================================================== --- linux-2.6.36-rc7-ck1.orig/include/linux/sched.h 2010-10-08 09:39:38.016240768 +1100 +++ linux-2.6.36-rc7-ck1/include/linux/sched.h 2010-10-08 09:39:53.575007838 +1100 @@ -1187,10 +1187,15 @@ struct task_struct { unsigned int rt_priority; #ifdef CONFIG_SCHED_BFS int time_slice; - u64 deadline; + /* Virtual deadline in niffies, and when the deadline was set */ + u64 deadline, deadline_niffy; struct list_head run_list; u64 last_ran; u64 sched_time; /* sched_clock time spent running */ + /* Number of threads currently requesting CPU time */ + unsigned long threads_running; + /* Depth of forks from init */ + int fork_depth; unsigned long rt_timeout; #else /* CONFIG_SCHED_BFS */ Index: linux-2.6.36-rc7-ck1/kernel/sched_bfs.c =================================================================== --- linux-2.6.36-rc7-ck1.orig/kernel/sched_bfs.c 2010-10-08 09:39:37.918242270 +1100 +++ linux-2.6.36-rc7-ck1/kernel/sched_bfs.c 2010-10-08 11:16:01.382198622 +1100 @@ -139,6 +139,15 @@ int rr_interval __read_mostly = 6; int sched_iso_cpu __read_mostly = 70; /* + * group_thread_accounting - sysctl to decide whether to treat whole thread + * groups as a single entity for the purposes of CPU distribution. + */ +int group_thread_accounting __read_mostly; + +/* fork_depth_penalty - Whether to penalise CPU according to fork depth. */ +int fork_depth_penalty __read_mostly = 1; + +/* * The relative length of deadline for each priority(nice) level. 
*/ static int prio_ratios[PRIO_RANGE] __read_mostly; @@ -661,11 +670,29 @@ static int isoprio_suitable(void) return !grq.iso_refractory; } +static inline u64 __task_deadline_diff(struct task_struct *p); +static inline u64 task_deadline_diff(struct task_struct *p); + /* * Adding to the global runqueue. Enter with grq locked. */ static void enqueue_task(struct task_struct *p) { + s64 max_tdd = task_deadline_diff(p); + + /* + * Make sure that when we're queueing this task again that it + * doesn't have any old deadlines from when the thread group was + * being penalised and cap the deadline to the highest it could + * be, based on the current number of threads running. + */ + if (group_thread_accounting) { + max_tdd += p->group_leader->threads_running * + __task_deadline_diff(p); + } + if (p->deadline - p->deadline_niffy > max_tdd) + p->deadline = p->deadline_niffy + max_tdd; + if (!rt_task(p)) { /* Check it hasn't gotten rt from PI */ if ((idleprio_task(p) && idleprio_suitable(p)) || @@ -967,10 +994,13 @@ static int effective_prio(struct task_st } /* - * activate_task - move a task to the runqueue. Enter with grq locked. + * activate_task - move a task to the runqueue. Enter with grq locked. The + * number of threads running is stored in the group_leader struct. */ static void activate_task(struct task_struct *p, struct rq *rq) { + unsigned long *threads_running = &p->group_leader->threads_running; + update_clocks(rq); /* @@ -987,6 +1017,14 @@ static void activate_task(struct task_st p->prio = effective_prio(p); if (task_contributes_to_load(p)) grq.nr_uninterruptible--; + /* + * Adjust deadline according to number of running threads within + * this thread group. This ends up distributing CPU to the thread + * group as a single entity. 
+ */ + ++*threads_running; + if (*threads_running > 1 && group_thread_accounting) + p->deadline += __task_deadline_diff(p); enqueue_task(p); grq.nr_running++; inc_qnr(); @@ -998,9 +1036,14 @@ static void activate_task(struct task_st */ static inline void deactivate_task(struct task_struct *p) { + unsigned long *threads_running = &p->group_leader->threads_running; + if (task_contributes_to_load(p)) grq.nr_uninterruptible++; grq.nr_running--; + --*threads_running; + if (*threads_running > 0 && group_thread_accounting) + p->deadline -= __task_deadline_diff(p); } #ifdef CONFIG_SMP @@ -1635,6 +1678,10 @@ void wake_up_new_task(struct task_struct parent = p->parent; /* Unnecessary but small chance that the parent changed CPU */ set_task_cpu(p, task_cpu(parent)); + if (!(clone_flags & CLONE_THREAD)) { + p->fork_depth++; + p->threads_running = 0; + } activate_task(p, rq); trace_sched_wakeup_new(p, 1); if (!(clone_flags & CLONE_VM) && rq->curr == parent && @@ -2524,11 +2571,20 @@ static inline u64 prio_deadline_diff(int return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); } -static inline u64 task_deadline_diff(struct task_struct *p) +static inline u64 __task_deadline_diff(struct task_struct *p) { return prio_deadline_diff(TASK_USER_PRIO(p)); } +static inline u64 task_deadline_diff(struct task_struct *p) +{ + u64 pdd = __task_deadline_diff(p); + + if (fork_depth_penalty && p->fork_depth > 1) + pdd *= p->fork_depth; + return pdd; +} + static inline u64 static_deadline_diff(int static_prio) { return prio_deadline_diff(USER_PRIO(static_prio)); @@ -2545,8 +2601,24 @@ static inline int ms_longest_deadline_di */ static void time_slice_expired(struct task_struct *p) { + u64 tdd = task_deadline_diff(p); + + /* + * We proportionately increase the deadline according to how many + * threads are running. This effectively makes a thread group have + * the same CPU as one task, no matter how many threads are running. 
+ * time_slice_expired can be called when there may be none running + * when p is deactivated so we must explicitly test for more than 1. + */ + if (group_thread_accounting) { + unsigned long *threads_running = &p->group_leader->threads_running; + + if (*threads_running > 1) + tdd += *threads_running * __task_deadline_diff(p); + } p->time_slice = timeslice(); - p->deadline = grq.niffies + task_deadline_diff(p); + p->deadline_niffy = grq.niffies; + p->deadline = grq.niffies + tdd; } /* @@ -3513,7 +3585,7 @@ SYSCALL_DEFINE1(nice, int, increment) * * This is the priority value as seen by users in /proc. * RT tasks are offset by -100. Normal tasks are centered around 1, value goes - * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO). + * from 0 (SCHED_ISO) upwards (to nice +19 SCHED_IDLEPRIO). */ int task_prio(const struct task_struct *p) { @@ -3525,8 +3597,12 @@ int task_prio(const struct task_struct * /* Convert to ms to avoid overflows */ delta = NS_TO_MS(p->deadline - grq.niffies); - delta = delta * 40 / ms_longest_deadline_diff(); - if (delta > 0 && delta <= 80) + if (fork_depth_penalty) + delta *= 4; + else + delta *= 40; + delta /= ms_longest_deadline_diff(); + if (delta > 0) prio += delta; if (idleprio_task(p)) prio += 40; Index: linux-2.6.36-rc7-ck1/kernel/sysctl.c =================================================================== --- linux-2.6.36-rc7-ck1.orig/kernel/sysctl.c 2010-10-08 09:39:11.603648964 +1100 +++ linux-2.6.36-rc7-ck1/kernel/sysctl.c 2010-10-08 09:39:53.579007778 +1100 @@ -121,6 +121,8 @@ static int __maybe_unused one_hundred = #ifdef CONFIG_SCHED_BFS extern int rr_interval; extern int sched_iso_cpu; +extern int group_thread_accounting; +extern int fork_depth_penalty; static int __read_mostly one_thousand = 1000; #endif #ifdef CONFIG_PRINTK @@ -834,6 +836,24 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one_hundred, }, + { + .procname = "group_thread_accounting", + .data = &group_thread_accounting, + .maxlen = 
sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "fork_depth_penalty", + .data = &fork_depth_penalty, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #endif #if defined(CONFIG_S390) && defined(CONFIG_SMP) {