Two knobs: group_thread_accounting groups CPU accounting by threads; fork_depth_penalty penalises according to depth of forking from init. -ck --- include/linux/init_task.h | 1 include/linux/sched.h | 7 +++- kernel/sched_bfs.c | 74 ++++++++++++++++++++++++++++++++++++++++++---- kernel/sysctl.c | 20 ++++++++++++ 4 files changed, 96 insertions(+), 6 deletions(-) Index: linux-2.6.35.7/include/linux/sched.h =================================================================== --- linux-2.6.35.7.orig/include/linux/sched.h 2010-10-06 08:35:33.607634739 +1100 +++ linux-2.6.35.7/include/linux/sched.h 2010-10-06 22:48:42.053820056 +1100 @@ -1192,10 +1192,15 @@ struct task_struct { unsigned int rt_priority; #ifdef CONFIG_SCHED_BFS int time_slice; - u64 deadline; + /* Virtual deadline in niffies, and when the deadline was set */ + u64 deadline, deadline_niffy; struct list_head run_list; u64 last_ran; u64 sched_time; /* sched_clock time spent running */ + /* Number of threads currently requesting CPU time */ + unsigned long threads_running; + /* Depth of forks from init */ + int fork_depth; unsigned long rt_timeout; #else /* CONFIG_SCHED_BFS */ Index: linux-2.6.35.7/kernel/sched_bfs.c =================================================================== --- linux-2.6.35.7.orig/kernel/sched_bfs.c 2010-10-06 08:35:33.601634349 +1100 +++ linux-2.6.35.7/kernel/sched_bfs.c 2010-10-06 23:48:25.592170468 +1100 @@ -137,6 +137,15 @@ int rr_interval __read_mostly = 6; int sched_iso_cpu __read_mostly = 70; /* + * group_thread_accounting - sysctl to decide whether to treat whole thread + * groups as a single entity for the purposes of CPU distribution. + */ +int group_thread_accounting __read_mostly; + +/* fork_depth_penalty - Whether to penalise CPU according to fork depth. */ +int fork_depth_penalty __read_mostly = 1; + +/* * The relative length of deadline for each priority(nice) level. 
*/ static int prio_ratios[PRIO_RANGE] __read_mostly; @@ -635,11 +644,26 @@ static int isoprio_suitable(void) return !grq.iso_refractory; } +static inline u64 task_deadline_diff(struct task_struct *p); + /* * Adding to the global runqueue. Enter with grq locked. */ static void enqueue_task(struct task_struct *p) { + s64 max_tdd = task_deadline_diff(p); + + /* + * Make sure that when we're queueing this task again that it + * doesn't have any old deadlines from when the thread group was + * being penalised and cap the deadline to the highest it could + * be, based on the current number of threads running. + */ + if (group_thread_accounting) + max_tdd *= p->group_leader->threads_running; + if (p->deadline - p->deadline_niffy > max_tdd) + p->deadline = p->deadline_niffy + max_tdd; + if (!rt_task(p)) { /* Check it hasn't gotten rt from PI */ if ((idleprio_task(p) && idleprio_suitable(p)) || @@ -939,10 +963,13 @@ static int effective_prio(struct task_st } /* - * activate_task - move a task to the runqueue. Enter with grq locked. + * activate_task - move a task to the runqueue. Enter with grq locked. The + * number of threads running is stored in the group_leader struct. */ static void activate_task(struct task_struct *p, struct rq *rq) { + unsigned long *threads_running = &p->group_leader->threads_running; + update_clocks(rq); /* @@ -959,6 +986,14 @@ static void activate_task(struct task_st p->prio = effective_prio(p); if (task_contributes_to_load(p)) grq.nr_uninterruptible--; + /* + * Adjust deadline according to number of running threads within + * this thread group. This ends up distributing CPU to the thread + * group as a single entity. 
+ */ + ++*threads_running; + if (*threads_running > 1 && group_thread_accounting) + p->deadline += task_deadline_diff(p); enqueue_task(p); grq.nr_running++; inc_qnr(); @@ -970,9 +1005,14 @@ static void activate_task(struct task_st */ static inline void deactivate_task(struct task_struct *p) { + unsigned long *threads_running = &p->group_leader->threads_running; + if (task_contributes_to_load(p)) grq.nr_uninterruptible++; grq.nr_running--; + --*threads_running; + if (*threads_running > 0 && group_thread_accounting) + p->deadline -= task_deadline_diff(p); } #ifdef CONFIG_SMP @@ -1556,6 +1596,10 @@ void wake_up_new_task(struct task_struct parent = p->parent; /* Unnecessary but small chance that the parent changed CPU */ set_task_cpu(p, task_cpu(parent)); + if (!(clone_flags & CLONE_THREAD)) { + p->fork_depth++; + p->threads_running = 0; + } activate_task(p, rq); trace_sched_wakeup_new(p, 1); if (!(clone_flags & CLONE_VM) && rq->curr == parent && @@ -2452,7 +2496,11 @@ static inline u64 prio_deadline_diff(int static inline u64 task_deadline_diff(struct task_struct *p) { - return prio_deadline_diff(TASK_USER_PRIO(p)); + u64 pdd = prio_deadline_diff(TASK_USER_PRIO(p)); + + if (fork_depth_penalty) + pdd *= p->fork_depth; + return pdd; } static inline u64 static_deadline_diff(int static_prio) @@ -2471,8 +2519,24 @@ static inline int ms_longest_deadline_di */ static void time_slice_expired(struct task_struct *p) { + u64 tdd = task_deadline_diff(p); + + /* + * We proportionately increase the deadline according to how many + * threads are running. This effectively makes a thread group have + * the same CPU as one task, no matter how many threads are running. + * time_slice_expired can be called when there may be none running + * when p is deactivated so we must explicitly test for more than 1. 
+ */ + if (group_thread_accounting) { + unsigned long *threads_running = &p->group_leader->threads_running; + + if (*threads_running > 1) + tdd *= *threads_running; + } p->time_slice = timeslice(); - p->deadline = grq.niffies + task_deadline_diff(p); + p->deadline_niffy = grq.niffies; + p->deadline = grq.niffies + tdd; } /* @@ -3426,7 +3490,7 @@ SYSCALL_DEFINE1(nice, int, increment) * * This is the priority value as seen by users in /proc. * RT tasks are offset by -100. Normal tasks are centered around 1, value goes - * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO). + * from 0 (SCHED_ISO) up to ~900 (nice +19 SCHED_IDLEPRIO). */ int task_prio(const struct task_struct *p) { @@ -3439,7 +3503,7 @@ int task_prio(const struct task_struct * /* Convert to ms to avoid overflows */ delta = NS_TO_MS(p->deadline - grq.niffies); delta = delta * 40 / ms_longest_deadline_diff(); - if (delta > 0 && delta <= 80) + if (delta > 0) prio += delta; if (idleprio_task(p)) prio += 40; Index: linux-2.6.35.7/kernel/sysctl.c =================================================================== --- linux-2.6.35.7.orig/kernel/sysctl.c 2010-10-06 14:06:08.357049153 +1100 +++ linux-2.6.35.7/kernel/sysctl.c 2010-10-06 23:15:01.139585267 +1100 @@ -119,6 +119,8 @@ static int __maybe_unused one_hundred = #ifdef CONFIG_SCHED_BFS extern int rr_interval; extern int sched_iso_cpu; +extern int group_thread_accounting; +extern int fork_depth_penalty; static int __read_mostly one_thousand = 1000; #endif #ifdef CONFIG_PRINTK @@ -805,6 +807,24 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one_hundred, }, + { + .procname = "group_thread_accounting", + .data = &group_thread_accounting, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "fork_depth_penalty", + .data = &fork_depth_penalty, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &zero, 
+ .extra2 = &one, + }, #endif #if defined(CONFIG_S390) && defined(CONFIG_SMP) { Index: linux-2.6.35.7/include/linux/init_task.h =================================================================== --- linux-2.6.35.7.orig/include/linux/init_task.h 2010-10-06 22:46:20.315739164 +1100 +++ linux-2.6.35.7/include/linux/init_task.h 2010-10-06 23:09:56.242262268 +1100 @@ -124,6 +124,7 @@ extern struct cred init_cred; .active_mm = &init_mm, \ .run_list = LIST_HEAD_INIT(tsk.run_list), \ .time_slice = HZ, \ + .fork_depth = 1, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \ .ptraced = LIST_HEAD_INIT(tsk.ptraced), \