--- init/Kconfig | 1 kernel/sched/bfs.c | 168 +++++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 132 insertions(+), 37 deletions(-) Index: linux-3.8-bfs/kernel/sched/bfs.c =================================================================== --- linux-3.8-bfs.orig/kernel/sched/bfs.c 2013-03-03 16:59:37.305509469 +1100 +++ linux-3.8-bfs/kernel/sched/bfs.c 2013-03-03 21:20:50.957613110 +1100 @@ -69,6 +69,7 @@ #include #include #include +#include #include #include @@ -1052,6 +1053,13 @@ static inline void deactivate_task(struc clear_sticky(p); } +static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); + +void register_task_migration_notifier(struct notifier_block *n) +{ + atomic_notifier_chain_register(&task_migration_notifier, n); +} + #ifdef CONFIG_SMP void set_task_cpu(struct task_struct *p, unsigned int cpu) { @@ -1664,7 +1672,8 @@ static void try_to_wake_up_local(struct */ int wake_up_process(struct task_struct *p) { - return try_to_wake_up(p, TASK_ALL, 0); + WARN_ON(task_is_stopped_or_traced(p)); + return try_to_wake_up(p, TASK_NORMAL, 0); } EXPORT_SYMBOL(wake_up_process); @@ -2006,7 +2015,7 @@ context_switch(struct rq *rq, struct tas #endif /* Here we just switch the register state and the stack. */ - rcu_switch(prev, next); + context_tracking_task_switch(prev, next); switch_to(prev, next, prev); barrier(); @@ -2211,7 +2220,7 @@ static inline u64 irq_time_read(int cpu) * Called before incrementing preempt_count on {soft,}irq_enter * and before decrementing preempt_count on {soft,}irq_exit. */ -void vtime_account(struct task_struct *curr) +void irqtime_account_irq(struct task_struct *curr) { unsigned long flags; s64 delta; @@ -2241,7 +2250,7 @@ void vtime_account(struct task_struct *c irq_time_write_end(); local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(vtime_account); +EXPORT_SYMBOL_GPL(irqtime_account_irq); #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ @@ -2354,6 +2363,34 @@ static __always_inline bool steal_accoun } /* + * Accumulate raw cputime values of dead tasks (sig->[us]time) and live + * tasks (sum on group iteration) belonging to @tsk's group. + */ +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) +{ + struct signal_struct *sig = tsk->signal; + struct task_struct *t; + + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; + + rcu_read_lock(); + /* make sure we can trust tsk->thread_group list */ + if (!likely(pid_alive(tsk))) + goto out; + + t = tsk; + do { + times->utime += t->utime; + times->stime += t->stime; + times->sum_exec_runtime += task_sched_runtime(t); + } while_each_thread(tsk, t); +out: + rcu_read_unlock(); +} + +/* * On each tick, see what percentage of that tick was attributed to each * component and add the percentage to the _pc values. Once a _pc value has * accumulated one tick's worth, account for that. This means the total @@ -3375,9 +3412,9 @@ asmlinkage void __sched schedule_user(vo * we haven't yet exited the RCU idle mode. Do it here manually until * we find a better solution. */ - rcu_user_exit(); + user_exit(); schedule(); - rcu_user_enter(); + user_enter(); } #endif @@ -3479,7 +3516,7 @@ asmlinkage void __sched preempt_schedule /* Catch callers which need to be fixed */ BUG_ON(ti->preempt_count || !irqs_disabled()); - rcu_user_exit(); + user_exit(); do { add_preempt_count(PREEMPT_ACTIVE); local_irq_enable(); @@ -4527,8 +4564,14 @@ long sched_setaffinity(pid_t pid, const goto out_free_cpus_allowed; } retval = -EPERM; - if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) - goto out_unlock; + if (!check_same_owner(p)) { + rcu_read_lock(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { + rcu_read_unlock(); + goto out_unlock; + } + rcu_read_unlock(); + } retval = security_task_setscheduler(p); if (retval) @@ -4959,6 +5002,7 @@ static const char stat_nam[] = TASK_STAT void sched_show_task(struct task_struct *p) { unsigned long free = 0; + int ppid; unsigned state; state = p->state ? __ffs(p->state) + 1 : 0; @@ -4978,8 +5022,11 @@ void sched_show_task(struct task_struct #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif + rcu_read_lock(); + ppid = task_pid_nr(rcu_dereference(p->real_parent)); + rcu_read_unlock(); printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, - task_pid_nr(p), task_pid_nr(p->real_parent), + task_pid_nr(p), ppid, (unsigned long)task_thread_info(p)->flags); show_stack(p, NULL); @@ -5017,6 +5064,12 @@ void show_state_filter(unsigned long sta debug_show_all_locks(); } +void dump_cpu_task(int cpu) +{ + pr_info("Task dump for CPU %d:\n", cpu); + sched_show_task(cpu_curr(cpu)); +} + #ifdef CONFIG_SMP void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { @@ -7456,13 +7509,13 @@ void set_curr_task(int cpu, struct task_ * Use precise platform statistics if available: */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { *ut = p->utime; *st = p->stime; } -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { struct task_cputime cputime; @@ -7471,6 +7524,30 @@ void thread_group_times(struct task_stru *ut = cputime.utime; *st = cputime.stime; } + +void vtime_account_system_irqsafe(struct task_struct *tsk) +{ + unsigned long flags; + + local_irq_save(flags); + vtime_account_system(tsk); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe); + +#ifndef __ARCH_HAS_VTIME_TASK_SWITCH +void vtime_task_switch(struct task_struct *prev) +{ + if (is_idle_task(prev)) + vtime_account_idle(prev); + else + vtime_account_system(prev); + + vtime_account_user(prev); + arch_vtime_task_switch(prev); +} +#endif + #else static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) { @@ -7486,11 +7563,30 @@ static cputime_t scale_utime(cputime_t u return (__force cputime_t) temp; } -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +/* + * Adjust tick based cputime random precision against scheduler + * runtime accounting. + */ +static void cputime_adjust(struct task_cputime *curr, + struct cputime *prev, + cputime_t *ut, cputime_t *st) { - cputime_t rtime, utime = p->utime, total = utime + p->stime; + cputime_t rtime, utime, total; + + utime = curr->utime; + total = utime + curr->stime; - rtime = nsecs_to_cputime(p->sched_time); + /* + * Tick based cputime accounting depend on random scheduling + * timeslices of a task to be interrupted or not by the timer. + * Depending on these circumstances, the number of these interrupts + * may be over or under-optimistic, matching the real user and system + * cputime with a variable precision. + * + * Fix this by scaling these tick based values against the total + * runtime accounted by the CFS scheduler. + */ + rtime = nsecs_to_cputime(curr->sum_exec_runtime); if (total) utime = scale_utime(utime, rtime, total); @@ -7498,39 +7594,37 @@ void task_times(struct task_struct *p, c utime = rtime; /* - * Compare with previous values, to keep monotonicity: + * If the tick based count grows faster than the scheduler one, + * the result of the scaling may go backward. + * Let's enforce monotonicity. */ - p->prev_utime = max(p->prev_utime, utime); - p->prev_stime = max(p->prev_stime, (rtime - p->prev_utime)); + prev->utime = max(prev->utime, utime); + prev->stime = max(prev->stime, rtime - prev->utime); + + *ut = prev->utime; + *st = prev->stime; +} - *ut = p->prev_utime; - *st = p->prev_stime; +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime = { + .utime = p->utime, + .stime = p->stime, + .sum_exec_runtime = tsk_seruntime(p), + }; + + cputime_adjust(&cputime, &p->prev_cputime, ut, st); } /* * Must be called with siglock held. */ -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { - struct signal_struct *sig = p->signal; struct task_cputime cputime; - cputime_t rtime, utime, total; thread_group_cputime(p, &cputime); - - total = cputime.utime + cputime.stime; - rtime = nsecs_to_cputime(cputime.sum_exec_runtime); - - if (total) - utime = scale_utime(cputime.utime, rtime, total); - else - utime = rtime; - - sig->prev_utime = max(sig->prev_utime, utime); - sig->prev_stime = max(sig->prev_stime, (rtime - sig->prev_utime)); - - *ut = sig->prev_utime; - *st = sig->prev_stime; + cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); } #endif Index: linux-3.8-bfs/init/Kconfig =================================================================== --- linux-3.8-bfs.orig/init/Kconfig 2013-03-03 21:23:19.931750677 +1100 +++ linux-3.8-bfs/init/Kconfig 2013-03-03 21:23:33.500581043 +1100 @@ -759,6 +759,7 @@ config NUMA_BALANCING depends on ARCH_SUPPORTS_NUMA_BALANCING depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY depends on SMP && NUMA && MIGRATION + depends on !SCHED_BFS help This option adds support for automatic NUMA aware memory/task placement. The mechanism is quite primitive and is based on migrating memory when