---
 kernel/sched/bfs.c |  275 ++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 188 insertions(+), 87 deletions(-)

Index: linux-3.9-bfs/kernel/sched/bfs.c
===================================================================
--- linux-3.9-bfs.orig/kernel/sched/bfs.c	2013-05-02 22:16:30.187763678 +1000
+++ linux-3.9-bfs/kernel/sched/bfs.c	2013-05-02 22:40:35.745691717 +1000
@@ -453,6 +453,9 @@ static inline void update_clocks(struct
  * Looking up task_rq must be done under grq.lock to be safe.
  */
 static void update_rq_clock_task(struct rq *rq, s64 delta);
+static unsigned long long do_task_sched_runtime_nodelta(struct task_struct *p,
+                                                        unsigned long long *delta);
+static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq);
 
 static inline void update_rq_clock(struct rq *rq)
 {
@@ -1028,7 +1031,7 @@ static void activate_task(struct task_st
         if (unlikely(prof_on == SLEEP_PROFILING)) {
                 if (p->state == TASK_UNINTERRUPTIBLE)
                         profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
-                                     (rq->clock - p->last_ran) >> 20);
+                                     (rq->clock_task - p->last_ran) >> 20);
         }
 
         p->prio = effective_prio(p);
@@ -1690,19 +1693,15 @@ static void time_slice_expired(struct ta
  */
 void sched_fork(struct task_struct *p)
 {
-        struct task_struct *curr;
-        int cpu = get_cpu();
-        struct rq *rq;
-
 #ifdef CONFIG_PREEMPT_NOTIFIERS
         INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
         /*
-         * We mark the process as running here. This guarantees that
-         * nobody will actually run it, and a signal or other external
-         * event cannot wake it up and insert it on the runqueue either.
+         * The process state is set to the same value as that of the process
+         * executing do_fork(), i.e. running. This guarantees that nobody will
+         * actually run it, and a signal or other external event cannot wake
+         * it up and insert it on the runqueue either.
         */
-        p->state = TASK_RUNNING;
 
         /* Should be reset in fork.c but done here for ease of bfs patching */
         p->utime =
@@ -1734,20 +1733,11 @@ void sched_fork(struct task_struct *p)
                 p->sched_reset_on_fork = 0;
         }
 
-        curr = current;
-        rq = task_grq_lock_irq(curr);
-        set_task_cpu(p, cpu);
-
-        /*
-         * Make sure we do not leak PI boosting priority to the child.
-         */
-        p->prio = curr->normal_prio;
-
         INIT_LIST_HEAD(&p->run_list);
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
         if (unlikely(sched_info_on()))
                 memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
-        p->on_cpu = false;
         clear_sticky(p);
 
@@ -1755,35 +1745,6 @@
         /* Want to start with kernel preemption disabled. */
         task_thread_info(p)->preempt_count = 1;
 #endif
-        if (unlikely(p->policy == SCHED_FIFO))
-                goto out_unlock;
-        /*
-         * Share the timeslice between parent and child, thus the
-         * total amount of pending timeslices in the system doesn't change,
-         * resulting in more scheduling fairness. If it's negative, it won't
-         * matter since that's the same as being 0. current's time_slice is
-         * actually in rq_time_slice when it's running, as is its last_ran
-         * value. rq->rq_deadline is only modified within schedule() so it
-         * is always equal to current->deadline.
-         */
-        if (likely(rq->rq_time_slice >= RESCHED_US * 2)) {
-                rq->rq_time_slice /= 2;
-                p->time_slice = rq->rq_time_slice;
-        } else {
-                /*
-                 * Forking task has run out of timeslice. Reschedule it and
-                 * start its child with a new time slice and deadline. The
-                 * child will end up running first because its deadline will
-                 * be slightly earlier.
-                 */
-                rq->rq_time_slice = 0;
-                set_tsk_need_resched(curr);
-                time_slice_expired(p);
-        }
-        p->last_ran = rq->rq_last_ran;
-out_unlock:
-        task_grq_unlock_irq();
-        put_cpu();
 }
 
 /*
@@ -1799,22 +1760,68 @@ void wake_up_new_task(struct task_struct
         unsigned long flags;
         struct rq *rq;
 
-        p->state = TASK_RUNNING;
         parent = p->parent;
         rq = task_grq_lock(p, &flags);
-        /* Unnecessary but small chance that the parent changed CPU */
-        set_task_cpu(p, task_cpu(parent));
+
+        /*
+         * Reinit new task deadline as its creator deadline could have changed
+         * since call to dup_task_struct().
+         */
+        p->deadline = rq->rq_deadline;
+
+        /*
+         * If the task is a new process, current and parent are the same. If
+         * the task is a new thread in the thread group, it will have much more
+         * in common with current than with the parent.
+         */
+        set_task_cpu(p, task_cpu(rq->curr));
+
+        /*
+         * Make sure we do not leak PI boosting priority to the child.
+         */
+        p->prio = rq->curr->normal_prio;
+
         activate_task(p, rq);
         trace_sched_wakeup_new(p, 1);
-        if (rq->curr == parent && !suitable_idle_cpus(p)) {
-                /*
-                 * The VM isn't cloned, so we're in a good position to
-                 * do child-runs-first in anticipation of an exec. This
-                 * usually avoids a lot of COW overhead.
-                 */
-                resched_task(parent);
-        } else
-                try_preempt(p, rq);
+        if (unlikely(p->policy == SCHED_FIFO))
+                goto after_ts_init;
+
+        /*
+         * Share the timeslice between parent and child, thus the
+         * total amount of pending timeslices in the system doesn't change,
+         * resulting in more scheduling fairness. If it's negative, it won't
+         * matter since that's the same as being 0. current's time_slice is
+         * actually in rq_time_slice when it's running, as is its last_ran
+         * value. rq->rq_deadline is only modified within schedule() so it
+         * is always equal to current->deadline.
+         */
+        p->last_ran = rq->rq_last_ran;
+        if (likely(rq->rq_time_slice >= RESCHED_US * 2)) {
+                rq->rq_time_slice /= 2;
+                p->time_slice = rq->rq_time_slice;
+after_ts_init:
+                if (rq->curr == parent && !suitable_idle_cpus(p)) {
+                        /*
+                         * The VM isn't cloned, so we're in a good position to
+                         * do child-runs-first in anticipation of an exec. This
+                         * usually avoids a lot of COW overhead.
+                         */
+                        set_tsk_need_resched(parent);
+                } else
+                        try_preempt(p, rq);
+        } else {
+                if (rq->curr == parent) {
+                        /*
+                         * Forking task has run out of timeslice. Reschedule it and
+                         * start its child with a new time slice and deadline. The
+                         * child will end up running first because its deadline will
+                         * be slightly earlier.
+                         */
+                        rq->rq_time_slice = 0;
+                        set_tsk_need_resched(parent);
+                }
+                time_slice_expired(p);
+        }
         task_grq_unlock(&flags);
 }
 
@@ -2363,10 +2370,14 @@ static __always_inline bool steal_accoun
  * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
  * tasks (sum on group iteration) belonging to @tsk's group.
  */
-void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
+void thread_group_cputime_nodelta(struct task_struct *tsk, struct task_cputime *times,
+                                  unsigned long long *delta)
 {
         struct signal_struct *sig = tsk->signal;
         struct task_struct *t;
+        unsigned long long d = 0;
+        unsigned long long td;
+        unsigned long flags;
 
         times->utime = sig->utime;
         times->stime = sig->stime;
@@ -2378,13 +2389,53 @@ void thread_group_cputime(struct task_st
                 goto out;
 
         t = tsk;
+        grq_lock_irqsave(&flags);
         do {
                 times->utime += t->utime;
                 times->stime += t->stime;
-                times->sum_exec_runtime += task_sched_runtime(t);
+                times->sum_exec_runtime += do_task_sched_runtime_nodelta(t, &td);
+                d += td;
+        } while_each_thread(tsk, t);
+        grq_unlock_irqrestore(&flags);
+out:
+        rcu_read_unlock();
+
+        if (delta)
+                *delta = d;
+}
+
+/*
+ * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
+ * tasks (sum on group iteration) belonging to @tsk's group.
+ */
+void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
+{
+        unsigned long long d;
+        thread_group_cputime_nodelta(tsk, times, &d);
+        times->sum_exec_runtime += d;
+}
+
+unsigned long long group_delta_exec(struct task_struct *tsk)
+{
+        unsigned long long ns = 0;
+        struct task_struct *t;
+        unsigned long flags;
+
+        rcu_read_lock();
+        /* make sure we can trust tsk->thread_group list */
+        if (!likely(pid_alive(tsk)))
+                goto out;
+
+        t = tsk;
+        grq_lock_irqsave(&flags);
+        do {
+                ns += do_task_delta_exec(t, task_rq(t));
         } while_each_thread(tsk, t);
+        grq_unlock_irqrestore(&flags);
 out:
         rcu_read_unlock();
+
+        return ns;
 }
 
 /*
@@ -2430,6 +2481,16 @@ pc_system_time(struct rq *rq, struct tas
                 account_group_system_time(p, cputime_one_jiffy * jiffs);
         }
         p->sched_time += ns;
+        /*
+         * Do not update the cputimer if the task is already released by
+         * release_task().
+         *
+         * This could be executed if a tick happens when a task is inside
+         * do_exit() between the call to release_task() and its final
+         * schedule() call for autoreaping tasks.
+         */
+        if (likely(p->sighand))
+                account_group_exec_runtime(p, ns);
 
         if (hardirq_count() - hardirq_offset) {
                 rq->irq_pc += pc;
@@ -2469,6 +2530,15 @@ static void pc_user_time(struct rq *rq,
                 account_group_user_time(p, cputime_one_jiffy * jiffs);
         }
         p->sched_time += ns;
+        /*
+         * Do not update the cputimer if the task is already released by
+         * release_task().
+         *
+         * It would be preferable to defer the autoreap release_task()
+         * until after the last context switch, but that is harder to do.
+         */
+        if (likely(p->sighand))
+                account_group_exec_runtime(p, ns);
 
         if (this_cpu_ksoftirqd() == p) {
                 /*
@@ -2508,12 +2578,11 @@ static void pc_user_time(struct rq *rq,
  * This is called on clock ticks.
  * Bank in p->sched_time the ns elapsed since the last tick or switch.
  * CPU scheduler quota accounting is also performed here in microseconds.
- * It is inline because it is invoked inconditionally from only 1 location.
  */
-static inline void
+static void
 update_cpu_clock_tick(struct rq *rq, struct task_struct *p)
 {
-        long account_ns = rq->clock - rq->timekeep_clock;
+        long account_ns = rq->clock_task - rq->rq_last_ran;
         struct task_struct *idle = rq->idle;
         unsigned long account_pc;
 
@@ -2534,31 +2603,28 @@ update_cpu_clock_tick(struct rq *rq, str
         if (sched_clock_irqtime)
                 irqtime_account_hi_si();
 
-        if (p != idle)
-                account_group_exec_runtime(p, account_ns);
-
 ts_account:
         /* time_slice accounting is done in usecs to avoid overflow on 32bit */
         if (rq->rq_policy != SCHED_FIFO && p != idle) {
-                s64 time_diff = rq->clock - rq->rq_last_ran;
+                s64 time_diff = rq->clock - rq->timekeep_clock;
 
                 niffy_diff(&time_diff, 1);
                 rq->rq_time_slice -= NS_TO_US(time_diff);
         }
-        rq->rq_last_ran = rq->timekeep_clock = rq->clock;
+        rq->rq_last_ran = rq->clock_task;
+        rq->timekeep_clock = rq->clock;
 }
 
 /*
  * This is called on context switches.
- * Bank in p->sched_time the ns elapsed since the last tickk or switch.
+ * Bank in p->sched_time the ns elapsed since the last tick or switch.
  * CPU scheduler quota accounting is also performed here in microseconds.
- * It is inline because it is invoked inconditionally from only 1 location.
  */
-static inline void
+static void
 update_cpu_clock_switch(struct rq *rq, struct task_struct *p)
 {
-        long account_ns = rq->clock - rq->timekeep_clock;
+        long account_ns = rq->clock_task - rq->rq_last_ran;
         struct task_struct *idle = rq->idle;
         unsigned long account_pc;
 
@@ -2570,7 +2636,6 @@ update_cpu_clock_switch(struct rq *rq, s
         /* Accurate subtick timekeeping */
         if (p != idle) {
                 pc_user_time(rq, p, account_pc, account_ns);
-                account_group_exec_runtime(p, account_ns);
         } else
                 pc_idle_time(rq, idle, account_pc);
 
@@ -2578,13 +2643,14 @@ update_cpu_clock_switch(struct rq *rq, s
 ts_account:
         /* time_slice accounting is done in usecs to avoid overflow on 32bit */
         if (rq->rq_policy != SCHED_FIFO && p != idle) {
-                s64 time_diff = rq->clock - rq->rq_last_ran;
+                s64 time_diff = rq->clock - rq->timekeep_clock;
 
                 niffy_diff(&time_diff, 1);
                 rq->rq_time_slice -= NS_TO_US(time_diff);
         }
-        rq->rq_last_ran = rq->timekeep_clock = rq->clock;
+        rq->rq_last_ran = rq->clock_task;
+        rq->timekeep_clock = rq->clock;
 }
 
 /*
@@ -2622,22 +2688,57 @@ unsigned long long task_delta_exec(struc
 
 /*
  * Return accounted runtime for the task.
- * In case the task is currently running, return the runtime plus current's
- * pending runtime that have not been accounted yet.
+ * Return separately current's pending runtime that has not been
+ * accounted yet.
+ *
+ * grq lock already acquired.
  */
-unsigned long long task_sched_runtime(struct task_struct *p)
+static unsigned long long do_task_sched_runtime_nodelta(struct task_struct *p,
+                                                        unsigned long long *delta)
+{
+        struct rq *rq;
+        u64 ns;
+
+        rq = task_rq(p);
+        ns = p->sched_time;
+        *delta = do_task_delta_exec(p, rq);
+
+        return ns;
+}
+
+/*
+ * Return accounted runtime for the task.
+ * Return separately current's pending runtime that has not been
+ * accounted yet.
+ */
+unsigned long long task_sched_runtime_nodelta(struct task_struct *p, unsigned long long *delta)
 {
         unsigned long flags;
         struct rq *rq;
         u64 ns;
 
         rq = task_grq_lock(p, &flags);
-        ns = p->sched_time + do_task_delta_exec(p, rq);
+        ns = p->sched_time;
+        *delta = do_task_delta_exec(p, rq);
         task_grq_unlock(&flags);
 
         return ns;
 }
 
+/*
+ * Return accounted runtime for the task.
+ * In case the task is currently running, return the runtime plus current's
+ * pending runtime that have not been accounted yet.
+ */
+unsigned long long task_sched_runtime(struct task_struct *p)
+{
+        unsigned long long delta;
+        u64 ns = task_sched_runtime_nodelta(p, &delta);
+
+        ns += delta;
+        return ns;
+}
+
 /* Compatibility crap */
 void account_user_time(struct task_struct *p, cputime_t cputime,
                        cputime_t cputime_scaled)
@@ -3229,7 +3330,7 @@ static inline void set_rq_task(struct rq
 {
         rq->rq_time_slice = p->time_slice;
         rq->rq_deadline = p->deadline;
-        rq->rq_last_ran = p->last_ran = rq->clock;
+        rq->rq_last_ran = p->last_ran = rq->clock_task;
         rq->rq_policy = p->policy;
         rq->rq_prio = p->prio;
         if (p != rq->idle)
@@ -3355,7 +3456,7 @@ need_resched:
                 prev->time_slice = rq->rq_time_slice;
                 prev->deadline = rq->rq_deadline;
                 check_deadline(prev);
-                prev->last_ran = rq->clock;
+                prev->last_ran = rq->clock_task;
 
                 /* Task changed affinity off this CPU */
                 if (needs_other_cpu(prev, cpu)) {
@@ -5169,7 +5270,7 @@ void init_idle(struct task_struct *idle,
         unsigned long flags;
 
         time_grq_lock(rq, &flags);
-        idle->last_ran = rq->clock;
+        idle->last_ran = rq->clock_task;
         idle->state = TASK_RUNNING;
         /* Setting prio to illegal value shouldn't matter when never queued */
         idle->prio = PRIO_LIMIT;
@@ -7510,13 +7611,13 @@ void normalize_rt_tasks(void)
         struct rq *rq;
         int queued;
 
-        read_lock_irq(&tasklist_lock);
+        read_lock_irqsave(&tasklist_lock, flags);
 
         do_each_thread(g, p) {
                 if (!rt_task(p) && !iso_task(p))
                         continue;
 
-                raw_spin_lock_irqsave(&p->pi_lock, flags);
+                raw_spin_lock(&p->pi_lock);
                 rq = __task_grq_lock(p);
 
                 queued = task_queued(p);
@@ -7529,10 +7630,10 @@ void normalize_rt_tasks(void)
                 }
 
                 __task_grq_unlock();
-                raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+                raw_spin_unlock(&p->pi_lock);
         } while_each_thread(g, p);
 
-        read_unlock_irq(&tasklist_lock);
+        read_unlock_irqrestore(&tasklist_lock, flags);
 }
 #endif /* CONFIG_MAGIC_SYSRQ */
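
The fork-time slice handoff that the patch relocates into wake_up_new_task() is easiest to see in isolation. Below is a minimal user-space sketch of just that split-or-expire decision; RESCHED_US and FRESH_SLICE_US are placeholder values for this example only (the real constants, the grq locking, and the deadline/CPU-selection handling live in bfs.c and are not modelled here).

/*
 * Illustrative user-space model only -- not kernel code.
 * RESCHED_US and FRESH_SLICE_US are placeholders for this example.
 */
#include <stdbool.h>
#include <stdio.h>

#define RESCHED_US      128     /* placeholder reschedule threshold, in usecs */
#define FRESH_SLICE_US  6000    /* placeholder full timeslice, in usecs */

struct toy_task {
        long time_slice;        /* remaining slice in microseconds */
        bool need_resched;
};

/* Split the parent's remaining slice with its new child, or expire the parent. */
static void share_timeslice(struct toy_task *parent, struct toy_task *child)
{
        if (parent->time_slice >= RESCHED_US * 2) {
                /* Enough slice left: parent keeps half, child gets the other half. */
                parent->time_slice /= 2;
                child->time_slice = parent->time_slice;
        } else {
                /* Parent is nearly out: mark it for reschedule, child starts a fresh slice. */
                parent->time_slice = 0;
                parent->need_resched = true;
                child->time_slice = FRESH_SLICE_US;
        }
}

int main(void)
{
        struct toy_task parent = { .time_slice = 4000, .need_resched = false };
        struct toy_task child = { .time_slice = 0, .need_resched = false };

        share_timeslice(&parent, &child);
        printf("parent=%ldus child=%ldus resched=%d\n",
               parent.time_slice, child.time_slice, parent.need_resched);
        return 0;
}

With a parent slice of 4000us this prints "parent=2000us child=2000us resched=0"; a parent slice below 2 * RESCHED_US takes the expire path instead, mirroring the two branches the patch adds to wake_up_new_task().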