---
 kernel/sched/bfs.c |  275 ++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 188 insertions(+), 87 deletions(-)

Index: linux-3.9-bfs/kernel/sched/bfs.c
===================================================================
--- linux-3.9-bfs.orig/kernel/sched/bfs.c	2013-05-02 22:16:30.187763678 +1000
+++ linux-3.9-bfs/kernel/sched/bfs.c	2013-05-02 22:40:35.745691717 +1000
@@ -453,6 +453,9 @@ static inline void update_clocks(struct
  * Looking up task_rq must be done under grq.lock to be safe.
  */
 static void update_rq_clock_task(struct rq *rq, s64 delta);
+static unsigned long long do_task_sched_runtime_nodelta(struct task_struct *p,
+                                                        unsigned long long *delta);
+static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq);
 
 static inline void update_rq_clock(struct rq *rq)
 {
@@ -1028,7 +1031,7 @@ static void activate_task(struct task_st
         if (unlikely(prof_on == SLEEP_PROFILING)) {
                 if (p->state == TASK_UNINTERRUPTIBLE)
                         profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
-                                     (rq->clock - p->last_ran) >> 20);
+                                     (rq->clock_task - p->last_ran) >> 20);
         }
 
         p->prio = effective_prio(p);
@@ -1690,19 +1693,15 @@ static void time_slice_expired(struct ta
  */
 void sched_fork(struct task_struct *p)
 {
-        struct task_struct *curr;
-        int cpu = get_cpu();
-        struct rq *rq;
-
 #ifdef CONFIG_PREEMPT_NOTIFIERS
         INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
         /*
-         * We mark the process as running here. This guarantees that
-         * nobody will actually run it, and a signal or other external
-         * event cannot wake it up and insert it on the runqueue either.
+         * The process state is set to the same value as that of the process
+         * executing do_fork(), i.e. running. This guarantees that nobody will
+         * actually run it, and a signal or other external event cannot wake
+         * it up and insert it on the runqueue either.
         */
-        p->state = TASK_RUNNING;
 
         /* Should be reset in fork.c but done here for ease of bfs patching */
         p->utime =
@@ -1734,20 +1733,11 @@ void sched_fork(struct task_struct *p)
                 p->sched_reset_on_fork = 0;
         }
 
-        curr = current;
-        rq = task_grq_lock_irq(curr);
-        set_task_cpu(p, cpu);
-
-        /*
-         * Make sure we do not leak PI boosting priority to the child.
-         */
-        p->prio = curr->normal_prio;
-
         INIT_LIST_HEAD(&p->run_list);
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
         if (unlikely(sched_info_on()))
                 memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
-        p->on_cpu = false;
         clear_sticky(p);
 
@@ -1755,35 +1745,6 @@
         /* Want to start with kernel preemption disabled. */
         task_thread_info(p)->preempt_count = 1;
 #endif
-        if (unlikely(p->policy == SCHED_FIFO))
-                goto out_unlock;
-        /*
-         * Share the timeslice between parent and child, thus the
-         * total amount of pending timeslices in the system doesn't change,
-         * resulting in more scheduling fairness. If it's negative, it won't
-         * matter since that's the same as being 0. current's time_slice is
-         * actually in rq_time_slice when it's running, as is its last_ran
-         * value. rq->rq_deadline is only modified within schedule() so it
-         * is always equal to current->deadline.
-         */
-        if (likely(rq->rq_time_slice >= RESCHED_US * 2)) {
-                rq->rq_time_slice /= 2;
-                p->time_slice = rq->rq_time_slice;
-        } else {
-                /*
-                 * Forking task has run out of timeslice. Reschedule it and
-                 * start its child with a new time slice and deadline. The
-                 * child will end up running first because its deadline will
-                 * be slightly earlier.
-                 */
-                rq->rq_time_slice = 0;
-                set_tsk_need_resched(curr);
-                time_slice_expired(p);
-        }
-        p->last_ran = rq->rq_last_ran;
-out_unlock:
-        task_grq_unlock_irq();
-        put_cpu();
 }
 
 /*
@@ -1799,22 +1760,68 @@ void wake_up_new_task(struct task_struct
         unsigned long flags;
         struct rq *rq;
 
-        p->state = TASK_RUNNING;
         parent = p->parent;
         rq = task_grq_lock(p, &flags);
-        /* Unnecessary but small chance that the parent changed CPU */
-        set_task_cpu(p, task_cpu(parent));
+
+        /*
+         * Reinit new task deadline as its creator deadline could have changed
+         * since call to dup_task_struct().
+         */
+        p->deadline = rq->rq_deadline;
+
+        /*
+         * If the task is a new process, current and parent are the same. If
+         * the task is a new thread in the thread group, it will have much more
+         * in common with current than with the parent.
+         */
+        set_task_cpu(p, task_cpu(rq->curr));
+
+        /*
+         * Make sure we do not leak PI boosting priority to the child.
+         */
+        p->prio = rq->curr->normal_prio;
+
         activate_task(p, rq);
         trace_sched_wakeup_new(p, 1);
-        if (rq->curr == parent && !suitable_idle_cpus(p)) {
-                /*
-                 * The VM isn't cloned, so we're in a good position to
-                 * do child-runs-first in anticipation of an exec. This
-                 * usually avoids a lot of COW overhead.
-                 */
-                resched_task(parent);
-        } else
-                try_preempt(p, rq);
+        if (unlikely(p->policy == SCHED_FIFO))
+                goto after_ts_init;
+
+        /*
+         * Share the timeslice between parent and child, thus the
+         * total amount of pending timeslices in the system doesn't change,
+         * resulting in more scheduling fairness. If it's negative, it won't
+         * matter since that's the same as being 0. current's time_slice is
+         * actually in rq_time_slice when it's running, as is its last_ran
+         * value. rq->rq_deadline is only modified within schedule() so it
+         * is always equal to current->deadline.
+         */
+        p->last_ran = rq->rq_last_ran;
+        if (likely(rq->rq_time_slice >= RESCHED_US * 2)) {
+                rq->rq_time_slice /= 2;
+                p->time_slice = rq->rq_time_slice;
+after_ts_init:
+                if (rq->curr == parent && !suitable_idle_cpus(p)) {
+                        /*
+                         * The VM isn't cloned, so we're in a good position to
+                         * do child-runs-first in anticipation of an exec. This
+                         * usually avoids a lot of COW overhead.
+                         */
+                        set_tsk_need_resched(parent);
+                } else
+                        try_preempt(p, rq);
+        } else {
+                if (rq->curr == parent) {
+                        /*
+                         * Forking task has run out of timeslice. Reschedule it and
+                         * start its child with a new time slice and deadline. The
+                         * child will end up running first because its deadline will
+                         * be slightly earlier.
+                         */
+                        rq->rq_time_slice = 0;
+                        set_tsk_need_resched(parent);
+                }
+                time_slice_expired(p);
+        }
         task_grq_unlock(&flags);
 }
 
@@ -2363,10 +2370,14 @@ static __always_inline bool steal_accoun
  * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
  * tasks (sum on group iteration) belonging to @tsk's group.
  */
-void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
+void thread_group_cputime_nodelta(struct task_struct *tsk, struct task_cputime *times,
+                                  unsigned long long *delta)
 {
         struct signal_struct *sig = tsk->signal;
         struct task_struct *t;
+        unsigned long long d = 0;
+        unsigned long long td;
+        unsigned long flags;
 
         times->utime = sig->utime;
         times->stime = sig->stime;
@@ -2378,13 +2389,53 @@ void thread_group_cputime(struct task_st
                 goto out;
 
         t = tsk;
+        grq_lock_irqsave(&flags);
         do {
                 times->utime += t->utime;
                 times->stime += t->stime;
-                times->sum_exec_runtime += task_sched_runtime(t);
+                times->sum_exec_runtime += do_task_sched_runtime_nodelta(t, &td);
+                d += td;
+        } while_each_thread(tsk, t);
+        grq_unlock_irqrestore(&flags);
+out:
+        rcu_read_unlock();
+
+        if (delta)
+                *delta = d;
+}
+
+/*
+ * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
+ * tasks (sum on group iteration) belonging to @tsk's group.
+ */
+void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
+{
+        unsigned long long d;
+        thread_group_cputime_nodelta(tsk, times, &d);
+        times->sum_exec_runtime += d;
+}
+
+unsigned long long group_delta_exec(struct task_struct *tsk)
+{
+        unsigned long long ns = 0;
+        struct task_struct *t;
+        unsigned long flags;
+
+        rcu_read_lock();
+        /* make sure we can trust tsk->thread_group list */
+        if (!likely(pid_alive(tsk)))
+                goto out;
+
+        t = tsk;
+        grq_lock_irqsave(&flags);
+        do {
+                ns += do_task_delta_exec(t, task_rq(t));
         } while_each_thread(tsk, t);
+        grq_unlock_irqrestore(&flags);
 out:
         rcu_read_unlock();
+
+        return ns;
 }
 
 /*
@@ -2430,6 +2481,16 @@ pc_system_time(struct rq *rq, struct tas
                 account_group_system_time(p, cputime_one_jiffy * jiffs);
         }
         p->sched_time += ns;
+        /*
+         * Do not update the cputimer if the task is already released by
+         * release_task().
+         *
+         * This could be executed if a tick happens when a task is inside
+         * do_exit() between the call to release_task() and its final
+         * schedule() call for autoreaping tasks.
+         */
+        if (likely(p->sighand))
+                account_group_exec_runtime(p, ns);
 
         if (hardirq_count() - hardirq_offset) {
                 rq->irq_pc += pc;
@@ -2469,6 +2530,15 @@ static void pc_user_time(struct rq *rq,
                 account_group_user_time(p, cputime_one_jiffy * jiffs);
         }
         p->sched_time += ns;
+        /*
+         * Do not update the cputimer if the task is already released by
+         * release_task().
+         *
+         * It would be preferable to defer the autoreap release_task()
+         * until after the last context switch, but that is harder to do.
+         */
+        if (likely(p->sighand))
+                account_group_exec_runtime(p, ns);
 
         if (this_cpu_ksoftirqd() == p) {
                 /*
@@ -2508,12 +2578,11 @@ static void pc_user_time(struct rq *rq,
  * This is called on clock ticks.
  * Bank in p->sched_time the ns elapsed since the last tick or switch.
  * CPU scheduler quota accounting is also performed here in microseconds.
- * It is inline because it is invoked inconditionally from only 1 location.
  */
-static inline void
+static void
 update_cpu_clock_tick(struct rq *rq, struct task_struct *p)
 {
-        long account_ns = rq->clock - rq->timekeep_clock;
+        long account_ns = rq->clock_task - rq->rq_last_ran;
         struct task_struct *idle = rq->idle;
         unsigned long account_pc;
 
@@ -2534,31 +2603,28 @@ update_cpu_clock_tick(struct rq *rq, str
         if (sched_clock_irqtime)
                 irqtime_account_hi_si();
 
-        if (p != idle)
-                account_group_exec_runtime(p, account_ns);
-
 ts_account:
         /* time_slice accounting is done in usecs to avoid overflow on 32bit */
         if (rq->rq_policy != SCHED_FIFO && p != idle) {
-                s64 time_diff = rq->clock - rq->rq_last_ran;
+                s64 time_diff = rq->clock - rq->timekeep_clock;
 
                 niffy_diff(&time_diff, 1);
                 rq->rq_time_slice -= NS_TO_US(time_diff);
         }
-        rq->rq_last_ran = rq->timekeep_clock = rq->clock;
+        rq->rq_last_ran = rq->clock_task;
+        rq->timekeep_clock = rq->clock;
 }
 
 /*
  * This is called on context switches.
- * Bank in p->sched_time the ns elapsed since the last tickk or switch.
+ * Bank in p->sched_time the ns elapsed since the last tick or switch.
  * CPU scheduler quota accounting is also performed here in microseconds.
- * It is inline because it is invoked inconditionally from only 1 location.
  */
-static inline void
+static void
 update_cpu_clock_switch(struct rq *rq, struct task_struct *p)
 {
-        long account_ns = rq->clock - rq->timekeep_clock;
+        long account_ns = rq->clock_task - rq->rq_last_ran;
         struct task_struct *idle = rq->idle;
         unsigned long account_pc;
 
@@ -2570,7 +2636,6 @@ update_cpu_clock_switch(struct rq *rq, s
         /* Accurate subtick timekeeping */
         if (p != idle) {
                 pc_user_time(rq, p, account_pc, account_ns);
-                account_group_exec_runtime(p, account_ns);
         } else
                 pc_idle_time(rq, idle, account_pc);
 
@@ -2578,13 +2643,14 @@ update_cpu_clock_switch(struct rq *rq, s
 ts_account:
         /* time_slice accounting is done in usecs to avoid overflow on 32bit */
         if (rq->rq_policy != SCHED_FIFO && p != idle) {
-                s64 time_diff = rq->clock - rq->rq_last_ran;
+                s64 time_diff = rq->clock - rq->timekeep_clock;
 
                 niffy_diff(&time_diff, 1);
                 rq->rq_time_slice -= NS_TO_US(time_diff);
         }
-        rq->rq_last_ran = rq->timekeep_clock = rq->clock;
+        rq->rq_last_ran = rq->clock_task;
+        rq->timekeep_clock = rq->clock;
 }
 
 /*
@@ -2622,22 +2688,57 @@ unsigned long long task_delta_exec(struc
 
 /*
  * Return accounted runtime for the task.
- * In case the task is currently running, return the runtime plus current's
- * pending runtime that have not been accounted yet.
+ * Return separately current's pending runtime that has not been
+ * accounted yet.
+ *
+ * grq lock already acquired.
  */
-unsigned long long task_sched_runtime(struct task_struct *p)
+static unsigned long long do_task_sched_runtime_nodelta(struct task_struct *p,
+                                                        unsigned long long *delta)
+{
+        struct rq *rq;
+        u64 ns;
+
+        rq = task_rq(p);
+        ns = p->sched_time;
+        *delta = do_task_delta_exec(p, rq);
+
+        return ns;
+}
+
+/*
+ * Return accounted runtime for the task.
+ * Return separately current's pending runtime that has not been
+ * accounted yet.
+ */
+unsigned long long task_sched_runtime_nodelta(struct task_struct *p, unsigned long long *delta)
 {
         unsigned long flags;
         struct rq *rq;
         u64 ns;
 
         rq = task_grq_lock(p, &flags);
-        ns = p->sched_time + do_task_delta_exec(p, rq);
+        ns = p->sched_time;
+        *delta = do_task_delta_exec(p, rq);
         task_grq_unlock(&flags);
 
         return ns;
 }
 
+/*
+ * Return accounted runtime for the task.
+ * In case the task is currently running, return the runtime plus current's
+ * pending runtime that have not been accounted yet.
+ */
+unsigned long long task_sched_runtime(struct task_struct *p)
+{
+        unsigned long long delta;
+        u64 ns = task_sched_runtime_nodelta(p, &delta);
+
+        ns += delta;
+        return ns;
+}
+
 /* Compatibility crap */
 void account_user_time(struct task_struct *p, cputime_t cputime,
                        cputime_t cputime_scaled)
@@ -3229,7 +3330,7 @@ static inline void set_rq_task(struct rq
 {
         rq->rq_time_slice = p->time_slice;
         rq->rq_deadline = p->deadline;
-        rq->rq_last_ran = p->last_ran = rq->clock;
+        rq->rq_last_ran = p->last_ran = rq->clock_task;
         rq->rq_policy = p->policy;
         rq->rq_prio = p->prio;
         if (p != rq->idle)
@@ -3355,7 +3456,7 @@ need_resched:
                 prev->time_slice = rq->rq_time_slice;
                 prev->deadline = rq->rq_deadline;
                 check_deadline(prev);
-                prev->last_ran = rq->clock;
+                prev->last_ran = rq->clock_task;
 
                 /* Task changed affinity off this CPU */
                 if (needs_other_cpu(prev, cpu)) {
@@ -5169,7 +5270,7 @@ void init_idle(struct task_struct *idle,
         unsigned long flags;
 
         time_grq_lock(rq, &flags);
-        idle->last_ran = rq->clock;
+        idle->last_ran = rq->clock_task;
         idle->state = TASK_RUNNING;
         /* Setting prio to illegal value shouldn't matter when never queued */
         idle->prio = PRIO_LIMIT;
@@ -7510,13 +7611,13 @@ void normalize_rt_tasks(void)
         struct rq *rq;
         int queued;
 
-        read_lock_irq(&tasklist_lock);
+        read_lock_irqsave(&tasklist_lock, flags);
 
         do_each_thread(g, p) {
                 if (!rt_task(p) && !iso_task(p))
                         continue;
 
-                raw_spin_lock_irqsave(&p->pi_lock, flags);
+                raw_spin_lock(&p->pi_lock);
                 rq = __task_grq_lock(p);
 
                 queued = task_queued(p);
@@ -7529,10 +7630,10 @@ void normalize_rt_tasks(void)
                 }
 
                 __task_grq_unlock();
-                raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+                raw_spin_unlock(&p->pi_lock);
         } while_each_thread(g, p);
 
-        read_unlock_irq(&tasklist_lock);
+        read_unlock_irqrestore(&tasklist_lock, flags);
 }
 #endif /* CONFIG_MAGIC_SYSRQ */
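
The fork-time slice handoff that the patch relocates into wake_up_new_task() is easiest to see in isolation. Below is a minimal user-space sketch of just that split-or-expire decision; RESCHED_US and FRESH_SLICE_US are placeholder values for this example only (the real constants, the grq locking, and the deadline/CPU-selection handling live in bfs.c and are not modelled here).

/*
 * Illustrative user-space model only -- not kernel code.
 * RESCHED_US and FRESH_SLICE_US are placeholders for this example.
 */
#include <stdbool.h>
#include <stdio.h>

#define RESCHED_US      128     /* placeholder reschedule threshold, in usecs */
#define FRESH_SLICE_US  6000    /* placeholder full timeslice, in usecs */

struct toy_task {
        long time_slice;        /* remaining slice in microseconds */
        bool need_resched;
};

/* Split the parent's remaining slice with its new child, or expire the parent. */
static void share_timeslice(struct toy_task *parent, struct toy_task *child)
{
        if (parent->time_slice >= RESCHED_US * 2) {
                /* Enough slice left: parent keeps half, child gets the other half. */
                parent->time_slice /= 2;
                child->time_slice = parent->time_slice;
        } else {
                /* Parent is nearly out: mark it for reschedule, child starts a fresh slice. */
                parent->time_slice = 0;
                parent->need_resched = true;
                child->time_slice = FRESH_SLICE_US;
        }
}

int main(void)
{
        struct toy_task parent = { .time_slice = 4000, .need_resched = false };
        struct toy_task child = { .time_slice = 0, .need_resched = false };

        share_timeslice(&parent, &child);
        printf("parent=%ldus child=%ldus resched=%d\n",
               parent.time_slice, child.time_slice, parent.need_resched);
        return 0;
}

With a parent slice of 4000us this prints "parent=2000us child=2000us resched=0"; a parent slice below 2 * RESCHED_US takes the expire path instead, mirroring the two branches the patch adds to wake_up_new_task().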