Index: linux-2.6.30-bfs/kernel/sched_bfs.c =================================================================== --- linux-2.6.30-bfs.orig/kernel/sched_bfs.c 2009-09-05 23:44:18.319263265 +1000 +++ linux-2.6.30-bfs/kernel/sched_bfs.c 2009-09-06 18:01:31.548282825 +1000 @@ -182,8 +182,6 @@ #endif #endif - /* Cached timestamp set by update_cpu_clock() */ - unsigned long long most_recent_timestamp; struct task_struct *preempt_next; struct task_struct *curr, *idle; struct mm_struct *prev_mm; @@ -303,14 +301,9 @@ # define finish_arch_switch(prev) do { } while (0) #endif -/* - * This will cost if schedstats is enabled since it's done under lock. - */ static inline void update_rq_clock(struct rq *rq) { -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) rq->clock = sched_clock_cpu(cpu_of(rq)); -#endif } static inline int task_running(struct task_struct *p) @@ -605,11 +598,12 @@ } /* - * activate_task - move a task to the runqueue. Enter with grq locked. + * activate_task - move a task to the runqueue. Enter with grq locked. The rq + * doesn't really matter but gives us the local clock. */ -static void activate_task(struct task_struct *p) +static void activate_task(struct task_struct *p, struct rq *rq) { - unsigned long long now = sched_clock(); + u64 now = rq->clock; /* * Sleep time is in units of nanosecs, so shift by 20 to get a @@ -982,7 +976,7 @@ if (queued_or_running(p)) goto out_running; - activate_task(p); + activate_task(p, rq); try_preempt(p, rq); success = 1; @@ -1065,7 +1059,6 @@ } else p->time_slice = 0; - p->timestamp = sched_clock(); local_irq_enable(); out: put_cpu(); @@ -1100,7 +1093,7 @@ BUG_ON(p->state != TASK_RUNNING); set_task_cpu(p, task_cpu(parent)); - activate_task(p); + activate_task(p, rq); trace_sched_wakeup_new(rq, p, 1); if (!(clone_flags & CLONE_VM) && rq->curr == parent && no_idle_cpus()) { @@ -1357,11 +1350,7 @@ unsigned long nr_uninterruptible(void) { - unsigned long nu = grq.nr_uninterruptible; - - if (unlikely (nu < 0)) - nu = 0; - return nu; + return grq.nr_uninterruptible; } unsigned long long nr_context_switches(void) @@ -1398,10 +1387,9 @@ * to just returning jiffies, and for hardware that can't do tsc. */ static void -update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now, - int tick) +update_cpu_clock(struct task_struct *p, struct rq *rq, int tick) { - long time_diff = now - p->last_ran; + long time_diff = rq->clock - p->last_ran; if (tick) { /* @@ -1426,7 +1414,7 @@ if (p != rq->idle && p->policy != SCHED_FIFO) p->time_slice -= time_diff / 1000; p->sched_time += time_diff; - p->last_ran = rq->most_recent_timestamp = now; + p->last_ran = rq->clock; } /* @@ -1441,7 +1429,7 @@ if (p == rq->curr) { update_rq_clock(rq); - ns = sched_clock() - p->last_ran; + ns = rq->clock - p->last_ran; if ((s64)ns < 0) ns = 0; } @@ -1481,22 +1469,6 @@ } /* - * Return current->sched_time plus any more ns on the sched_clock - * that have not yet been banked. - */ -unsigned long long current_sched_time(const struct task_struct *p) -{ - unsigned long long ns; - unsigned long flags; - - local_irq_save(flags); - ns = p->sched_time + sched_clock() - p->last_ran; - local_irq_restore(flags); - - return ns; -} - -/* * Return sum_exec_runtime for the thread group. * In case the task is currently running, return the sum plus current's * pending runtime that have not been accounted yet. @@ -1761,7 +1733,7 @@ sched_clock_tick(); time_lock_rq(rq); p = rq->curr; - update_cpu_clock(p, rq, sched_clock(), 1); + update_cpu_clock(p, rq, 1); if (!rq_idle(rq)) task_running_tick(rq, p); else @@ -2013,9 +1985,9 @@ { struct task_struct *prev, *next, *idle; int deactivate = 0, cpu; - unsigned long long now; long *switch_count; struct rq *rq; + u64 now; cpu = smp_processor_id(); rq = this_rq(); @@ -2037,10 +2009,10 @@ dump_stack(); } - now = sched_clock(); - grq_lock_irq(); update_rq_clock(rq); + now = rq->clock; + clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { @@ -2074,7 +2046,7 @@ prefetch(next); prefetch_stack(next); - update_cpu_clock(prev, rq, now, 0); + update_cpu_clock(prev, rq, 0); prev->timestamp = prev->last_ran = now; rq->queued_prio = next->prio; @@ -2726,10 +2698,10 @@ if (prio <= 0) goto out; - delta = (p->deadline - jiffies) * 200 / prio_ratios[39]; - if (delta > 80 || delta < 0) - delta = 0; - prio += delta; + /* 225 is a fudge to end up giving +80 for lowest possible prio */ + delta = (p->deadline - jiffies) * 225 / prio_ratios[39]; + if (delta > 0 && delta <= 80) + prio += delta; out: return prio; } @@ -3246,8 +3218,7 @@ * sys_sched_yield - yield the current processor to other threads. * * This function yields the current CPU to other tasks. It does this by - * refilling the timeslice, offsetting the deadline by the remaining - * timeslice and scheduling away. + * refilling the timeslice, resetting the deadline and scheduling away. */ SYSCALL_DEFINE0(sched_yield) { @@ -3256,6 +3227,7 @@ grq_lock_irq(); p = current; schedstat_inc(this_rq(), yld_count); + update_rq_clock(task_rq(p)); time_slice_expired(p); requeue_task(p); @@ -3551,10 +3523,12 @@ struct rq *rq = cpu_rq(cpu); unsigned long flags; - idle->timestamp = idle->last_ran = sched_clock(); + time_grq_lock(rq, &flags); + idle->timestamp = idle->last_ran = rq->clock; idle->state = TASK_RUNNING; + /* Setting prio to illegal value shouldn't matter when never queued */ + idle->prio = PRIO_LIMIT; idle->cpus_allowed = cpumask_of_cpu(cpu); - grq_lock_irqsave(&flags); set_task_cpu(idle, cpu); rq->curr = rq->idle = idle; idle->oncpu = 1; @@ -3775,11 +3749,10 @@ * Strictly not necessary since rest of the CPUs are stopped by now * and interrupts disabled on the current cpu. */ - grq_lock_irqsave(&flags); + time_grq_lock(rq, &flags); __setscheduler(idle, SCHED_FIFO, MAX_RT_PRIO - 1); - update_rq_clock(rq); activate_idle_task(idle); rq->preempt_next = idle; resched_task(rq->curr); @@ -4099,6 +4072,8 @@ deactivate_task(rq->idle); rq->idle->static_prio = MAX_PRIO; __setscheduler(rq->idle, SCHED_NORMAL, 0); + rq->idle->prio = PRIO_LIMIT; + update_rq_clock(rq); grq_unlock_irq(); cpuset_unlock(); break; @@ -5690,6 +5665,7 @@ spin_lock_irqsave(&p->pi_lock, flags); rq = __task_grq_lock(p); + update_rq_clock(rq); queued = task_queued(p); if (queued) @@ -5708,7 +5684,6 @@ read_unlock_irq(&tasklist_lock); } - #endif /* CONFIG_MAGIC_SYSRQ */ #ifdef CONFIG_IA64