Implement logic to work with sync wakeups. Microoptimise around grq locking. Move the task accounting into the runqueue it's on while running, thereby allowing scheduler_tick to run lockless most of the time. Since all CPUs will be trying to grab grq_lock on every scheduler_tick at precisely the same moment, this is a common place for lock contention. Do the externally visible accounting lockless. Occasional wrong values aren't critical. Index: linux-2.6.31-bfs/kernel/sched_bfs.c =================================================================== --- linux-2.6.31-bfs.orig/kernel/sched_bfs.c 2009-09-11 10:18:23.217759418 +1000 +++ linux-2.6.31-bfs/kernel/sched_bfs.c 2009-09-11 14:54:11.475929427 +1000 @@ -78,12 +78,14 @@ #define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) #define rt_task(p) rt_prio((p)->prio) +#define rt_queue(rq) rt_prio((rq)->rq_prio) #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) #define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ (policy) == SCHED_RR) #define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) #define idleprio_task(p) unlikely((p)->policy == SCHED_IDLEPRIO) #define iso_task(p) unlikely((p)->policy == SCHED_ISO) +#define iso_queue(rq) unlikely((rq)->rq_policy == SCHED_ISO) #define ISO_PERIOD ((5 * HZ * num_online_cpus()) + 1) /* @@ -183,8 +185,13 @@ struct task_struct *curr, *idle; struct mm_struct *prev_mm; - unsigned long queued_deadline; - int queued_prio; + /* Stored data about rq->curr to work outside grq lock */ + unsigned long rq_deadline; + unsigned long rq_last_ran; + unsigned int rq_policy; + int rq_time_slice; + u64 rq_sched_time; + int rq_prio; atomic_t nr_iowait; @@ -328,11 +335,11 @@ spin_lock_irq(&grq.lock); } -static inline void time_lock_rq(struct rq *rq) +static inline void time_lock_grq(struct rq *rq) __acquires(grq.lock) { - grq_lock(); update_rq_clock(rq); + grq_lock(); } static inline void grq_unlock_irq(void) @@ -369,9 +376,10 @@ { struct rq *rq; - grq_lock_irqsave(flags); rq = 
task_rq(p); + local_irq_save(*flags); update_rq_clock(rq); + grq_lock(); return rq; } @@ -404,7 +412,7 @@ __acquires(grq.lock) { local_irq_save(*flags); - time_lock_rq(rq); + time_lock_grq(rq); } static inline struct rq *__task_grq_lock(struct task_struct *p) @@ -820,7 +828,7 @@ } /* - * Ok, time to look more closely! We need the rq + * Ok, time to look more closely! We need the grq * lock now, to be *sure*. If we're wrong, we'll * just go back and repeat. */ @@ -901,7 +909,7 @@ EXPORT_SYMBOL_GPL(kick_process); #endif -#define rq_idle(rq) ((rq)->queued_prio == PRIO_LIMIT) +#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) /* * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the @@ -916,7 +924,7 @@ if (p->prio < curr->prio) preempts = 1; else if (p->policy == SCHED_NORMAL && (p->prio == curr->prio && - p->deadline < rq->queued_deadline)) + p->deadline < rq->rq_deadline)) preempts = 1; return preempts; } @@ -938,14 +946,14 @@ } /* Use this_rq as baseline and fall back on */ - latest_deadline = this_rq->queued_deadline; - lowest_prio = this_rq->queued_prio; + latest_deadline = this_rq->rq_deadline; + lowest_prio = this_rq->rq_prio; lowest_prio_rq = this_rq; cpus_and(tmp, cpu_online_map, p->cpus_allowed); for_each_cpu_mask(cpu, tmp) { - unsigned long queued_deadline; - int queued_prio; + unsigned long rq_deadline; + int rq_prio; struct rq *rq; rq = cpu_rq(cpu); @@ -956,13 +964,13 @@ goto found_rq; } - queued_prio = rq->queued_prio; - queued_deadline = rq->queued_deadline; - if (queued_prio < lowest_prio || - (queued_prio == lowest_prio && - queued_deadline > latest_deadline)) { - lowest_prio = queued_prio; - latest_deadline = queued_deadline; + rq_prio = rq->rq_prio; + rq_deadline = rq->rq_deadline; + if (rq_prio < lowest_prio || + (rq_prio == lowest_prio && + rq_deadline > latest_deadline)) { + lowest_prio = rq_prio; + latest_deadline = rq_deadline; lowest_prio_rq = rq; } } @@ -996,11 +1004,23 @@ preempt_enable(); } +#ifdef CONFIG_SMP +static int 
no_idle_cpus(void) +{ + return (cpus_empty(grq.cpu_idle_map)); +} +#else +static int no_idle_cpus(void) +{ + return 1; +} +#endif + /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread * @state: the mask of task states that can be woken - * sync is ignored on bfs + * @sync: do a synchronous wakeup? * * Put it on the run-queue if it's not already there. The "current" * thread is always on the run-queue (except when the actual @@ -1010,7 +1030,7 @@ * * returns failure only if the task is already active. */ -static int try_to_wake_up(struct task_struct *p, unsigned int state) +static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) { unsigned long flags; int success = 0; @@ -1026,7 +1046,14 @@ goto out_running; activate_task(p, rq); - try_preempt(p, rq); + /* + * Sync wakeups (i.e. those types of wakeups where the waker + * has indicated that it will leave the CPU in short order) + * don't trigger a preemption if there are no idle cpus, + * instead waiting for current to deschedule. + */ + if (!sync || !no_idle_cpus()) + try_preempt(p, rq); success = 1; out_running: @@ -1050,13 +1077,13 @@ */ int wake_up_process(struct task_struct *p) { - return try_to_wake_up(p, TASK_ALL); + return try_to_wake_up(p, TASK_ALL, 0); } EXPORT_SYMBOL(wake_up_process); int wake_up_state(struct task_struct *p, unsigned int state) { - return try_to_wake_up(p, state); + return try_to_wake_up(p, state, 0); } /* @@ -1124,18 +1151,6 @@ put_cpu(); } -#ifdef CONFIG_SMP -static int no_idle_cpus(void) -{ - return (cpus_empty(grq.cpu_idle_map)); -} -#else -static int no_idle_cpus(void) -{ - return 1; -} -#endif - /* * wake_up_new_task - wake up a newly created task for the first time. 
* @@ -1184,14 +1199,14 @@ unsigned long flags; struct rq *rq; - parent = p->parent; - rq = task_grq_lock(parent, &flags); - if (p->first_time_slice > 0 && task_cpu(p) == task_cpu(parent)) { + if (p->first_time_slice) { + parent = p->parent; + rq = task_grq_lock(parent, &flags); parent->time_slice += p->time_slice; if (unlikely(parent->time_slice > timeslice())) parent->time_slice = timeslice(); + task_grq_unlock(&flags); } - task_grq_unlock(&flags); } #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -1402,11 +1417,17 @@ * * externally visible scheduler statistics: current number of runnable * threads, current number of uninterruptible-sleeping threads, total - * number of context switches performed since bootup. + * number of context switches performed since bootup. All are measured + * without grabbing the grq lock but the occasional inaccurate result + * doesn't matter so long as it's positive. */ unsigned long nr_running(void) { - return grq.nr_running; + long nr = grq.nr_running; + + if (unlikely(nr < 0)) + nr = 0; + return (unsigned long)nr; } unsigned long nr_uninterruptible(void) @@ -1420,7 +1441,12 @@ unsigned long long nr_context_switches(void) { - return grq.nr_switches; + long long ns = grq.nr_switches; + + /* This is of course impossible */ + if (unlikely(ns < 0)) + ns = 1; + return (unsigned long long)ns; } unsigned long nr_iowait(void) @@ -1478,9 +1504,7 @@ if (time_before(jiffies, upd)) return; - grq_lock(); active = nr_active(); - grq_unlock(); active = active > 0 ? active * FIXED_1 : 0; avenrun[0] = calc_load(avenrun[0], EXP_1, active); @@ -1504,9 +1528,9 @@ * to just returning jiffies, and for hardware that can't do tsc. 
*/ static void -update_cpu_clock(struct task_struct *p, struct rq *rq, int tick) +update_cpu_clock(struct rq *rq, int tick) { - long time_diff = rq->clock - p->last_ran; + long time_diff = rq->clock - rq->rq_last_ran; if (tick) { /* @@ -1528,10 +1552,10 @@ time_diff = JIFFIES_TO_NS(1) / 2; } /* time_slice accounting is done in usecs to avoid overflow on 32bit */ - if (p != rq->idle && p->policy != SCHED_FIFO) - p->time_slice -= time_diff / 1000; - p->sched_time += time_diff; - p->last_ran = rq->clock; + if (rq->rq_policy != SCHED_FIFO) + rq->rq_time_slice -= time_diff / 1000; + rq->rq_sched_time += time_diff; + rq->rq_last_ran = rq->clock; } /* @@ -1772,6 +1796,27 @@ #endif /* + * Functions to test for when SCHED_ISO tasks have used their allocated + * quota as real time scheduling and convert them back to SCHED_NORMAL. + * Where possible, the data is tested lockless, to avoid grabbing grq_lock + * because the occasional inaccurate result won't matter. However the + * data is only ever modified under lock. + */ +static void set_iso_refractory(void) +{ + grq_lock(); + grq.iso_refractory = 1; + grq_unlock(); +} + +static void clear_iso_refractory(void) +{ + grq_lock(); + grq.iso_refractory = 0; + grq_unlock(); +} + +/* * Test if SCHED_ISO tasks have run longer than their alloted period as RT * tasks and set the refractory flag if necessary. There is 10% hysteresis * for unsetting the flag. 
@@ -1780,82 +1825,96 @@ { if (likely(!grq.iso_refractory)) { if (grq.iso_ticks / ISO_PERIOD > sched_iso_cpu) - grq.iso_refractory = 1; + set_iso_refractory(); } else { if (grq.iso_ticks / ISO_PERIOD < (sched_iso_cpu * 90 / 100)) - grq.iso_refractory = 0; + clear_iso_refractory(); } return grq.iso_refractory; } +static void iso_tick(void) +{ + grq_lock(); + grq.iso_ticks += 100; + grq_unlock(); +} + /* No SCHED_ISO task was running so decrease rq->iso_ticks */ static inline void no_iso_tick(void) { - grq.iso_ticks = grq.iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; + if (grq.iso_ticks) { + grq_lock(); + grq.iso_ticks = grq.iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; + grq_unlock(); + } } -static int task_running_iso(struct task_struct *p) +static int rq_running_iso(struct rq *rq) { - return p->prio == ISO_PRIO; + return rq->rq_prio == ISO_PRIO; } /* This manages tasks that have run out of timeslice during a scheduler_tick */ -static void task_running_tick(struct rq *rq, struct task_struct *p) +static void task_running_tick(struct rq *rq) { + struct task_struct *p; + /* * If a SCHED_ISO task is running we increment the iso_ticks. In * order to prevent SCHED_ISO tasks from causing starvation in the * presence of true RT tasks we account those as iso_ticks as well. */ - if ((rt_task(p) || (iso_task(p) && !grq.iso_refractory))) { + if ((rt_queue(rq) || (iso_queue(rq) && !grq.iso_refractory))) { if (grq.iso_ticks <= (ISO_PERIOD * 100) - 100) - grq.iso_ticks += 100; + iso_tick(); } else no_iso_tick(); - if (iso_task(p)) { + if (iso_queue(rq)) { if (unlikely(test_ret_isorefractory(rq))) { - if (task_running_iso(p)) { + if (rq_running_iso(rq)) { /* * SCHED_ISO task is running as RT and limit * has been hit. Force it to reschedule as * SCHED_NORMAL by zeroing its time_slice */ - p->time_slice = 0; + rq->rq_time_slice = 0; } } } /* SCHED_FIFO tasks never run out of timeslice. 
*/ - if (p->time_slice > 0 || p->policy == SCHED_FIFO) + if (rq_idle(rq) || rq->rq_time_slice > 0 || rq->rq_policy == SCHED_FIFO) return; - /* p->time_slice <= 0 */ + /* p->time_slice <= 0. We only modify task_struct under grq lock */ + grq_lock(); + p = rq->curr; if (likely(task_running(p))) { requeue_task(p); set_tsk_need_resched(p); } + grq_unlock(); } /* * This function gets called by the timer code, with HZ frequency. - * We call it with interrupts disabled. + * We call it with interrupts disabled. The data modified is all + * local to struct rq so we don't need to grab grq lock. */ void scheduler_tick(void) { int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); - struct task_struct *p; sched_clock_tick(); - time_lock_rq(rq); - p = rq->curr; - update_cpu_clock(p, rq, 1); + update_rq_clock(rq); + update_cpu_clock(rq, 1); if (!rq_idle(rq)) - task_running_tick(rq, p); + task_running_tick(rq); else no_iso_tick(); - grq_unlock(); } notrace unsigned long get_parent_ip(unsigned long addr) @@ -2126,10 +2185,11 @@ dump_stack(); } - grq_lock_irq(); + local_irq_disable(); update_rq_clock(rq); now = rq->clock; + grq_lock(); clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { @@ -2140,14 +2200,20 @@ switch_count = &prev->nvcsw; } + update_cpu_clock(rq, 0); + /* rq_sched_time only caches the running total; store it back in the task */ + prev->sched_time = rq->rq_sched_time; if (prev != idle) { + /* Update all the information stored on struct rq */ + prev->time_slice = rq->rq_time_slice; + prev->last_ran = rq->rq_last_ran; + prev->deadline = rq->rq_deadline; check_deadline(prev); return_task(prev, deactivate); } if (likely(grq.nr_running)) { next = earliest_deadline_task(rq, idle); - rq->queued_deadline = next->deadline; } else { next = idle; schedstat_inc(rq, sched_goidle); @@ -2163,11 +2229,17 @@ prefetch(next); prefetch_stack(next); - update_cpu_clock(prev, rq, 0); prev->timestamp = prev->last_ran = now; - rq->queued_prio = next->prio; if (likely(prev != next)) { + /* Cache next's accounting state on the rq for the lockless tick path */ + rq->rq_time_slice = next->time_slice; + rq->rq_last_ran = next->last_ran; + rq->rq_policy = next->policy; + rq->rq_sched_time = next->sched_time; + 
rq->rq_deadline = next->deadline; + rq->rq_prio = next->prio; + sched_info_switch(prev, next); grq.nr_switches++; next->oncpu = 1; @@ -2271,7 +2343,7 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) { - return try_to_wake_up(curr->private, mode); + return try_to_wake_up(curr->private, mode, sync); } EXPORT_SYMBOL(default_wake_function); @@ -3662,7 +3734,12 @@ idle->timestamp = idle->last_ran = rq->clock; idle->state = TASK_RUNNING; /* Setting prio to illegal value shouldn't matter when never queued */ - idle->prio = PRIO_LIMIT; + idle->prio = rq->rq_prio = PRIO_LIMIT; + rq->rq_deadline = idle->deadline; + rq->rq_last_ran = idle->last_ran; + rq->rq_policy = idle->policy; + rq->rq_time_slice = idle->time_slice; + rq->rq_sched_time = idle->sched_time; idle->cpus_allowed = cpumask_of_cpu(cpu); set_task_cpu(idle, cpu); rq->curr = rq->idle = idle; @@ -5720,8 +5797,8 @@ struct rq *rq; rq = cpu_rq(i); - rq->queued_deadline = 0; - rq->queued_prio = 0; + rq->rq_deadline = 0; + rq->rq_prio = 0; rq->preempt_next = NULL; rq->cpu = i; #ifdef CONFIG_SMP