Documentation updates. Remove the barriers before grabbing the lock; they were dubious and papering over real issues. Microoptimise grq lock grabbing. Check rq is valid in wait_task_inactive. Add cpu_locality to make the most of cache warmth as architectures differ, using a very simple modification to the virtual deadline to improve throughput on multicore, SMP and NUMA. Add the undocumented barrier in try_to_wake_up as is used in mainline. Minor cleanups for code clarity. Change documentation from American to English, but leave function names intact for compatibility. Update sched_exit to ensure we're using the real time_slice if p or parent are the curr task. Make sched_exit actually do something by using it in exit code! Remove locking around setting/unsetting iso_refractory since a race will only mistakenly make it positive which is safe and will be reset soon. reset_rq_task when changing prio or policy on curr task. Change init_idle to avoid section mismatch. Make sched_init __init again to avoid section mismatch. The oops on kvm was a qemu bug. Reallow the PREEMPT_VOLUNTARY config option. Fixing bugs makes it work again. Add bootup notice by popular request. --- include/linux/sched.h | 1 init/main.c | 2 kernel/Kconfig.preempt | 19 ++++ kernel/exit.c | 1 kernel/sched_bfs.c | 207 ++++++++++++++++++++++++++++++++++++------------- 5 files changed, 176 insertions(+), 54 deletions(-) Index: linux-2.6.31-test/kernel/sched_bfs.c =================================================================== --- linux-2.6.31-test.orig/kernel/sched_bfs.c 2009-10-01 12:24:56.538334919 +1000 +++ linux-2.6.31-test/kernel/sched_bfs.c 2009-10-01 12:30:25.539335484 +1000 @@ -147,13 +147,24 @@ int rr_interval __read_mostly = 6; */ int sched_iso_cpu __read_mostly = 70; +/* + * The relative length of deadline for each priority(nice) level. + */ int prio_ratios[PRIO_RANGE] __read_mostly; +/* + * The quota handed out to tasks of all priority levels when refilling their + * time_slice. 
+ */ static inline unsigned long timeslice(void) { return MS_TO_US(rr_interval); } +/* + * The global runqueue data that all CPUs work off. All data is protected + * by grq.lock. + */ struct global_rq { spinlock_t lock; unsigned long nr_running; @@ -169,11 +180,12 @@ struct global_rq { #endif }; +/* There can be only one */ static struct global_rq grq; /* * This is the main, per-CPU runqueue data structure. - * All this is protected by the global_rq lock. + * This data should only be modified by the local cpu. */ struct rq { #ifdef CONFIG_SMP @@ -204,6 +216,7 @@ struct rq { #ifdef CONFIG_SMP struct root_domain *rd; struct sched_domain *sd; + unsigned long *cpu_locality; /* CPU relative cache distance */ struct list_head migration_queue; #endif @@ -272,7 +285,6 @@ struct root_domain { * members (mimicking the global state we have today). */ static struct root_domain def_root_domain; - #endif static inline int cpu_of(struct rq *rq) @@ -308,6 +320,11 @@ static inline int cpu_of(struct rq *rq) # define finish_arch_switch(prev) do { } while (0) #endif +/* + * All common locking functions performed on grq.lock. rq->clock is local to + * the cpu accessing it so it can be modified just with interrupts disabled, + * but looking up task_rq must be done under grq.lock to be safe. 
+ */ inline void update_rq_clock(struct rq *rq) { rq->clock = sched_clock_cpu(cpu_of(rq)); @@ -321,7 +338,6 @@ static inline int task_running(struct ta static inline void grq_lock(void) __acquires(grq.lock) { - smp_mb(); spin_lock(&grq.lock); } @@ -334,15 +350,14 @@ static inline void grq_unlock(void) static inline void grq_lock_irq(void) __acquires(grq.lock) { - smp_mb(); spin_lock_irq(&grq.lock); } static inline void time_lock_grq(struct rq *rq) __acquires(grq.lock) { - grq_lock(); update_rq_clock(rq); + grq_lock(); } static inline void grq_unlock_irq(void) @@ -354,8 +369,7 @@ static inline void grq_unlock_irq(void) static inline void grq_lock_irqsave(unsigned long *flags) __acquires(grq.lock) { - local_irq_save(*flags); - grq_lock(); + spin_lock_irqsave(&grq.lock, *flags); } static inline void grq_unlock_irqrestore(unsigned long *flags) @@ -491,14 +505,11 @@ static inline void finish_lock_switch(st #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ /* - * A task that is queued will be on the grq run list. + * A task that is queued but not running will be on the grq run list. * A task that is not running or queued will not be on the grq run list. - * A task that is currently running will have ->oncpu set and be queued - * temporarily in its own rq queue. - * A task that is running and no longer queued will be seen only on - * context switch exit. + * A task that is currently running will have ->oncpu set but not on the + * grq run list. */ - static inline int task_queued(struct task_struct *p) { return (!list_empty(&p->run_list)); @@ -618,6 +629,19 @@ static inline void resched_suitable_idle wake_up_idle_cpu(first_cpu(tmp)); } +/* + * The cpu cache locality difference between CPUs is used to determine how far + * to offset the virtual deadline. "One" difference in locality means that one + * timeslice difference is allowed longer for the cpu local tasks. 
This is + * enough in the common case, when the number of tasks is up to twice the + * number of CPUs, to keep tasks within their shared cache CPUs only. See + * sched_init_smp for how locality is determined. + */ +static inline int +cache_distance(struct rq *task_rq, struct rq *rq, struct task_struct *p) +{ + return rq->cpu_locality[task_rq->cpu] * task_timeslice(p); +} #else /* CONFIG_SMP */ static inline void inc_qnr(void) { @@ -649,6 +673,12 @@ static inline int suitable_idle_cpus(str static inline void resched_suitable_idle(struct task_struct *p) { } + +static inline int +cache_distance(struct rq *task_rq, struct rq *rq, struct task_struct *p) +{ + return 0; +} #endif /* CONFIG_SMP */ /* @@ -904,9 +934,13 @@ unsigned long wait_task_inactive(struct * We do the initial early heuristics without holding * any task-queue locks at all. We'll only try to get * the runqueue lock when things look like they will - * work out! + * work out! In the unlikely event rq is NULL since + * we're lockless, look it up again. */ +retry_rq: rq = task_rq(p); + if (unlikely(!rq)) + goto retry_rq; /* * If the task is actively running on another CPU @@ -915,9 +949,9 @@ unsigned long wait_task_inactive(struct * * NOTE! Since we don't hold any locks, it's not * even sure that "rq" stays as the right runqueue! - * But we don't care, since this will - * return false if the runqueue has changed and p - * is actually now running somewhere else! + * But we don't care, since this will return false + * if the runqueue has changed and p is actually now + * running somewhere else! */ while (task_running(p) && p == rq->curr) { if (match_state && unlikely(p->state != match_state)) @@ -1012,19 +1046,22 @@ EXPORT_SYMBOL_GPL(kick_process); /* * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the - * basis of earlier deadlines. SCHED_BATCH and SCHED_IDLEPRIO don't preempt, - * they cooperatively multitask. + * basis of earlier deadlines. 
SCHED_BATCH, ISO and IDLEPRIO don't preempt + * between themselves, they cooperatively multitask. */ static inline int task_preempts_curr(struct task_struct *p, struct rq *rq) { - int preempts = 0; - if (p->prio < rq->rq_prio) - preempts = 1; - else if (p->policy == SCHED_NORMAL && (p->prio == rq->rq_prio && - time_before(p->deadline, rq->rq_deadline))) - preempts = 1; - return preempts; + return 1; + if (p->policy == SCHED_NORMAL) { + unsigned long p_deadline = p->deadline + + cache_distance(task_rq(p), rq, p); + + if ((p->prio == rq->rq_prio && + time_before(p_deadline, rq->rq_deadline))) + return 1; + } + return 0; } /* @@ -1119,6 +1156,9 @@ static int try_to_wake_up(struct task_st int success = 0; struct rq *rq; + /* This barrier is undocumented, probably for p->state? くそ */ + smp_wmb(); + /* * No need to do time_lock_grq as we only need to update the rq clock * if we activate the task @@ -1126,7 +1166,7 @@ static int try_to_wake_up(struct task_st rq = task_grq_lock(p, &flags); /* state is a volatile long, どうして、分からない */ - if (!(unsigned int)p->state & state) + if (!((unsigned int)p->state & state)) goto out_unlock; if (task_queued(p) || task_running(p)) @@ -1273,7 +1313,7 @@ void wake_up_new_task(struct task_struct /* * Potentially available exiting-child timeslices are * retrieved here - this way the parent does not get - * penalized for creating too many threads. + * penalised for creating too many threads. 
* * (this cannot be used to 'generate' timeslices * artificially, because any timeslice recovered here @@ -1286,11 +1326,22 @@ void sched_exit(struct task_struct *p) struct rq *rq; if (p->first_time_slice) { + int *par_tslice, *p_tslice; + parent = p->parent; rq = task_grq_lock(parent, &flags); - parent->time_slice += p->time_slice; - if (unlikely(parent->time_slice > timeslice())) - parent->time_slice = timeslice(); + par_tslice = &parent->time_slice; + p_tslice = &p->time_slice; + + /* The real time_slice of the "curr" task is on the rq var.*/ + if (p == rq->curr) + p_tslice = &rq->rq_time_slice; + else if (parent == task_rq(parent)->curr) + par_tslice = &rq->rq_time_slice; + + *par_tslice += *p_tslice; + if (unlikely(*par_tslice > timeslice())) + *par_tslice = timeslice(); task_grq_unlock(&flags); } } @@ -1940,20 +1991,17 @@ void account_idle_ticks(unsigned long ti * quota as real time scheduling and convert them back to SCHED_NORMAL. * Where possible, the data is tested lockless, to avoid grabbing grq_lock * because the occasional inaccurate result won't matter. However the - * data is only ever modified under lock. + * tick data is only ever modified under lock. iso_refractory is only simply + * set to 0 or 1 so it's not worth grabbing the lock yet again for that. */ static void set_iso_refractory(void) { - grq_lock(); grq.iso_refractory = 1; - grq_unlock(); } static void clear_iso_refractory(void) { - grq_lock(); grq.iso_refractory = 0; - grq_unlock(); } /* @@ -2133,7 +2181,7 @@ static inline int longest_deadline(void) } /* - * SCHED_IDLEPRIO tasks still have a deadline set, but offset by to nice +19. + * SCHED_IDLEPRIO tasks still have a deadline set, but offset by nice +19. * This allows nice levels to work between IDLEPRIO tasks and gives a * deadline longer than nice +19 for when they're scheduled as SCHED_NORMAL * tasks. @@ -2202,10 +2250,9 @@ retry: * there is no need to initialise earliest_deadline * before. Normalise all old deadlines to now. 
*/ - if (time_before(p->deadline, jiffies)) + dl = p->deadline + cache_distance(task_rq(p), rq, p); + if (time_before(dl, jiffies)) dl = jiffies; - else - dl = p->deadline; if (edt == idle || time_before(dl, earliest_deadline)) { @@ -2278,6 +2325,12 @@ static inline void set_rq_task(struct rq rq->rq_prio = p->prio; } +static void reset_rq_task(struct rq *rq, struct task_struct *p) +{ + rq->rq_policy = p->policy; + rq->rq_prio = p->prio; +} + /* * schedule() is the main scheduler function. */ @@ -2361,7 +2414,7 @@ need_resched_nonpreemptible: rq->curr = next; ++*switch_count; - context_switch(rq, prev, next); /* unlocks the rq */ + context_switch(rq, prev, next); /* unlocks the grq */ /* * the context switch might have flipped the stack from under * us, hence refresh the local variables. @@ -2522,7 +2575,7 @@ void __wake_up_locked_key(wait_queue_hea * * The sync wakeup differs that the waker knows that it will schedule * away soon, so while the target thread will be woken up, it will not - * be migrated to another CPU - ie. the two threads are 'synchronized' + * be migrated to another CPU - ie. the two threads are 'synchronised' * with each other. This can prevent needless bouncing between CPUs. * * On UP it can prevent extra preemption. @@ -2556,7 +2609,7 @@ EXPORT_SYMBOL_GPL(__wake_up_sync_key); * * The sync wakeup differs that the waker knows that it will schedule * away soon, so while the target thread will be woken up, it will not - * be migrated to another CPU - ie. the two threads are 'synchronized' + * be migrated to another CPU - ie. the two threads are 'synchronised' * with each other. This can prevent needless bouncing between CPUs. * * On UP it can prevent extra preemption. @@ -2921,8 +2974,10 @@ void set_user_nice(struct task_struct *p } /* Just resched the task, schedule() will know what to do. 
*/ - if (task_running(p)) + if (task_running(p)) { resched_task(p); + reset_rq_task(rq, p); + } out_unlock: task_grq_unlock(&flags); } @@ -3060,8 +3115,10 @@ __setscheduler(struct task_struct *p, st * Reschedule if running. schedule() will know if it can continue * running or not. */ - if (task_running(p)) + if (task_running(p)) { resched_task(p); + reset_rq_task(rq, p); + } } /* @@ -3824,7 +3881,7 @@ void show_state_filter(unsigned long sta * NOTE: this function does not set the idle thread's NEED_RESCHED * flag, to make booting more robust. */ -void __cpuinit init_idle(struct task_struct *idle, int cpu) +void init_idle(struct task_struct *idle, int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -3972,7 +4029,7 @@ void wake_up_idle_cpu(int cpu) * This is safe, as this function is called with the timer * wheel base lock of (cpu) held. When the CPU is on the way * to idle and has not yet set rq->curr to idle then it will - * be serialized on the timer wheel base lock and take the new + * be serialised on the timer wheel base lock and take the new * timer into account automatically. */ if (unlikely(rq->curr != idle)) @@ -4441,7 +4498,7 @@ early_initcall(migration_init); #endif /* - * sched_domains_mutex serializes calls to arch_init_sched_domains, + * sched_domains_mutex serialises calls to arch_init_sched_domains, * detach_destroy_domains and partition_sched_domains. */ static DEFINE_MUTEX(sched_domains_mutex); @@ -5077,7 +5134,7 @@ static void free_sched_groups(const stru #endif /* CONFIG_NUMA */ /* - * Initialize sched groups cpu_power. + * Initialise sched groups cpu_power. * * cpu_power indicates the capacity of sched group, which is used while * distributing the load between different sched groups in a sched domain. 
@@ -5129,7 +5186,7 @@ static void init_sched_groups_power(int } /* - * Initializers for schedule domains + * Initialisers for schedule domains * Non-inlined to reduce accumulated stack pressure in build_sched_domains() */ @@ -5536,7 +5593,7 @@ static struct sched_domain_attr *dattr_c static cpumask_var_t fallback_doms; /* - * arch_update_cpu_topology lets virtualized architectures update the + * arch_update_cpu_topology lets virtualised architectures update the * cpu core maps. It is supposed to return 1 if the topology changed * or 0 if it stayed the same. */ @@ -5827,6 +5884,9 @@ static int update_runtime(struct notifie void __init sched_init_smp(void) { + struct sched_domain *sd; + int cpu; + cpumask_var_t non_isolated_cpus; alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); @@ -5866,6 +5926,35 @@ void __init sched_init_smp(void) * fashion. */ rr_interval *= 1 + ilog2(num_online_cpus()); + + /* + * Set up the relative cache distance of each online cpu from each + * other in a simple array for quick lookup. Locality is determined + * by the closest sched_domain that CPUs are separated by. CPUs with + * shared cache in SMT and MC are treated as local. Separate CPUs + * (within the same package or physically) within the same node are + * treated as not local. CPUs not even in the same domain (different + * nodes) are treated as very distant. 
+ */ + for_each_online_cpu(cpu) { + for_each_domain(cpu, sd) { + struct rq *rq = cpu_rq(cpu); + unsigned long locality; + int other_cpu; + + if (sd->level <= SD_LV_MC) + locality = 0; + else if (sd->level <= SD_LV_NODE) + locality = 1; + else + continue; + + for_each_cpu_mask(other_cpu, *sched_domain_span(sd)) { + if (locality < rq->cpu_locality[other_cpu]) + rq->cpu_locality[other_cpu] = locality; + } + } + } } #else void __init sched_init_smp(void) @@ -5882,7 +5971,7 @@ int in_sched_functions(unsigned long add && addr < (unsigned long)__sched_text_end); } -void sched_init(void) +void __init sched_init(void) { int i; int highest_cpu = 0; @@ -5925,6 +6014,18 @@ void sched_init(void) #ifdef CONFIG_SMP nr_cpu_ids = highest_cpu + 1; + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + int j; + + rq->cpu_locality = kmalloc(nr_cpu_ids * sizeof(unsigned long), GFP_NOWAIT); + for_each_possible_cpu(j) { + if (i == j) + rq->cpu_locality[j] = 0; + else + rq->cpu_locality[j] = 4; + } + } #endif #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -6051,7 +6152,7 @@ struct task_struct *curr_task(int cpu) * Description: This function must only be used when non-maskable interrupts * are serviced on a separate stack. It allows the architecture to switch the * notion of the current task on a cpu in a non-blocking manner. This function - * must be called with all CPU's synchronized, and interrupts disabled, the + * must be called with all CPU's synchronised, and interrupts disabled, the * and caller must save the original value of the current task (see * curr_task() above) and restore that value before reenabling interrupts and * re-starting the system. 
Index: linux-2.6.31-test/kernel/Kconfig.preempt =================================================================== --- linux-2.6.31-test.orig/kernel/Kconfig.preempt 2009-10-01 12:24:56.552354234 +1000 +++ linux-2.6.31-test/kernel/Kconfig.preempt 2009-10-01 12:30:25.539335484 +1000 @@ -1,7 +1,7 @@ choice prompt "Preemption Model" - default PREEMPT + default PREEMPT_NONE config PREEMPT_NONE bool "No Forced Preemption (Server)" @@ -16,6 +16,23 @@ config PREEMPT_NONE raw processing power of the kernel, irrespective of scheduling latencies. +config PREEMPT_VOLUNTARY + bool "Voluntary Kernel Preemption (Desktop)" + help + This option reduces the latency of the kernel by adding more + "explicit preemption points" to the kernel code. These new + preemption points have been selected to reduce the maximum + latency of rescheduling, providing faster application reactions, + at the cost of slightly lower throughput. + + This allows reaction to interactive events by allowing a + low priority process to voluntarily preempt itself even if it + is in kernel mode executing a system call. This allows + applications to run more 'smoothly' even when the system is + under load. + + Select this if you are building a kernel for a desktop system. 
+ config PREEMPT bool "Preemptible Kernel (Low-Latency Desktop)" help Index: linux-2.6.31-test/init/main.c =================================================================== --- linux-2.6.31-test.orig/init/main.c 2009-09-10 11:45:38.000000000 +1000 +++ linux-2.6.31-test/init/main.c 2009-10-01 12:30:25.539335484 +1000 @@ -843,6 +843,8 @@ static noinline int init_post(void) system_state = SYSTEM_RUNNING; numa_default_policy(); + printk(KERN_INFO"Running BFS CPU scheduler v0.300 by Con Kolivas.\n"); + if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) printk(KERN_WARNING "Warning: unable to open an initial console.\n"); Index: linux-2.6.31-test/kernel/exit.c =================================================================== --- linux-2.6.31-test.orig/kernel/exit.c 2009-10-01 12:24:56.541364845 +1000 +++ linux-2.6.31-test/kernel/exit.c 2009-10-01 12:30:25.541335390 +1000 @@ -206,6 +206,7 @@ repeat: leader->exit_state = EXIT_DEAD; } + sched_exit(p); write_unlock_irq(&tasklist_lock); release_thread(p); call_rcu(&p->rcu, delayed_put_task_struct); Index: linux-2.6.31-test/include/linux/sched.h =================================================================== --- linux-2.6.31-test.orig/include/linux/sched.h 2009-10-01 12:24:56.486614782 +1000 +++ linux-2.6.31-test/include/linux/sched.h 2009-10-01 12:30:25.543335645 +1000 @@ -1795,6 +1795,7 @@ extern void wake_up_new_task(struct task static inline void kick_process(struct task_struct *tsk) { } #endif extern void sched_fork(struct task_struct *p, int clone_flags); +extern void sched_exit(struct task_struct *p); extern void sched_dead(struct task_struct *p); extern void proc_caches_init(void);