Documentation updates. Remove the barriers before grabbing the lock; they were dubious and papering over real issues. Microoptimise grq lock grabbing. Check rq is valid in wait_task_inactive. Plus other changes not yet documented in detail, all pure win: a per-rq cpu_locality table whose cache_distance() offset is added to deadlines, an operator precedence fix for the p->state test in try_to_wake_up, sched_exit using rq->rq_time_slice for the running task, modulo instead of subtraction for the stime_pc/utime_pc carry, lockless iso_refractory updates, reset_rq_task after nice/policy changes, a PREEMPT_VOLUNTARY option with PREEMPT_NONE as the default, and a version banner at boot. Index: linux-2.6.31-bfs/kernel/sched_bfs.c =================================================================== --- linux-2.6.31-bfs.orig/kernel/sched_bfs.c 2009-09-30 21:33:27.488879862 +1000 +++ linux-2.6.31-bfs/kernel/sched_bfs.c 2009-10-01 01:22:16.886111443 +1000 @@ -147,13 +147,24 @@ int rr_interval __read_mostly = 6; */ int sched_iso_cpu __read_mostly = 70; +/* + * The relative length of deadline for each priority(nice) level. + */ int prio_ratios[PRIO_RANGE] __read_mostly; +/* + * The quota handed out to tasks of all priority levels when refilling their + * time_slice. + */ static inline unsigned long timeslice(void) { return MS_TO_US(rr_interval); } +/* + * The global runqueue data that all CPUs work off. All data is protected + * by grq.lock. + */ struct global_rq { spinlock_t lock; unsigned long nr_running; @@ -169,11 +180,12 @@ struct global_rq { #endif }; +/* There can be only one */ static struct global_rq grq; /* * This is the main, per-CPU runqueue data structure. - * All this is protected by the global_rq lock. + * This data should only be modified by the local cpu. */ struct rq { #ifdef CONFIG_SMP @@ -204,6 +216,7 @@ struct rq { #ifdef CONFIG_SMP struct root_domain *rd; struct sched_domain *sd; + unsigned long *cpu_locality; struct list_head migration_queue; #endif @@ -272,7 +285,6 @@ struct root_domain { * members (mimicking the global state we have today). */ static struct root_domain def_root_domain; - #endif static inline int cpu_of(struct rq *rq) @@ -308,6 +320,11 @@ static inline int cpu_of(struct rq *rq) # define finish_arch_switch(prev) do { } while (0) #endif +/* + * All common locking functions performed on grq.lock. 
rq->clock is local to + * the cpu accessing it so it can be modified just with interrupts disabled, + * but looking up task_rq must be done under grq.lock to be safe. + */ inline void update_rq_clock(struct rq *rq) { rq->clock = sched_clock_cpu(cpu_of(rq)); @@ -321,7 +338,6 @@ static inline int task_running(struct ta static inline void grq_lock(void) __acquires(grq.lock) { - smp_mb(); spin_lock(&grq.lock); } @@ -334,15 +350,14 @@ static inline void grq_unlock(void) static inline void grq_lock_irq(void) __acquires(grq.lock) { - smp_mb(); spin_lock_irq(&grq.lock); } static inline void time_lock_grq(struct rq *rq) __acquires(grq.lock) { - grq_lock(); update_rq_clock(rq); + grq_lock(); } static inline void grq_unlock_irq(void) @@ -354,8 +369,7 @@ static inline void grq_unlock_irq(void) static inline void grq_lock_irqsave(unsigned long *flags) __acquires(grq.lock) { - local_irq_save(*flags); - grq_lock(); + spin_lock_irqsave(&grq.lock, *flags); } static inline void grq_unlock_irqrestore(unsigned long *flags) @@ -491,14 +505,11 @@ static inline void finish_lock_switch(st #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ /* - * A task that is queued will be on the grq run list. + * A task that is queued but not running will be on the grq run list. * A task that is not running or queued will not be on the grq run list. - * A task that is currently running will have ->oncpu set and be queued - * temporarily in its own rq queue. - * A task that is running and no longer queued will be seen only on - * context switch exit. + * A task that is currently running will have ->oncpu set but not on the + * grq run list. 
*/ - static inline int task_queued(struct task_struct *p) { return (!list_empty(&p->run_list)); @@ -618,6 +629,11 @@ static inline void resched_suitable_idle wake_up_idle_cpu(first_cpu(tmp)); } +static inline int +cache_distance(struct rq *task_rq, struct rq *rq, struct task_struct *p) +{ + return rq->cpu_locality[task_rq->cpu] * task_timeslice(p); +} #else /* CONFIG_SMP */ static inline void inc_qnr(void) { @@ -649,6 +665,12 @@ static inline int suitable_idle_cpus(str static inline void resched_suitable_idle(struct task_struct *p) { } + +static inline int +cache_distance(struct rq *task_rq, struct rq *rq, struct task_struct *p) +{ + return 0; +} #endif /* CONFIG_SMP */ /* @@ -904,9 +926,13 @@ unsigned long wait_task_inactive(struct * We do the initial early heuristics without holding * any task-queue locks at all. We'll only try to get * the runqueue lock when things look like they will - * work out! + * work out! In the unlikely event rq is NULL because + * we looked it up lockless, look it up again. */ +retry_rq: rq = task_rq(p); + if (unlikely(!rq)) + goto retry_rq; /* * If the task is actively running on another CPU @@ -915,9 +941,9 @@ unsigned long wait_task_inactive(struct * * NOTE! Since we don't hold any locks, it's not * even sure that "rq" stays as the right runqueue! - * But we don't care, since this will - * return false if the runqueue has changed and p - * is actually now running somewhere else! + * But we don't care, since this will return false + * if the runqueue has changed and p is actually now + * running somewhere else! */ while (task_running(p) && p == rq->curr) { if (match_state && unlikely(p->state != match_state)) @@ -1012,19 +1038,22 @@ EXPORT_SYMBOL_GPL(kick_process); /* * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the - * basis of earlier deadlines. SCHED_BATCH and SCHED_IDLEPRIO don't preempt, - * they cooperatively multitask. + * basis of earlier deadlines. 
SCHED_BATCH, ISO and IDLEPRIO don't preempt + * between themselves, they cooperatively multitask. */ static inline int task_preempts_curr(struct task_struct *p, struct rq *rq) { - int preempts = 0; - if (p->prio < rq->rq_prio) - preempts = 1; - else if (p->policy == SCHED_NORMAL && (p->prio == rq->rq_prio && - time_before(p->deadline, rq->rq_deadline))) - preempts = 1; - return preempts; + return 1; + if (p->policy == SCHED_NORMAL) { + unsigned long p_deadline = p->deadline + + cache_distance(task_rq(p), rq, p); + + if ((p->prio == rq->rq_prio && + time_before(p_deadline, rq->rq_deadline))) + return 1; + } + return 0; } /* @@ -1119,6 +1148,9 @@ static int try_to_wake_up(struct task_st int success = 0; struct rq *rq; + /* This barrier is undocumented, probably for p->state? くそ */ + smp_wmb(); + /* * No need to do time_lock_grq as we only need to update the rq clock * if we activate the task @@ -1126,7 +1158,7 @@ static int try_to_wake_up(struct task_st rq = task_grq_lock(p, &flags); /* state is a volatile long, どうして、分からない */ - if (!(unsigned int)p->state & state) + if (!((unsigned int)p->state & state)) goto out_unlock; if (task_queued(p) || task_running(p)) @@ -1273,7 +1305,7 @@ void wake_up_new_task(struct task_struct /* * Potentially available exiting-child timeslices are * retrieved here - this way the parent does not get - * penalized for creating too many threads. + * penalised for creating too many threads. 
* * (this cannot be used to 'generate' timeslices * artificially, because any timeslice recovered here @@ -1286,11 +1318,19 @@ void sched_exit(struct task_struct *p) struct rq *rq; if (p->first_time_slice) { + int *par_tslice, *p_tslice; + parent = p->parent; rq = task_grq_lock(parent, &flags); - parent->time_slice += p->time_slice; - if (unlikely(parent->time_slice > timeslice())) - parent->time_slice = timeslice(); + par_tslice = &parent->time_slice; + p_tslice = &p->time_slice; + if (p == rq->curr) + p_tslice = &rq->rq_time_slice; + else if (parent == rq->curr) + par_tslice = &rq->rq_time_slice; + *par_tslice += *p_tslice; + if (unlikely(*par_tslice > timeslice())) + *par_tslice = timeslice(); task_grq_unlock(&flags); } } @@ -1637,7 +1677,7 @@ pc_system_time(struct rq *rq, struct tas p->stime_pc += pc; if (p->stime_pc >= 100) { - p->stime_pc -= 100; + p->stime_pc %= 100; p->stime = cputime_add(p->stime, one_jiffy); p->stimescaled = cputime_add(p->stimescaled, one_jiffy_scaled); account_group_system_time(p, one_jiffy); @@ -1672,7 +1712,7 @@ static void pc_user_time(struct rq *rq, p->utime_pc += pc; if (p->utime_pc >= 100) { - p->utime_pc -= 100; + p->utime_pc %= 100; p->utime = cputime_add(p->utime, one_jiffy); p->utimescaled = cputime_add(p->utimescaled, one_jiffy_scaled); account_group_user_time(p, one_jiffy); @@ -1940,20 +1980,17 @@ void account_idle_ticks(unsigned long ti * quota as real time scheduling and convert them back to SCHED_NORMAL. * Where possible, the data is tested lockless, to avoid grabbing grq_lock * because the occasional inaccurate result won't matter. However the - * data is only ever modified under lock. + * tick data is only ever modified under lock. iso_refractory is only simply + * set to 0 or 1 so it's not worth grabbing the lock yet again for that. 
*/ static void set_iso_refractory(void) { - grq_lock(); grq.iso_refractory = 1; - grq_unlock(); } static void clear_iso_refractory(void) { - grq_lock(); grq.iso_refractory = 0; - grq_unlock(); } /* @@ -2133,7 +2170,7 @@ static inline int longest_deadline(void) } /* - * SCHED_IDLEPRIO tasks still have a deadline set, but offset by to nice +19. + * SCHED_IDLEPRIO tasks still have a deadline set, but offset by nice +19. * This allows nice levels to work between IDLEPRIO tasks and gives a * deadline longer than nice +19 for when they're scheduled as SCHED_NORMAL * tasks. @@ -2202,10 +2239,9 @@ retry: * there is no need to initialise earliest_deadline * before. Normalise all old deadlines to now. */ - if (time_before(p->deadline, jiffies)) + dl = p->deadline + cache_distance(task_rq(p), rq, p); + if (time_before(dl, jiffies)) dl = jiffies; - else - dl = p->deadline; if (edt == idle || time_before(dl, earliest_deadline)) { @@ -2278,6 +2314,12 @@ static inline void set_rq_task(struct rq rq->rq_prio = p->prio; } +static void reset_rq_task(struct rq *rq, struct task_struct *p) +{ + rq->rq_policy = p->policy; + rq->rq_prio = p->prio; +} + /* * schedule() is the main scheduler function. */ @@ -2361,7 +2403,7 @@ need_resched_nonpreemptible: rq->curr = next; ++*switch_count; - context_switch(rq, prev, next); /* unlocks the rq */ + context_switch(rq, prev, next); /* unlocks the grq */ /* * the context switch might have flipped the stack from under * us, hence refresh the local variables. @@ -2522,7 +2564,7 @@ void __wake_up_locked_key(wait_queue_hea * * The sync wakeup differs that the waker knows that it will schedule * away soon, so while the target thread will be woken up, it will not - * be migrated to another CPU - ie. the two threads are 'synchronized' + * be migrated to another CPU - ie. the two threads are 'synchronised' * with each other. This can prevent needless bouncing between CPUs. * * On UP it can prevent extra preemption. 
@@ -2556,7 +2598,7 @@ EXPORT_SYMBOL_GPL(__wake_up_sync_key); * * The sync wakeup differs that the waker knows that it will schedule * away soon, so while the target thread will be woken up, it will not - * be migrated to another CPU - ie. the two threads are 'synchronized' + * be migrated to another CPU - ie. the two threads are 'synchronised' * with each other. This can prevent needless bouncing between CPUs. * * On UP it can prevent extra preemption. @@ -2921,8 +2963,10 @@ void set_user_nice(struct task_struct *p } /* Just resched the task, schedule() will know what to do. */ - if (task_running(p)) + if (task_running(p)) { resched_task(p); + reset_rq_task(rq, p); + } out_unlock: task_grq_unlock(&flags); } @@ -3060,8 +3104,10 @@ __setscheduler(struct task_struct *p, st * Reschedule if running. schedule() will know if it can continue * running or not. */ - if (task_running(p)) + if (task_running(p)) { resched_task(p); + reset_rq_task(rq, p); + } } /* @@ -3824,7 +3870,7 @@ void show_state_filter(unsigned long sta * NOTE: this function does not set the idle thread's NEED_RESCHED * flag, to make booting more robust. */ -void __cpuinit init_idle(struct task_struct *idle, int cpu) +void init_idle(struct task_struct *idle, int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -3972,7 +4018,7 @@ void wake_up_idle_cpu(int cpu) * This is safe, as this function is called with the timer * wheel base lock of (cpu) held. When the CPU is on the way * to idle and has not yet set rq->curr to idle then it will - * be serialized on the timer wheel base lock and take the new + * be serialised on the timer wheel base lock and take the new * timer into account automatically. */ if (unlikely(rq->curr != idle)) @@ -4441,7 +4487,7 @@ early_initcall(migration_init); #endif /* - * sched_domains_mutex serializes calls to arch_init_sched_domains, + * sched_domains_mutex serialises calls to arch_init_sched_domains, * detach_destroy_domains and partition_sched_domains. 
*/ static DEFINE_MUTEX(sched_domains_mutex); @@ -5077,7 +5123,7 @@ static void free_sched_groups(const stru #endif /* CONFIG_NUMA */ /* - * Initialize sched groups cpu_power. + * Initialise sched groups cpu_power. * * cpu_power indicates the capacity of sched group, which is used while * distributing the load between different sched groups in a sched domain. @@ -5129,7 +5175,7 @@ static void init_sched_groups_power(int } /* - * Initializers for schedule domains + * Initialisers for schedule domains * Non-inlined to reduce accumulated stack pressure in build_sched_domains() */ @@ -5536,7 +5582,7 @@ static struct sched_domain_attr *dattr_c static cpumask_var_t fallback_doms; /* - * arch_update_cpu_topology lets virtualized architectures update the + * arch_update_cpu_topology lets virtualised architectures update the * cpu core maps. It is supposed to return 1 if the topology changed * or 0 if it stayed the same. */ @@ -5827,6 +5873,9 @@ static int update_runtime(struct notifie void __init sched_init_smp(void) { + struct sched_domain *sd; + int cpu; + cpumask_var_t non_isolated_cpus; alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); @@ -5866,6 +5915,30 @@ void __init sched_init_smp(void) * fashion. */ rr_interval *= 1 + ilog2(num_online_cpus()); + + /* + * Set up the relative cache distance of each online cpu from each + * other in a simple array for quick lookup. 
+ */ + for_each_online_cpu(cpu) { + for_each_domain(cpu, sd) { + struct rq *rq = cpu_rq(cpu); + unsigned long locality; + int other_cpu; + + if (sd->level <= SD_LV_MC) + locality = 0; + else if (sd->level <= SD_LV_CPU) + locality = 1; + else + continue; + + for_each_cpu_mask(other_cpu, *sched_domain_span(sd)) { + if (locality < rq->cpu_locality[other_cpu]) + rq->cpu_locality[other_cpu] = locality; + } + } + } } #else void __init sched_init_smp(void) @@ -5882,9 +5955,9 @@ int in_sched_functions(unsigned long add && addr < (unsigned long)__sched_text_end); } -void sched_init(void) +void __init sched_init(void) { - int i; + int i, j; int highest_cpu = 0; prio_ratios[0] = 100; @@ -5925,6 +5998,17 @@ void sched_init(void) #ifdef CONFIG_SMP nr_cpu_ids = highest_cpu + 1; + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + + rq->cpu_locality = kmalloc(nr_cpu_ids * sizeof(unsigned long), GFP_NOWAIT); + for_each_possible_cpu(j) { + if (i == j) + rq->cpu_locality[j] = 0; + else + rq->cpu_locality[j] = 4; + } + } #endif #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -6051,7 +6135,7 @@ struct task_struct *curr_task(int cpu) * Description: This function must only be used when non-maskable interrupts * are serviced on a separate stack. It allows the architecture to switch the * notion of the current task on a cpu in a non-blocking manner. This function - * must be called with all CPU's synchronized, and interrupts disabled, the + * must be called with all CPU's synchronised, and interrupts disabled, the * and caller must save the original value of the current task (see * curr_task() above) and restore that value before reenabling interrupts and * re-starting the system. 
Index: linux-2.6.31-bfs/kernel/Kconfig.preempt =================================================================== --- linux-2.6.31-bfs.orig/kernel/Kconfig.preempt 2009-09-28 10:09:57.107347673 +1000 +++ linux-2.6.31-bfs/kernel/Kconfig.preempt 2009-09-30 21:35:26.454879235 +1000 @@ -1,7 +1,7 @@ choice prompt "Preemption Model" - default PREEMPT + default PREEMPT_NONE config PREEMPT_NONE bool "No Forced Preemption (Server)" @@ -16,6 +16,23 @@ config PREEMPT_NONE raw processing power of the kernel, irrespective of scheduling latencies. +config PREEMPT_VOLUNTARY + bool "Voluntary Kernel Preemption (Desktop)" + help + This option reduces the latency of the kernel by adding more + "explicit preemption points" to the kernel code. These new + preemption points have been selected to reduce the maximum + latency of rescheduling, providing faster application reactions, + at the cost of slightly lower throughput. + + This allows reaction to interactive events by allowing a + low priority process to voluntarily preempt itself even if it + is in kernel mode executing a system call. This allows + applications to run more 'smoothly' even when the system is + under load. + + Select this if you are building a kernel for a desktop system. + config PREEMPT bool "Preemptible Kernel (Low-Latency Desktop)" help Index: linux-2.6.31-bfs/init/main.c =================================================================== --- linux-2.6.31-bfs.orig/init/main.c 2009-09-27 09:33:26.840761727 +1000 +++ linux-2.6.31-bfs/init/main.c 2009-10-01 01:18:30.597861105 +1000 @@ -6,7 +6,7 @@ * GK 2/5/95 - Changed to support mounting root fs via NFS * Added initrd & change_root: Werner Almesberger & Hans Lermen, Feb '96 * Moan early if gcc is old, avoiding bogus kernels - Paul Gortmaker, May '96 - * Simplified starting of init: Michael A. Griffith <grif@acm.org> + * Simplified starting of init: Michael A. 
Griffith <grif@acm.org> */ #include <linux/types.h> @@ -843,6 +843,8 @@ static noinline int init_post(void) system_state = SYSTEM_RUNNING; numa_default_policy(); + printk(KERN_INFO"Running BFS CPU scheduler v0.240-test by Con Kolivas.\n"); + if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) printk(KERN_WARNING "Warning: unable to open an initial console.\n");