Documentation updates. Cleanup of used-once variables. time_diff bogus values are rare, make them unlikely() Only resched when changing the policy settings if the task lowers its effective priority/policy. sched_yield broke when we moved accounting into the local rq data, fix it. Add extensive documentation in Documentation/scheduler. _sid_'s changes: Change iso_ variables to int. Move rq->cpu into SMP only code and use cpu_of() to speed up UP. Get rid of unnecessary double negate in task_running. Make UP version of try_preempt function. thanks _sid_ ! --- Documentation/scheduler/sched-BFS.txt | 335 ++++++++++++++++++++++++++++++++++ init/main.c | 2 kernel/sched_bfs.c | 127 ++++++++---- 3 files changed, 420 insertions(+), 44 deletions(-) Index: linux-2.6.31-bfs/kernel/sched_bfs.c =================================================================== --- linux-2.6.31-bfs.orig/kernel/sched_bfs.c 2009-10-15 22:16:31.405401024 +1100 +++ linux-2.6.31-bfs/kernel/sched_bfs.c 2009-10-16 21:05:20.971775567 +1100 @@ -172,8 +172,8 @@ struct global_rq { unsigned long long nr_switches; struct list_head queue[PRIO_LIMIT]; DECLARE_BITMAP(prio_bitmap, PRIO_LIMIT + 1); - unsigned long iso_ticks; - unsigned short iso_refractory; + int iso_ticks; + int iso_refractory; #ifdef CONFIG_SMP unsigned long qnr; /* queued not running */ cpumask_t cpu_idle_map; @@ -210,9 +210,8 @@ struct rq { iowait_pc, idle_pc; atomic_t nr_iowait; - int cpu; /* cpu of this runqueue */ - #ifdef CONFIG_SMP + int cpu; /* cpu of this runqueue */ int online; struct root_domain *rd; @@ -339,7 +338,7 @@ inline void update_rq_clock(struct rq *r static inline int task_running(struct task_struct *p) { - return (!!p->oncpu); + return p->oncpu; } static inline void grq_lock(void) @@ -529,18 +528,31 @@ static void dequeue_task(struct task_str __clear_bit(p->prio, grq.prio_bitmap); } +/* + * When a task is freshly forked, the first_time_slice flag is set to say + * it has taken time_slice from its parent and if it exits on this first + * time_slice it can return its time_slice back to the parent. + */ static inline void reset_first_time_slice(struct task_struct *p) { if (unlikely(p->first_time_slice)) p->first_time_slice = 0; } +/* + * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as + * an idle task, we ensure none of the following conditions are met. + */ static int idleprio_suitable(struct task_struct *p) { return (!freezing(p) && !signal_pending(p) && !(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING))); } +/* + * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check + * that the iso_refractory flag is not set. + */ static int isoprio_suitable(void) { return !grq.iso_refractory; @@ -577,6 +589,10 @@ static inline void requeue_task(struct t sched_info_queued(p); } +/* + * Returns the relative length of deadline all compared to the shortest + * deadline which is that of nice -20. + */ static inline int task_prio_ratio(struct task_struct *p) { return prio_ratios[TASK_USER_PRIO(p)]; @@ -593,6 +609,11 @@ static inline int task_timeslice(struct } #ifdef CONFIG_SMP +/* + * qnr is the "queued but not running" count which is the total number of + * tasks on the global runqueue list waiting for cpu time but not actually + * currently running on a cpu. + */ static inline void inc_qnr(void) { grq.qnr++; @@ -608,6 +629,10 @@ static inline int queued_notrunning(void return grq.qnr; } +/* + * The cpu_idle_map stores a bitmap of all the cpus currently idle to + * allow easy lookup of whether any suitable idle cpus are available. + */ static inline void set_cpuidle_map(unsigned long cpu) { cpu_set(cpu, grq.cpu_idle_map); @@ -641,13 +666,14 @@ static inline void resched_suitable_idle * tasks within their shared cache CPUs only. CPUs on different nodes or not * even in this domain (NUMA) have "3" difference, allowing 4 times longer * deadlines before being taken onto another cpu, allowing for 2* the double - * seen by separate CPUs above. See sched_init_smp for how locality is - * determined. + * seen by separate CPUs above. + * Simple summary: Virtual deadlines are equal on shared cache CPUs, double + * on separate CPUs and quadruple in separate NUMA nodes. */ static inline int cache_distance(struct rq *task_rq, struct rq *rq, struct task_struct *p) { - return rq->cpu_locality[task_rq->cpu] * task_timeslice(p); + return rq->cpu_locality[cpu_of(task_rq)] * task_timeslice(p); } #else /* CONFIG_SMP */ static inline void inc_qnr(void) @@ -729,15 +755,11 @@ static int effective_prio(struct task_st } /* - * activate_task - move a task to the runqueue. Enter with grq locked. The rq - * doesn't really matter but gives us the local clock. + * activate_task - move a task to the runqueue. Enter with grq locked. */ static void activate_task(struct task_struct *p, struct rq *rq) { - u64 now; - update_rq_clock(rq); - now = rq->clock; /* * Sleep time is in units of nanosecs, so shift by 20 to get a @@ -747,7 +769,7 @@ static void activate_task(struct task_st if (unlikely(prof_on == SLEEP_PROFILING)) { if (p->state == TASK_UNINTERRUPTIBLE) profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), - (now - p->last_ran) >> 20); + (rq->clock - p->last_ran) >> 20); } p->prio = effective_prio(p); @@ -760,7 +782,7 @@ static void activate_task(struct task_st /* * deactivate_task - If it's running, it's not on the grq and we can just - * decrement the nr_running. + * decrement the nr_running. Enter with grq locked. */ static inline void deactivate_task(struct task_struct *p) { @@ -790,7 +812,7 @@ void set_task_cpu(struct task_struct *p, */ static inline void take_task(struct rq *rq, struct task_struct *p) { - set_task_cpu(p, rq->cpu); + set_task_cpu(p, cpu_of(rq)); dequeue_task(p); dec_qnr(); } @@ -1063,6 +1085,7 @@ EXPORT_SYMBOL_GPL(kick_process); * and highest_prio_rq are initialised only to silence the compiler. When * all else is equal, still prefer this_rq. */ +#ifdef CONFIG_SMP static void try_preempt(struct task_struct *p, struct rq *this_rq) { unsigned long latest_deadline = 0, cpu; @@ -1110,6 +1133,16 @@ out: resched_task(highest_prio_rq->curr); return; } +#else /* CONFIG_SMP */ +static void try_preempt(struct task_struct *p, struct rq *this_rq) +{ + if (p->prio < this_rq->rq_prio || + (p->prio == this_rq->rq_prio && p->policy == SCHED_NORMAL && + time_before(p->deadline, this_rq->rq_deadline))) + resched_task(this_rq->curr); + return; +} +#endif /* CONFIG_SMP */ /** * task_oncpu_function_call - call a function on the cpu on which a task runs @@ -1795,9 +1828,9 @@ update_cpu_clock(struct rq *rq, struct t * negative/overflow. time_diff is only used for internal scheduler * time_slice accounting. */ - if (time_diff <= 0) + if (unlikely(time_diff <= 0)) time_diff = JIFFIES_TO_NS(1) / 2; - else if (time_diff > JIFFIES_TO_NS(1)) + else if (unlikely(time_diff > JIFFIES_TO_NS(1))) time_diff = JIFFIES_TO_NS(1); rq->rq_time_slice -= time_diff / 1000; @@ -1818,7 +1851,7 @@ static u64 do_task_delta_exec(struct tas if (p == rq->curr) { update_rq_clock(rq); ns = rq->clock - rq->rq_last_ran; - if ((s64)ns < 0) + if (unlikely((s64)ns < 0)) ns = 0; } @@ -2171,8 +2204,8 @@ EXPORT_SYMBOL(sub_preempt_count); * same nice value, it proportions cpu according to nice level, it means the * task that last woke up the longest ago has the earliest deadline, thus * ensuring that interactive tasks get low latency on wake up. The CPU - * proportion works out to the square of the difference, so this equation will - * give nice 19 3% CPU compared to nice 0 and nice 0 3% compared to nice -20. + * proportion works out to the square of the virtual deadline difference, so + * this equation will give nice 19 3% CPU compared to nice 0. */ static inline int prio_deadline_diff(int user_prio) { @@ -2232,13 +2265,18 @@ static inline void check_deadline(struct * earliest deadline. * Finally if no SCHED_NORMAL tasks are found, SCHED_IDLEPRIO tasks are * selected by the earliest deadline. + * Once deadlines are expired (jiffies has passed it) tasks are chosen in FIFO + * order. Note that very few tasks will be FIFO for very long because they + * only end up that way if they sleep for long or if if there are enough fully + * cpu bound tasks to push the load to ~8 higher than the number of CPUs for + * nice 0. */ static inline struct task_struct *earliest_deadline_task(struct rq *rq, struct task_struct *idle) { unsigned long dl, earliest_deadline = 0; /* Initialise to silence compiler */ struct task_struct *p, *edt; - unsigned int cpu = rq->cpu; + unsigned int cpu = cpu_of(rq); struct list_head *queue; int idx = 0; @@ -2365,7 +2403,6 @@ asmlinkage void __sched schedule(void) unsigned long *switch_count; int deactivate, cpu; struct rq *rq; - u64 now; need_resched: preempt_disable(); @@ -2385,7 +2422,6 @@ need_resched_nonpreemptible: local_irq_disable(); update_rq_clock(rq); - now = rq->clock; update_cpu_clock(rq, prev, 0); grq_lock(); @@ -2426,7 +2462,7 @@ need_resched_nonpreemptible: else clear_cpuidle_map(cpu); - prev->last_ran = now; + prev->last_ran = rq->clock; if (likely(prev != next)) { sched_info_switch(prev, next); @@ -2596,7 +2632,7 @@ static void __wake_up_common(wait_queue_ list_for_each_safe(tmp, next, &q->task_list) { wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); - unsigned flags = curr->flags; + unsigned int flags = curr->flags; if (curr->func(curr, mode, sync, key) && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) @@ -3171,20 +3207,22 @@ static inline struct task_struct *find_p static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, int prio) { + int oldrtprio, oldprio; + BUG_ON(task_queued(p)); p->policy = policy; + oldrtprio = p->rt_priority; p->rt_priority = prio; p->normal_prio = normal_prio(p); + oldprio = p->prio; /* we are holding p->pi_lock already */ p->prio = rt_mutex_getprio(p); - /* - * Reschedule if running. schedule() will know if it can continue - * running or not. - */ if (task_running(p)) { - resched_task(p); reset_rq_task(rq, p); + /* Resched only if we might now be preempted */ + if (p->prio > oldprio || p->rt_priority > oldrtprio) + resched_task(p); } } @@ -3642,16 +3680,18 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t * sys_sched_yield - yield the current processor to other threads. * * This function yields the current CPU to other tasks. It does this by - * refilling the timeslice, resetting the deadline and scheduling away. + * zeroing the rq timeslice, which will reset the deadline, and then + * scheduling away. */ SYSCALL_DEFINE0(sched_yield) { struct task_struct *p; + struct rq *rq; p = current; - time_task_grq_lock_irq(p); - schedstat_inc(this_rq(), yld_count); - time_slice_expired(p); + rq = task_grq_lock_irq(p); + schedstat_inc(rq, yld_count); + rq->rq_time_slice = 0; requeue_task(p); /* @@ -4391,7 +4431,7 @@ static void unregister_sched_domain_sysc static void set_rq_online(struct rq *rq) { if (!rq->online) { - cpumask_set_cpu(rq->cpu, rq->rd->online); + cpumask_set_cpu(cpu_of(rq), rq->rd->online); rq->online = 1; } } @@ -4399,7 +4439,7 @@ static void set_rq_online(struct rq *rq) static void set_rq_offline(struct rq *rq) { if (rq->online) { - cpumask_clear_cpu(rq->cpu, rq->rd->online); + cpumask_clear_cpu(cpu_of(rq), rq->rd->online); rq->online = 0; } } @@ -4763,10 +4803,10 @@ static void rq_attach_root(struct rq *rq if (rq->rd) { old_rd = rq->rd; - if (cpumask_test_cpu(rq->cpu, old_rd->online)) + if (cpumask_test_cpu(cpu_of(rq), old_rd->online)) set_rq_offline(rq); - cpumask_clear_cpu(rq->cpu, old_rd->span); + cpumask_clear_cpu(cpu_of(rq), old_rd->span); /* * If we dont want to free the old_rt yet then @@ -4780,8 +4820,8 @@ static void rq_attach_root(struct rq *rq atomic_inc(&rd->refcount); rq->rd = rd; - cpumask_set_cpu(rq->cpu, rd->span); - if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) + cpumask_set_cpu(cpu_of(rq), rd->span); + if (cpumask_test_cpu(cpu_of(rq), cpu_online_mask)) set_rq_online(rq); grq_unlock_irqrestore(&flags); @@ -6057,13 +6097,13 @@ void __init sched_init(void) #endif for_each_possible_cpu(i) { rq = cpu_rq(i); - rq->cpu = i; rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc = rq->iowait_pc = rq->idle_pc = 0; #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; rq->online = 0; + rq->cpu = i; rq_attach_root(rq, &def_root_domain); #endif atomic_set(&rq->nr_iowait, 0); @@ -6079,7 +6119,8 @@ void __init sched_init(void) int j; rq = cpu_rq(i); - rq->cpu_locality = kmalloc(nr_cpu_ids * sizeof(unsigned long), GFP_NOWAIT); + rq->cpu_locality = kmalloc(nr_cpu_ids * sizeof(unsigned long), + GFP_NOWAIT); for_each_possible_cpu(j) { if (i == j) rq->cpu_locality[j] = 0; Index: linux-2.6.31-bfs/init/main.c =================================================================== --- linux-2.6.31-bfs.orig/init/main.c 2009-10-15 22:16:31.410400998 +1100 +++ linux-2.6.31-bfs/init/main.c 2009-10-16 21:17:24.421776188 +1100 @@ -843,7 +843,7 @@ static noinline int init_post(void) system_state = SYSTEM_RUNNING; numa_default_policy(); - printk(KERN_INFO"Running BFS CPU scheduler v0.303 by Con Kolivas.\n"); + printk(KERN_INFO"BFS CPU scheduler v0.304 by Con Kolivas.\n"); if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) printk(KERN_WARNING "Warning: unable to open an initial console.\n"); Index: linux-2.6.31-bfs/Documentation/scheduler/sched-BFS.txt =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.31-bfs/Documentation/scheduler/sched-BFS.txt 2009-10-15 22:16:43.362737359 +1100 @@ -0,0 +1,335 @@ +BFS - The Brain Fuck Scheduler by Con Kolivas. + +Goals. + +The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to +completely do away with the complex designs of the past for the cpu process +scheduler and instead implement one that is very simple in basic design. +The main focus of BFS is to achieve excellent desktop interactivity and +responsiveness without heuristics and tuning knobs that are difficult to +understand, impossible to model and predict the effect of, and when tuned to +one workload cause massive detriment to another. + + +Design summary. + +BFS is best described as a single runqueue, O(n) lookup, earliest effective +virtual deadline first design, loosely based on EEVDF (earliest eligible virtual +deadline first) and my previous Staircase Deadline scheduler. Each component +shall be described in order to understand the significance of, and reasoning for +it. The codebase when the first stable version was released was approximately +9000 lines less code than the existing mainline linux kernel scheduler (in +2.6.31). This does not even take into account the removal of documentation and +the cgroups code that is not used. + +Design reasoning. + +The single runqueue refers to the queued but not running processes for the +entire system, regardless of the number of CPUs. The reason for going back to +a single runqueue design is that once multiple runqueues are introduced, +per-CPU or otherwise, there will be complex interactions as each runqueue will +be responsible for the scheduling latency and fairness of the tasks only on its +own runqueue, and to achieve fairness and low latency across multiple CPUs, any +advantage in throughput of having CPU local tasks causes other disadvantages. +This is due to requiring a very complex balancing system to at best achieve some +semblance of fairness across CPUs and can only maintain relatively low latency +for tasks bound to the same CPUs, not across them. To increase said fairness +and latency across CPUs, the advantage of local runqueue locking, which makes +for better scalability, is lost due to having to grab multiple locks. + +A significant feature of BFS is that all accounting is done purely based on CPU +used and nowhere is sleep time used in any way to determine entitlement or +interactivity. Interactivity "estimators" that use some kind of sleep/run +algorithm are doomed to fail to detect all interactive tasks, and to falsely tag +tasks that aren't interactive as being so. The reason for this is that it is +close to impossible to determine that when a task is sleeping, whether it is +doing it voluntarily, as in a userspace application waiting for input in the +form of a mouse click or otherwise, or involuntarily, because it is waiting for +another thread, process, I/O, kernel activity or whatever. Thus, such an +estimator will introduce corner cases, and more heuristics will be required to +cope with those corner cases, introducing more corner cases and failed +interactivity detection and so on. Interactivity in BFS is built into the design +by virtue of the fact that tasks that are waking up have not used up their quota +of CPU time, and have earlier effective deadlines, thereby making it very likely +they will preempt any CPU bound task of equivalent nice level. See below for +more information on the virtual deadline mechanism. Even if they do not preempt +a running task, because the rr interval is guaranteed to have a bound upper +limit on how long a task will wait for, it will be scheduled within a timeframe +that will not cause visible interface jitter. + + +Design details. + +Task insertion. + +BFS inserts tasks into each relevant queue as an O(1) insertion into a double +linked list. On insertion, *every* running queue is checked to see if the newly +queued task can run on any idle queue, or preempt the lowest running task on the +system. This is how the cross-CPU scheduling of BFS achieves significantly lower +latency per extra CPU the system has. In this case the lookup is, in the worst +case scenario, O(n) where n is the number of CPUs on the system. + +Data protection. + +BFS has one single lock protecting the process local data of every task in the +global queue. Thus every insertion, removal and modification of task data in the +global runqueue needs to grab the global lock. However, once a task is taken by +a CPU, the CPU has its own local data copy of the running process' accounting +information which only that CPU accesses and modifies (such as during a +timer tick) thus allowing the accounting data to be updated lockless. Once a +CPU has taken a task to run, it removes it from the global queue. Thus the +global queue only ever has, at most, + + (number of tasks requesting cpu time) - (number of logical CPUs) + 1 + +tasks in the global queue. This value is relevant for the time taken to look up +tasks during scheduling. This will increase if many tasks with CPU affinity set +in their policy to limit which CPUs they're allowed to run on if they outnumber +the number of CPUs. The +1 is because when rescheduling a task, the CPU's +currently running task is put back on the queue. Lookup will be described after +the virtual deadline mechanism is explained. + +Virtual deadline. + +The key to achieving low latency, scheduling fairness, and "nice level" +distribution in BFS is entirely in the virtual deadline mechanism. The one +tunable in BFS is the rr_interval, or "round robin interval". This is the +maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) +tasks of the same nice level will be running for, or looking at it the other +way around, the longest duration two tasks of the same nice level will be +delayed for. When a task requests cpu time, it is given a quota (time_slice) +equal to the rr_interval and a virtual deadline. The virtual deadline is +offset from the current time in jiffies by this equation: + + jiffies + (prio_ratio * rr_interval) + +The prio_ratio is determined as a ratio compared to the baseline of nice -20 +and increases by 10% per nice level. The deadline is a virtual one only in that +no guarantee is placed that a task will actually be scheduled by this time, but +it is used to compare which task should go next. There are three components to +how a task is next chosen. First is time_slice expiration. If a task runs out +of its time_slice, it is descheduled, the time_slice is refilled, and the +deadline reset to that formula above. Second is sleep, where a task no longer +is requesting CPU for whatever reason. The time_slice and deadline are _not_ +adjusted in this case and are just carried over for when the task is next +scheduled. Third is preemption, and that is when a newly waking task is deemed +higher priority than a currently running task on any cpu by virtue of the fact +that it has an earlier virtual deadline than the currently running task. The +earlier deadline is the key to which task is next chosen for the first and +second cases. Once a task is descheduled, it is put back on the queue, and an +O(n) lookup of all queued-but-not-running tasks is done to determine which has +the earliest deadline and that task is chosen to receive CPU next. The one +caveat to this is that if a deadline has already passed (jiffies is greater +than the deadline), the tasks are chosen in FIFO (first in first out) order as +the deadlines are old and their absolute value becomes decreasingly relevant +apart from being a flag that they have been asleep and deserve CPU time ahead +of all later deadlines. + +The CPU proportion of different nice tasks works out to be approximately the + + (prio_ratio difference)^2 + +The reason it is squared is that a task's deadline does not change while it is +running unless it runs out of time_slice. Thus, even if the time actually +passes the deadline of another task that is queued, it will not get CPU time +unless the current running task deschedules, and the time "base" (jiffies) is +constantly moving. + +Task lookup. + +BFS has 103 priority queues. 100 of these are dedicated to the static priority +of realtime tasks, and the remaining 3 are, in order of best to worst priority, +SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority +scheduling). When a task of these priorities is queued, a bitmap of running +priorities is set showing which of these priorities has tasks waiting for CPU +time. When a CPU is made to reschedule, the lookup for the next task to get +CPU time is performed in the following way: + +First the bitmap is checked to see what static priority tasks are queued. If +any realtime priorities are found, the corresponding queue is checked and the +first task listed there is taken (provided CPU affinity is suitable) and lookup +is complete. If the priority corresponds to a SCHED_ISO task, they are also +taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds +to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this +stage, every task in the runlist that corresponds to that priority is checked +to see which has the earliest set deadline, and (provided it has suitable CPU +affinity) it is taken off the runqueue and given the CPU. If a task has an +expired deadline, it is taken and the rest of the lookup aborted (as they are +chosen in FIFO order). + +Thus, the lookup is O(n) in the worst case only, where n is as described +earlier, as tasks may be chosen before the whole task list is looked over. + + +Scalability. + +The major limitations of BFS will be that of scalability, as the separate +runqueue designs will have less lock contention as the number of CPUs rises. +However they do not scale linearly even with separate runqueues as multiple +runqueues will need to be locked concurrently on such designs to be able to +achieve fair CPU balancing, to try and achieve some sort of nice-level fairness +across CPUs, and to achieve low enough latency for tasks on a busy CPU when +other CPUs would be more suited. BFS has the advantage that it requires no +balancing algorithm whatsoever, as balancing occurs by proxy simply because +all CPUs draw off the global runqueue, in priority and deadline order. Despite +the fact that scalability is _not_ the prime concern of BFS, it both shows very +good scalability to smaller numbers of CPUs and is likely a more scalable design +at these numbers of CPUs. + +It also has some very low overhead scalability features built into the design +when it has been deemed their overhead is so marginal that they're worth adding. +The first is the local copy of the running process' data to the CPU it's running +on to allow that data to be updated lockless where possible. Then there is +deference paid to the last CPU a task was running on, by trying that CPU first +when looking for an idle CPU to use the next time it's scheduled. Finally there +is the notion of cache locality beyond the last running CPU. The sched_domains +information is used to determine the relative virtual "cache distance" that +other CPUs have from the last CPU a task was running on. CPUs with shared +caches, such as SMT siblings, or multicore CPUs with shared caches, are treated +as cache local. CPUs without shared caches are treated as not cache local, and +CPUs on different NUMA nodes are treated as very distant. This "relative cache +distance" is used by modifying the virtual deadline value when doing lookups. +Effectively, the deadline is unaltered between "cache local" CPUs, doubled for +"cache distant" CPUs, and quadrupled for "very distant" CPUs. The reasoning +behind the doubling of deadlines is as follows. The real cost of migrating a +task from one CPU to another is entirely dependant on the cache footprint of +the task, how cache intensive the task is, how long it's been running on that +CPU to take up the bulk of its cache, how big the CPU cache is, how fast and +how layered the CPU cache is, how fast a context switch is... and so on. In +other words, it's close to random in the real world where we do more than just +one sole workload. The only thing we can be sure of is that it's not free. So +BFS uses the principle that an idle CPU is a wasted CPU and utilising idle CPUs +is more important than cache locality, and cache locality only plays a part +after that. Doubling the effective deadline is based on the premise that the +"cache local" CPUs will tend to work on the same tasks up to double the number +of cache local CPUs, and once the workload is beyond that amount, it is likely +that none of the tasks are cache warm anywhere anyway. The quadrupling for NUMA +is a value I pulled out of my arse. + +Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark. +However this benchmarking was performed on an earlier design that was far less +scalable than the current one so it's hard to know how scalable it is in terms +of both CPUs (due to the global runqueue) and heavily loaded machines (due to +O(n) lookup) at this stage. Note that in terms of scalability, the number of +_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x) +quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark +results are very promising indeed, without needing to tweak any knobs, features +or options. Benchmark contributions are most welcome. + + +Features + +As the initial prime target audience for BFS was the average desktop user, it +was designed to not need tweaking, tuning or have features set to obtain benefit +from it. Thus the number of knobs and features has been kept to an absolute +minimum and should not require extra user input for the vast majority of cases. +There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval +and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition +to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is +support for CGROUPS. The average user should neither need to know what these +are, nor should they need to be using them to have good desktop behaviour. + +rr_interval + +There is only one "scheduler" tunable, the round robin interval. This can be +accessed in + + /proc/sys/kernel/rr_interval + +The value is in milliseconds, and the default value is set to 6 on a +uniprocessor machine, and automatically set to a progressively higher value on +multiprocessor machines. The reasoning behind increasing the value on more CPUs +is that the effective latency is decreased by virtue of there being more CPUs on +BFS (for reasons explained above), and increasing the value allows for less +cache contention and more throughput. Valid values are from 1 to 5000 +Decreasing the value will decrease latencies at the cost of decreasing +throughput, while increasing it will improve throughput, but at the cost of +worsening latencies. The accuracy of the rr interval is limited by HZ resolution +of the kernel configuration. Thus, the worst case latencies are usually slightly +higher than this actual value. The default value of 6 is not an arbitrary one. +It is based on the fact that humans can detect jitter at approximately 7ms, so +aiming for much lower latencies is pointless under most circumstances. It is +worth noting this fact when comparing the latency performance of BFS to other +schedulers. Worst case latencies being higher than 7ms are far worse than +average latencies not being in the microsecond range. + +Isochronous scheduling. + +Isochronous scheduling is a unique scheduling policy designed to provide +near-real-time performance to unprivileged (ie non-root) users without the +ability to starve the machine indefinitely. Isochronous tasks (which means +"same time") are set using, for example, the schedtool application like so: + + schedtool -I -e amarok + +This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works +is that it has a priority level between true realtime tasks and SCHED_NORMAL +which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, +if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval +rate). However if ISO tasks run for more than a tunable finite amount of time, +they are then demoted back to SCHED_NORMAL scheduling. This finite amount of +time is the percentage of _total CPU_ available across the machine, configurable +as a percentage in the following "resource handling" tunable (as opposed to a +scheduler tunable): + + /proc/sys/kernel/iso_cpu + +and is set to 70% by default. It is calculated over a rolling 5 second average +Because it is the total CPU available, it means that on a multi CPU machine, it +is possible to have an ISO task running as realtime scheduling indefinitely on +just one CPU, as the other CPUs will be available. Setting this to 100 is the +equivalent of giving all users SCHED_RR access and setting it to 0 removes the +ability to run any pseudo-realtime tasks. + +A feature of BFS is that it detects when an application tries to obtain a +realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the +appropriate privileges to use those policies. When it detects this, it will +give the task SCHED_ISO policy instead. Thus it is transparent to the user. +Because some applications constantly set their policy as well as their nice +level, there is potential for them to undo the override specified by the user +on the command line of setting the policy to SCHED_ISO. To counter this, once +a task has been set to SCHED_ISO policy, it needs superuser privileges to set +it back to SCHED_NORMAL. This will ensure the task remains ISO and all child +processes and threads will also inherit the ISO policy. + +Idleprio scheduling. + +Idleprio scheduling is a scheduling policy designed to give out CPU to a task +_only_ when the CPU would be otherwise idle. The idea behind this is to allow +ultra low priority tasks to be run in the background that have virtually no +effect on the foreground tasks. This is ideally suited to distributed computing +clients (like setiathome, folding, mprime etc) but can also be used to start +a video encode or so on without any slowdown of other tasks. To avoid this +policy from grabbing shared resources and holding them indefinitely, if it +detects a state where the task is waiting on I/O, the machine is about to +suspend to ram and so on, it will transiently schedule them as SCHED_NORMAL. As +per the Isochronous task management, once a task has been scheduled as IDLEPRIO, +it cannot be put back to SCHED_NORMAL without superuser privileges. Tasks can +be set to start as SCHED_IDLEPRIO with the schedtool command like so: + + schedtool -D -e ./mprime + +Subtick accounting. + +It is surprisingly difficult to get accurate CPU accounting, and in many cases, +the accounting is done by simply determining what is happening at the precise +moment a timer tick fires off. This becomes increasingly inaccurate as the +timer tick frequency (HZ) is lowered. It is possible to create an application +which uses almost 100% CPU, yet by being descheduled at the right time, records +zero CPU usage. While the main problem with this is that there are possible +security implications, it is also difficult to determine how much CPU a task +really does use. BFS tries to use the sub-tick accounting from the TSC clock, +where possible, to determine real CPU usage. This is not entirely reliable, but +is far more likely to produce accurate CPU usage data than the existing designs +and will not show tasks as consuming no CPU usage when they actually are. Thus, +the amount of CPU reported as being used by BFS will more accurately represent +how much CPU the task itself is using (as is shown for example by the 'time' +application), so the reported values may be quite different to other schedulers. +Values reported as the 'load' are more prone to problems with this design, but +per process values are closer to real usage. When comparing throughput of BFS +to other designs, it is important to compare the actual completed work in terms +of total wall clock time taken and total work done, rather than the reported +"cpu usage". + + +Con Kolivas 14th October 2009.