Work around the inappropriate loss of CPU power across SMT siblings, aka
hyperthreads, to processes of lower scheduling policy or higher nice level.
When making a decision to schedule on a logical CPU that has SMT siblings,
examine its siblings to decide whether or not to run a process on that CPU
according to the following rules:

1. Processes of lesser policies will not run (e.g. SCHED_NORMAL vs SCHED_FIFO,
   SCHED_IDLEPRIO vs SCHED_NORMAL, etc.).
2. SCHED_NORMAL processes of higher nice level will not run unless their
   deadline is better.
3. Kernel threads are allowed to run regardless.

This makes both policy and nice level SMT aware, amplifying the advantages of
using either. Overall throughput will likely be reduced slightly, though the
throughput of critical tasks run at higher policy or priority will increase in
real world workloads. This includes better behaviour of desktop applications
in the normal setting with lots of running background tasks/daemons. A small
standalone sketch of these rules in code form follows the patch below.

-ck

---
Index: linux-3.15.5-ck1/kernel/sched/bfs.c
===================================================================
--- linux-3.15.5-ck1.orig/kernel/sched/bfs.c	2014-07-29 13:25:22.557185875 +1000
+++ linux-3.15.5-ck1/kernel/sched/bfs.c	2014-08-01 20:08:56.918748397 +1000
@@ -98,9 +98,14 @@
 #define is_rt_policy(policy)	((policy) == SCHED_FIFO || \
 					(policy) == SCHED_RR)
 #define has_rt_policy(p)	unlikely(is_rt_policy((p)->policy))
-#define idleprio_task(p)	unlikely((p)->policy == SCHED_IDLEPRIO)
-#define iso_task(p)	unlikely((p)->policy == SCHED_ISO)
-#define iso_queue(rq)	unlikely((rq)->rq_policy == SCHED_ISO)
+
+#define is_idle_policy(policy)	((policy) == SCHED_IDLEPRIO)
+#define idleprio_task(p)	unlikely(is_idle_policy((p)->policy))
+#define idle_queue(rq)	(unlikely(is_idle_policy((rq)->rq_policy)))
+
+#define is_iso_policy(policy)	((policy) == SCHED_ISO)
+#define iso_task(p)	unlikely(is_iso_policy((p)->policy))
+#define iso_queue(rq)	unlikely(is_iso_policy((rq)->rq_policy))
 #define rq_running_iso(rq)	((rq)->rq_prio == ISO_PRIO)
 
 #define ISO_PERIOD	((5 * HZ * grq.noc) + 1)
@@ -719,8 +724,7 @@ static inline bool scaling_rq(struct rq
  * Other node, other CPU, busy cache, idle threads.
  * Other node, other CPU, busy threads.
  */
-static void
-resched_best_mask(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
+static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
 {
 	int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THROTTLED |
 		CPUIDLE_THREAD_BUSY | CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY |
@@ -766,6 +770,12 @@ resched_best_mask(int best_cpu, struct r
 		}
 	}
 out:
+	return best_cpu;
+}
+
+static void resched_best_mask(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
+{
+	best_cpu = best_mask_cpu(best_cpu, rq, tmpmask);
 	resched_task(cpu_rq(best_cpu)->curr);
 }
 
@@ -776,12 +786,90 @@ bool cpus_share_cache(int this_cpu, int
 	return (this_rq->cpu_locality[that_cpu] < 3);
 }
 
-static void resched_best_idle(struct task_struct *p)
+#define rq_idle(rq)	((rq)->rq_prio == PRIO_LIMIT)
+
+#ifdef CONFIG_SCHED_SMT
+#ifdef CONFIG_SMT_NICE
+#define PRIO_LOWEST	(MAX_PRIO + 2)
+#define PRIO_IDLE	(MAX_PRIO + 1)
+static const cpumask_t *thread_cpumask(int cpu);
+
+/* Find the best real time priority running on any SMT siblings of cpu and if
+ * none are running, the static priority of the best deadline task running.
+ * The lookups of the other runqueues are done locklessly as the occasional
+ * wrong value would be harmless. */
+static int best_smt_prio(int cpu)
+{
+	int best_prio = PRIO_LOWEST, other_cpu;
+	u64 earliest_deadline = ~0ULL;
+
+	for_each_cpu_mask(other_cpu, *thread_cpumask(cpu)) {
+		struct rq *rq;
+
+		if (other_cpu == cpu)
+			continue;
+		rq = cpu_rq(other_cpu);
+		if (rq_idle(rq))
+			continue;
+		if (!rq->online)
+			continue;
+		if (rt_queue(rq)) {
+			best_prio = MAX_RT_PRIO;
+		} else if (rq_running_iso(rq)) {
+			if (likely(best_prio > ISO_PRIO))
+				best_prio = ISO_PRIO;
+		} else if (idle_queue(rq)) {
+			if (likely(best_prio > PRIO_IDLE))
+				best_prio = PRIO_IDLE;
+		} else if (rq->rq_deadline < earliest_deadline && !rt_prio(best_prio)) {
+			/* Set best prio according to static priority of best
+			 * deadline task for normal tasks */
+			best_prio = rq->rq_static_prio;
+			earliest_deadline = rq->rq_deadline;
+		}
+	}
+	return best_prio;
+}
+
+/* We've already decided p can run on CPU, now test if it shouldn't for SMT
+ * nice reasons. */
+static bool smt_should_schedule(struct task_struct *p, int cpu)
+{
+	int best_prio;
+
+	/* Kernel threads always run */
+	if (!p->mm)
+		return true;
+	if (rt_task(p))
+		return true;
+	best_prio = best_smt_prio(cpu);
+	/* The smt siblings are all idle */
+	if (best_prio == PRIO_LOWEST)
+		return true;
+	if (iso_task(p) && best_prio > MAX_RT_PRIO)
+		return true;
+	if (p->static_prio > best_prio)
+		return false;
+	if (idleprio_task(p) && best_prio < PRIO_IDLE)
+		return false;
+	return true;
+}
+#endif
+#endif
+
+static bool resched_best_idle(struct task_struct *p)
 {
 	cpumask_t tmpmask;
+	int best_cpu;
 
 	cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map);
-	resched_best_mask(task_cpu(p), task_rq(p), &tmpmask);
+	best_cpu = best_mask_cpu(task_cpu(p), task_rq(p), &tmpmask);
+#ifdef CONFIG_SMT_NICE
+	if (!smt_should_schedule(p, best_cpu))
+		return false;
+#endif
+	resched_task(cpu_rq(best_cpu)->curr);
+	return true;
 }
 
 static inline void resched_suitable_idle(struct task_struct *p)
@@ -1078,6 +1166,14 @@ static inline void return_task(struct ta
 #define tsk_is_polling(t) 0
 #endif
 
+static void __send_other_resched(struct task_struct *p, int cpu)
+{
+	/* NEED_RESCHED must be visible before we test polling */
+	smp_mb();
+	if (!tsk_is_polling(p))
+		smp_send_reschedule(cpu);
+}
+
 /*
  * resched_task - mark a task 'to be rescheduled now'.
  *
@@ -1102,10 +1198,7 @@ void resched_task(struct task_struct *p)
 		return;
 	}
 
-	/* NEED_RESCHED must be visible before we test polling */
-	smp_mb();
-	if (!tsk_is_polling(p))
-		smp_send_reschedule(cpu);
+	__send_other_resched(p, cpu);
 }
 
 /**
@@ -1149,21 +1242,8 @@ unsigned long wait_task_inactive(struct
 	struct rq *rq;
 
 	for (;;) {
-		/*
-		 * We do the initial early heuristics without holding
-		 * any task-queue locks at all. We'll only try to get
-		 * the runqueue lock when things look like they will
-		 * work out! In the unlikely event rq is dereferenced
-		 * since we're lockless, grab it again.
-		 */
-#ifdef CONFIG_SMP
-retry_rq:
-		rq = task_rq(p);
-		if (unlikely(!rq))
-			goto retry_rq;
-#else /* CONFIG_SMP */
 		rq = task_rq(p);
-#endif
+
 		/*
 		 * If the task is actively running on another CPU
 		 * still, just relax and busy-wait without holding
@@ -1266,8 +1346,6 @@ void kick_process(struct task_struct *p)
 EXPORT_SYMBOL_GPL(kick_process);
 #endif
 
-#define rq_idle(rq)	((rq)->rq_prio == PRIO_LIMIT)
-
 /*
  * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the
  * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or
@@ -1336,10 +1414,8 @@ static void try_preempt(struct task_stru
 	 */
 	clear_sticky(p);
 
-	if (suitable_idle_cpus(p)) {
-		resched_best_idle(p);
+	if (suitable_idle_cpus(p) && resched_best_idle(p))
 		return;
-	}
 
 	/* IDLEPRIO tasks never preempt anything but idle */
 	if (p->policy == SCHED_IDLEPRIO)
@@ -1370,6 +1446,11 @@ static void try_preempt(struct task_stru
 	}
 
 	if (likely(highest_prio_rq)) {
+#ifdef CONFIG_SMT_NICE
+		cpu = cpu_of(highest_prio_rq);
+		if (!smt_should_schedule(p, cpu))
+			return;
+#endif
 		if (can_preempt(p, highest_prio, highest_prio_rq->rq_deadline))
 			resched_task(highest_prio_rq->curr);
 	}
@@ -3087,6 +3168,10 @@ task_struct *earliest_deadline_task(stru
 		if (needs_other_cpu(p, cpu))
 			continue;
 
+#ifdef CONFIG_SMT_NICE
+		if (!smt_should_schedule(p, cpu))
+			continue;
+#endif
 		/*
 		 * Soft affinity happens here by not scheduling a task
 		 * with its sticky flag set that ran on a different CPU
@@ -3171,6 +3256,7 @@ static inline void set_rq_task(struct rq
 	rq->rq_last_ran = p->last_ran = rq->clock_task;
 	rq->rq_policy = p->policy;
 	rq->rq_prio = p->prio;
+	rq->rq_static_prio = p->static_prio;
 	if (p != rq->idle)
 		rq->rq_running = true;
 	else
@@ -3181,8 +3267,54 @@ static void reset_rq_task(struct rq *rq,
 {
 	rq->rq_policy = p->policy;
 	rq->rq_prio = p->prio;
+	rq->rq_static_prio = p->static_prio;
 }
 
+#ifdef CONFIG_SMT_NICE
+/* Iterate over smt siblings when we've scheduled a process on cpu and decide
+ * whether they should continue running or be descheduled. */
+static void check_smt_siblings(int cpu)
+{
+	int other_cpu;
+
+	for_each_cpu_mask(other_cpu, *thread_cpumask(cpu)) {
+		struct task_struct *p;
+		struct rq *rq;
+
+		if (other_cpu == cpu)
+			continue;
+		rq = cpu_rq(other_cpu);
+		p = rq->curr;
+		if (!smt_should_schedule(p, cpu)) {
+			set_tsk_need_resched(p);
+			__send_other_resched(p, other_cpu);
+		}
+	}
+}
+
+static void wake_smt_siblings(int cpu)
+{
+	int other_cpu;
+
+	if (!queued_notrunning())
+		return;
+
+	for_each_cpu_mask(other_cpu, *thread_cpumask(cpu)) {
+		struct rq *rq;
+
+		if (other_cpu == cpu)
+			continue;
+		rq = cpu_rq(other_cpu);
+		if (rq_idle(rq)) {
+			struct task_struct *p = rq->curr;
+
+			set_tsk_need_resched(p);
+			__send_other_resched(p, other_cpu);
+		}
+	}
+}
+#endif
+
 /*
  * schedule() is the main scheduler function.
  *
@@ -3349,6 +3481,12 @@ need_resched:
 	if (rt_task(next))
 		unstick_task(rq, prev);
 	set_rq_task(rq, next);
+#ifdef CONFIG_SMT_NICE
+	if (next != idle)
+		check_smt_siblings(cpu);
+	else
+		wake_smt_siblings(cpu);
+#endif
 	grq.nr_switches++;
 	prev->on_cpu = false;
 	next->on_cpu = true;
Index: linux-3.15.5-ck1/kernel/sched/bfs_sched.h
===================================================================
--- linux-3.15.5-ck1.orig/kernel/sched/bfs_sched.h	2014-07-29 13:25:58.173182579 +1000
+++ linux-3.15.5-ck1/kernel/sched/bfs_sched.h	2014-07-29 15:49:12.648600170 +1000
@@ -17,6 +17,7 @@ struct rq {
 	int rq_time_slice;
 	u64 rq_last_ran;
 	int rq_prio;
+	int rq_static_prio;
 	bool rq_running; /* There is a task running */
 
 	/* Accurate timekeeping data */
Index: linux-3.15.5-ck1/arch/x86/Kconfig
===================================================================
--- linux-3.15.5-ck1.orig/arch/x86/Kconfig	2014-07-15 16:39:43.189254676 +1000
+++ linux-3.15.5-ck1/arch/x86/Kconfig	2014-08-01 18:15:51.814376426 +1000
@@ -784,10 +784,26 @@ config SCHED_SMT
 	depends on X86_HT
 	---help---
 	  SMT scheduler support improves the CPU scheduler's decision making
-	  when dealing with Intel Pentium 4 chips with HyperThreading at a
+	  when dealing with Intel P4/Core 2 chips with HyperThreading at a
 	  cost of slightly increased overhead in some places. If unsure say
 	  N here.
 
+config SMT_NICE
+	bool "SMT (Hyperthreading) aware nice priority and policy support"
+	depends on X86_HT && SCHED_BFS
+	default y
+	---help---
+	  Enabling Hyperthreading on Intel CPUs decreases the effectiveness
+	  of the use of 'nice' levels and different scheduling policies
+	  (e.g. realtime) due to sharing of CPU power between hyperthreads.
+	  SMT nice support makes each logical CPU aware of what is running on
+	  its hyperthread siblings, maintaining appropriate distribution of
+	  CPU according to nice levels and scheduling policies at the expense
+	  of slightly increased overhead.
+
+	  If unsure say Y here.
+
+
 config SCHED_MC
 	def_bool y
 	prompt "Multi-core scheduler support"
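For reference, here is a minimal standalone sketch of the three rules from the
description, runnable in user space with any C compiler. The enum ordering, the
smt_task struct and the smt_may_run() helper are invented purely for
illustration and exist nowhere in the patch; the real decision is made by
smt_should_schedule()/best_smt_prio() above, which work on runqueue state
rather than on task structs like this.

/* Illustrative only: a user-space model of the SMT nice rules above.
 * None of these names exist in the kernel patch. */
#include <stdbool.h>
#include <stdio.h>

enum smt_policy { POL_IDLEPRIO, POL_NORMAL, POL_ISO, POL_RT };

struct smt_task {
	enum smt_policy policy;
	int nice;			/* SCHED_NORMAL only */
	unsigned long long deadline;	/* SCHED_NORMAL only; lower is better */
	bool kthread;			/* kernel threads always run (rule 3) */
};

/* May 'p' run on a logical CPU whose busiest SMT sibling is running 'sib'? */
static bool smt_may_run(const struct smt_task *p, const struct smt_task *sib)
{
	if (p->kthread)
		return true;				/* rule 3 */
	if (p->policy != sib->policy)
		return p->policy > sib->policy;		/* rule 1: lesser policy loses */
	if (p->policy == POL_NORMAL && p->nice > sib->nice)
		return p->deadline < sib->deadline;	/* rule 2 */
	return true;
}

int main(void)
{
	struct smt_task fifo = { .policy = POL_RT };
	struct smt_task nice0 = { .policy = POL_NORMAL, .nice = 0, .deadline = 100 };
	struct smt_task nice19 = { .policy = POL_NORMAL, .nice = 19, .deadline = 500 };

	printf("nice 19 next to SCHED_FIFO sibling: %d\n", smt_may_run(&nice19, &fifo));
	printf("nice 19 next to nice 0 sibling:     %d\n", smt_may_run(&nice19, &nice0));
	printf("nice 0  next to nice 19 sibling:    %d\n", smt_may_run(&nice0, &nice19));
	return 0;
}

Expected output is 0, 0, 1: the niced task is kept off the sibling of the FIFO
task and of the nice 0 task (its deadline is worse), while the nice 0 task is
still allowed to run next to the niced one.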