Work around the inappropriate loss of CPU power across SMT siblings (aka hyperthreads) to processes of lesser scheduling policy or higher nice level. When deciding whether to schedule a process on a logical CPU that has SMT siblings, examine those siblings and decide whether the process should run on that CPU according to the following rules:

1. Processes of lesser policies will not run (e.g. SCHED_NORMAL vs SCHED_FIFO, SCHED_IDLEPRIO vs SCHED_NORMAL, etc.).
2. SCHED_NORMAL processes of higher nice level will not run unless their deadline is better.
3. Kernel threads are allowed to run regardless.

This makes both policy and nice level SMT aware, amplifying the advantages of using either. Overall throughput will likely be reduced slightly, but the throughput of critical tasks, when run at a higher policy or priority, will increase in real-world workloads. This includes better behaviour of desktop applications in the common setting of many background tasks/daemons running. A brief standalone sketch of the prio bias comparison these rules reduce to is appended after the diff for illustration.

-ck

---
Index: linux-3.15.5-ck1/kernel/sched/bfs.c
===================================================================
--- linux-3.15.5-ck1.orig/kernel/sched/bfs.c 2014-08-10 22:11:14.867751246 +1000
+++ linux-3.15.5-ck1/kernel/sched/bfs.c 2014-08-15 22:36:14.873564235 +1000
@@ -98,9 +98,16 @@
 #define is_rt_policy(policy) ((policy) == SCHED_FIFO || \
 (policy) == SCHED_RR)
 #define has_rt_policy(p) unlikely(is_rt_policy((p)->policy))
-#define idleprio_task(p) unlikely((p)->policy == SCHED_IDLEPRIO)
-#define iso_task(p) unlikely((p)->policy == SCHED_ISO)
-#define iso_queue(rq) unlikely((rq)->rq_policy == SCHED_ISO)
+
+#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO)
+#define idleprio_task(p) unlikely(is_idle_policy((p)->policy))
+#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO)
+#define idle_queue(rq) (unlikely(is_idle_policy((rq)->rq_policy)))
+
+#define is_iso_policy(policy) ((policy) == SCHED_ISO)
+#define iso_task(p) unlikely(is_iso_policy((p)->policy))
+#define iso_queue(rq) unlikely(is_iso_policy((rq)->rq_policy))
+#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO)
 #define rq_running_iso(rq) ((rq)->rq_prio == ISO_PRIO)

 #define ISO_PERIOD ((5 * HZ * grq.noc) + 1)
@@ -719,8 +726,7 @@ static inline bool scaling_rq(struct rq
  * Other node, other CPU, busy cache, idle threads.
  * Other node, other CPU, busy threads.
  */
-static void
-resched_best_mask(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
+static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
 {
 int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THROTTLED |
 CPUIDLE_THREAD_BUSY | CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY |
@@ -766,6 +772,12 @@ resched_best_mask(int best_cpu, struct r
 }
 }
 out:
+ return best_cpu;
+}
+
+static void resched_best_mask(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
+{
+ best_cpu = best_mask_cpu(best_cpu, rq, tmpmask);
 resched_task(cpu_rq(best_cpu)->curr);
 }

@@ -776,12 +788,82 @@ bool cpus_share_cache(int this_cpu, int
 return (this_rq->cpu_locality[that_cpu] < 3);
 }

-static void resched_best_idle(struct task_struct *p)
+#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT)
+
+#ifdef CONFIG_SCHED_SMT
+#ifdef CONFIG_SMT_NICE
+static const cpumask_t *thread_cpumask(int cpu);
+
+/* Find the best real time priority running on any SMT siblings of cpu and if
+ * none are running, the static priority of the best deadline task running.
+ * The lookups to the other runqueues are done lockless as the occasional wrong
+ * value would be harmless.
*/ +static int best_smt_bias(int cpu) +{ + int other_cpu, best_bias = 0; + + for_each_cpu_mask(other_cpu, *thread_cpumask(cpu)) { + struct rq *rq; + + if (other_cpu == cpu) + continue; + rq = cpu_rq(other_cpu); + if (rq_idle(rq)) + continue; + if (!rq->online) + continue; + if (likely(rq->rq_smt_bias > best_bias)) + best_bias = rq->rq_smt_bias; + } + return best_bias; +} + +static int task_prio_bias(struct task_struct *p) +{ + if (rt_task(p)) + return 1 << 30; + else if (task_running_iso(p)) + return 1 << 29; + else if (task_running_idle(p)) + return 0; + return MAX_PRIO - p->static_prio; +} + +/* We've already decided p can run on CPU, now test if it shouldn't for SMT + * nice reasons. */ +static bool smt_should_schedule(struct task_struct *p, int cpu) +{ + int best_bias; + + /* Kernel threads always run */ + if (unlikely(!p->mm)) + return true; + if (rt_task(p)) + return true; + best_bias = best_smt_bias(cpu); + /* The smt siblings are all idle or running IDLEPRIO */ + if (best_bias < 1) + return true; + if (task_prio_bias(p) < best_bias) + return false; + return true; +} +#endif +#endif + +static bool resched_best_idle(struct task_struct *p) { cpumask_t tmpmask; + int best_cpu; cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map); - resched_best_mask(task_cpu(p), task_rq(p), &tmpmask); + best_cpu = best_mask_cpu(task_cpu(p), task_rq(p), &tmpmask); +#ifdef CONFIG_SMT_NICE + if (!smt_should_schedule(p, best_cpu)) + return false; +#endif + resched_task(cpu_rq(best_cpu)->curr); + return true; } static inline void resched_suitable_idle(struct task_struct *p) @@ -1078,6 +1160,14 @@ static inline void return_task(struct ta #define tsk_is_polling(t) 0 #endif +static void __send_other_resched(struct task_struct *p, int cpu) +{ + /* NEED_RESCHED must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(p)) + smp_send_reschedule(cpu); +} + /* * resched_task - mark a task 'to be rescheduled now'. * @@ -1102,10 +1192,7 @@ void resched_task(struct task_struct *p) return; } - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(p)) - smp_send_reschedule(cpu); + __send_other_resched(p, cpu); } /** @@ -1253,8 +1340,6 @@ void kick_process(struct task_struct *p) EXPORT_SYMBOL_GPL(kick_process); #endif -#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) - /* * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the * basis of earlier deadlines. 
SCHED_IDLEPRIO don't preempt anything else or @@ -1323,10 +1408,8 @@ static void try_preempt(struct task_stru */ clear_sticky(p); - if (suitable_idle_cpus(p)) { - resched_best_idle(p); + if (suitable_idle_cpus(p) && resched_best_idle(p)) return; - } /* IDLEPRIO tasks never preempt anything but idle */ if (p->policy == SCHED_IDLEPRIO) @@ -1357,6 +1440,11 @@ static void try_preempt(struct task_stru } if (likely(highest_prio_rq)) { +#ifdef CONFIG_SMT_NICE + cpu = cpu_of(highest_prio_rq); + if (!smt_should_schedule(p, cpu)) + return; +#endif if (can_preempt(p, highest_prio, highest_prio_rq->rq_deadline)) resched_task(highest_prio_rq->curr); } @@ -2946,6 +3034,21 @@ static void time_slice_expired(struct ta { p->time_slice = timeslice(); p->deadline = grq.niffies + task_deadline_diff(p); +#ifdef CONFIG_SMT_NICE + if (!p->mm) + p->smt_bias = 0; + else if (rt_task(p)) + p->smt_bias = 1 << 30; + else if (task_running_iso(p)) + p->smt_bias = 1 << 29; + else if (idleprio_task(p)) { + if (task_running_idle(p)) + p->smt_bias = 0; + else + p->smt_bias = 1; + } else if (--p->smt_bias < 1) + p->smt_bias = MAX_PRIO - p->static_prio; +#endif } /* @@ -3074,6 +3177,10 @@ task_struct *earliest_deadline_task(stru if (needs_other_cpu(p, cpu)) continue; +#ifdef CONFIG_SMT_NICE + if (!smt_should_schedule(p, cpu)) + continue; +#endif /* * Soft affinity happens here by not scheduling a task * with its sticky flag set that ran on a different CPU @@ -3158,6 +3265,9 @@ static inline void set_rq_task(struct rq rq->rq_last_ran = p->last_ran = rq->clock_task; rq->rq_policy = p->policy; rq->rq_prio = p->prio; +#ifdef CONFIG_SMT_NICE + rq->rq_smt_bias = p->smt_bias; +#endif if (p != rq->idle) rq->rq_running = true; else @@ -3168,8 +3278,63 @@ static void reset_rq_task(struct rq *rq, { rq->rq_policy = p->policy; rq->rq_prio = p->prio; +#ifdef CONFIG_SMT_NICE + rq->rq_smt_bias = p->smt_bias; +#endif } +#ifdef CONFIG_SMT_NICE +/* Iterate over smt siblings when we've scheduled a process on cpu and decide + * whether they should continue running or be descheduled. */ +static void check_smt_siblings(int cpu) +{ + int other_cpu; + + for_each_cpu_mask(other_cpu, *thread_cpumask(cpu)) { + struct task_struct *p; + struct rq *rq; + + if (other_cpu == cpu) + continue; + rq = cpu_rq(other_cpu); + if (rq_idle(rq)) + continue; + if (!rq->online) + continue; + p = rq->curr; + if (!smt_should_schedule(p, cpu)) { + set_tsk_need_resched(p); + __send_other_resched(p, other_cpu); + } + } +} + +static void wake_smt_siblings(int cpu) +{ + int other_cpu; + + if (!queued_notrunning()) + return; + + for_each_cpu_mask(other_cpu, *thread_cpumask(cpu)) { + struct rq *rq; + + if (other_cpu == cpu) + continue; + rq = cpu_rq(other_cpu); + if (rq_idle(rq)) { + struct task_struct *p = rq->curr; + + set_tsk_need_resched(p); + __send_other_resched(p, other_cpu); + } + } +} +#else +static void check_smt_siblings(int __maybe_unused cpu) {} +static void wake_smt_siblings(int __maybe_unused cpu) {} +#endif + /* * schedule() is the main scheduler function. * @@ -3303,6 +3468,7 @@ need_resched: * again. 
*/ set_rq_task(rq, prev); + check_smt_siblings(cpu); grq_unlock_irq(); goto rerun_prev_unlocked; } else @@ -3336,6 +3502,10 @@ need_resched: if (rt_task(next)) unstick_task(rq, prev); set_rq_task(rq, next); + if (next != idle) + check_smt_siblings(cpu); + else + wake_smt_siblings(cpu); grq.nr_switches++; prev->on_cpu = false; next->on_cpu = true; @@ -3352,8 +3522,10 @@ need_resched: cpu = smp_processor_id(); rq = cpu_rq(cpu); idle = rq->idle; - } else + } else { + check_smt_siblings(cpu); grq_unlock_irq(); + } rerun_prev_unlocked: sched_preempt_enable_no_resched(); @@ -4833,6 +5005,9 @@ void init_idle(struct task_struct *idle, idle->state = TASK_RUNNING; /* Setting prio to illegal value shouldn't matter when never queued */ idle->prio = PRIO_LIMIT; +#ifdef CONFIG_SMT_NICE + idle->smt_bias = 0; +#endif set_rq_task(rq, idle); do_set_cpus_allowed(idle, &cpumask_of_cpu(cpu)); /* Silence PROVE_RCU */ Index: linux-3.15.5-ck1/kernel/sched/bfs_sched.h =================================================================== --- linux-3.15.5-ck1.orig/kernel/sched/bfs_sched.h 2014-08-10 22:11:14.867751246 +1000 +++ linux-3.15.5-ck1/kernel/sched/bfs_sched.h 2014-08-10 22:11:15.172751246 +1000 @@ -18,7 +18,9 @@ struct rq { u64 rq_last_ran; int rq_prio; bool rq_running; /* There is a task running */ - +#ifdef CONFIG_SMT_NICE + int rq_smt_bias; /* Policy/nice level bias across smt siblings */ +#endif /* Accurate timekeeping data */ u64 timekeep_clock; unsigned long user_pc, nice_pc, irq_pc, softirq_pc, system_pc, Index: linux-3.15.5-ck1/arch/x86/Kconfig =================================================================== --- linux-3.15.5-ck1.orig/arch/x86/Kconfig 2014-08-10 22:11:15.014751246 +1000 +++ linux-3.15.5-ck1/arch/x86/Kconfig 2014-08-10 22:11:15.012751246 +1000 @@ -784,10 +784,26 @@ config SCHED_SMT depends on X86_HT ---help--- SMT scheduler support improves the CPU scheduler's decision making - when dealing with Intel Pentium 4 chips with HyperThreading at a + when dealing with Intel P4/Core 2 chips with HyperThreading at a cost of slightly increased overhead in some places. If unsure say N here. +config SMT_NICE + bool "SMT (Hyperthreading) aware nice priority and policy support" + depends on X86_HT && SCHED_BFS + default y + ---help--- + Enabling Hyperthreading on Intel CPUs decreases the effectiveness + of the use of 'nice' levels and different scheduling policies + (e.g. realtime) due to sharing of CPU power between hyperthreads. + SMT nice support makes each logical CPU aware of what is running on + its hyperthread siblings, maintaining appropriate distribution of + CPU according to nice levels and scheduling policies at the expense + of slightly increased overhead. + + If unsure say Y here. + + config SCHED_MC def_bool y prompt "Multi-core scheduler support" Index: linux-3.15.5-ck1/include/linux/sched.h =================================================================== --- linux-3.15.5-ck1.orig/include/linux/sched.h 2014-08-10 22:11:15.175751246 +1000 +++ linux-3.15.5-ck1/include/linux/sched.h 2014-08-10 22:11:15.171751246 +1000 @@ -1204,6 +1204,9 @@ struct task_struct { struct list_head run_list; u64 last_ran; u64 sched_time; /* sched_clock time spent running */ +#ifdef CONFIG_SMT_NICE + int smt_bias; /* Policy/nice level bias across smt siblings */ +#endif #ifdef CONFIG_SMP bool sticky; /* Soft affined flag */ #endif
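
For illustration only (this is not part of the patch and is not kernel code): below is a minimal userspace sketch of the decision the rules above reduce to, mirroring task_prio_bias()/smt_should_schedule(). The hard-coded MAX_PRIO (140), the nice-to-static-priority mapping (nice 0 == 120), the simplified struct task and the example task names are stand-ins of my own, and the deadline/decay refinement applied via smt_bias in time_slice_expired() is deliberately left out.

/*
 * Standalone sketch of the SMT nice bias comparison.  NOT part of the patch;
 * all kernel constants and types are simplified stand-ins.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_PRIO 140                 /* MAX_RT_PRIO (100) + 40 nice levels */
#define NICE_TO_PRIO(nice) (120 + (nice))

enum policy { NORMAL, FIFO, RR, ISO, IDLEPRIO };

struct task {
	const char *comm;
	enum policy policy;
	int nice;                    /* only meaningful for NORMAL */
	bool kthread;                /* kernel threads always get to run */
};

/* Same ordering as task_prio_bias(): rt > iso > normal (by nice) > idleprio */
static int prio_bias(const struct task *p)
{
	if (p->policy == FIFO || p->policy == RR)
		return 1 << 30;
	if (p->policy == ISO)
		return 1 << 29;
	if (p->policy == IDLEPRIO)
		return 0;
	return MAX_PRIO - NICE_TO_PRIO(p->nice); /* 20 at nice 0, 1 at nice 19 */
}

/* May p run on a CPU whose busiest SMT sibling is running "sibling"? */
static bool smt_should_schedule(const struct task *p, const struct task *sibling)
{
	if (p->kthread)
		return true;         /* rule 3: kernel threads always run */
	if (p->policy == FIFO || p->policy == RR)
		return true;         /* realtime tasks always run */
	if (!sibling || sibling->policy == IDLEPRIO)
		return true;         /* siblings idle or IDLEPRIO: anything may run */
	return prio_bias(p) >= prio_bias(sibling); /* rules 1 and 2 */
}

int main(void)
{
	struct task nice0   = { "make",    NORMAL, 0,  false };
	struct task nice19  = { "distcc",  NORMAL, 19, false };
	struct task fifo    = { "jackd",   FIFO,   0,  false };
	struct task kworker = { "kworker", NORMAL, 0,  true };

	printf("nice 19 vs nice 0 sibling: %s\n",
	       smt_should_schedule(&nice19, &nice0) ? "runs" : "waits");
	printf("nice 0 vs FIFO sibling:    %s\n",
	       smt_should_schedule(&nice0, &fifo) ? "runs" : "waits");
	printf("kthread vs FIFO sibling:   %s\n",
	       smt_should_schedule(&kworker, &fifo) ? "runs" : "waits");
	return 0;
}

Built with any C compiler (e.g. cc -std=c99 on an arbitrarily named file), it prints "waits" for the nice 19 task next to a nice 0 sibling and for the SCHED_NORMAL task next to a SCHED_FIFO sibling, and "runs" for the kernel thread, matching rules 1-3 above.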