Experimental dual-policy extension for SMT (aka hyperthreaded) CPUs in the BFS scheduler. It is designed to further improve throughput by applying a FIFO-like cooperative scheduling policy to normal tasks on the secondary (and any further) SMT threads of each core, making the most of uninterrupted running and cache-locality effects, while leaving the primary thread of each core free to service low-latency tasks. -ck --- kernel/sched/bfs.c | 89 ++++++++++++++++++++++++++++++++++++++--------- kernel/sched/bfs_sched.h | 1 2 files changed, 73 insertions(+), 17 deletions(-) Index: linux-3.15.5-ck1/kernel/sched/bfs.c =================================================================== --- linux-3.15.5-ck1.orig/kernel/sched/bfs.c 2014-07-18 11:21:26.345767043 +1000 +++ linux-3.15.5-ck1/kernel/sched/bfs.c 2014-07-18 11:21:26.343767043 +1000 @@ -183,6 +183,8 @@ struct global_rq { }; #ifdef CONFIG_SMP +static cpumask_t schedule_cores __read_mostly; +static cpumask_t schedule_threads __read_mostly; /* * We add the notion of a root-domain which will be used to define per-domain @@ -693,13 +695,14 @@ static bool suitable_idle_cpus(struct ta return (cpus_intersects(p->cpus_allowed, grq.cpu_idle_map)); } -#define CPUIDLE_DIFF_THREAD (1) -#define CPUIDLE_DIFF_CORE (2) -#define CPUIDLE_CACHE_BUSY (4) -#define CPUIDLE_DIFF_CPU (8) -#define CPUIDLE_THREAD_BUSY (16) -#define CPUIDLE_THROTTLED (32) -#define CPUIDLE_DIFF_NODE (64) +#define CPUIDLE_SMT_THREAD (1) +#define CPUIDLE_DIFF_THREAD (2) +#define CPUIDLE_DIFF_CORE (4) +#define CPUIDLE_CACHE_BUSY (8) +#define CPUIDLE_DIFF_CPU (16) +#define CPUIDLE_THREAD_BUSY (32) +#define CPUIDLE_THROTTLED (64) +#define CPUIDLE_DIFF_NODE (128) static void resched_task(struct task_struct *p); static inline bool scaling_rq(struct rq *rq); @@ -718,6 +721,8 @@ static inline bool scaling_rq(struct rq * Other node, other CPU, idle cache, idle threads. * Other node, other CPU, busy cache, idle threads. * Other node, other CPU, busy threads. 
+ * + * Secondary SMT threads are preferred to their primary SMT core thread. */ static void resched_best_mask(int best_cpu, struct rq *rq, cpumask_t *tmpmask) @@ -737,6 +742,11 @@ resched_best_mask(int best_cpu, struct r ranking = 0; tmp_rq = cpu_rq(cpu_tmp); +#ifdef CONFIG_SCHED_SMT + if (!tmp_rq->thread) + ranking |= CPUIDLE_SMT_THREAD; +#endif + locality = rq->cpu_locality[cpu_tmp]; #ifdef CONFIG_NUMA if (locality > 3) @@ -1345,10 +1355,11 @@ static void try_preempt(struct task_stru if (p->policy == SCHED_IDLEPRIO) return; - if (likely(online_cpus(p))) - cpus_and(tmp, cpu_online_map, p->cpus_allowed); - else - return; + if (unlikely(!online_cpus(p))) + return; + + /* Normal cores we preempt if p's deadline is earlier than any running one */ + cpus_and(tmp, schedule_cores, p->cpus_allowed); highest_prio = latest_deadline = 0; @@ -1369,10 +1380,32 @@ static void try_preempt(struct task_stru } } - if (likely(highest_prio_rq)) { - if (can_preempt(p, highest_prio, highest_prio_rq->rq_deadline)) - resched_task(highest_prio_rq->curr); + if (likely(highest_prio_rq) && (can_preempt(p, highest_prio, highest_prio_rq->rq_deadline))) { + resched_task(highest_prio_rq->curr); + return; + } + + /* On SMT threads p only preempts if it's a better static priority */ + cpus_and(tmp, schedule_threads, p->cpus_allowed); + if (!cpumask_weight(&tmp)) + return; + + highest_prio = 0; + highest_prio_rq = NULL; + for_each_cpu_mask(cpu, tmp) { + struct rq *rq; + int rq_prio; + + rq = cpu_rq(cpu); + rq_prio = rq->curr->static_prio; + if (rq_prio > highest_prio) { + highest_prio = rq_prio; + highest_prio_rq = rq; + } } + + if (highest_prio > p->static_prio) + resched_task(highest_prio_rq->curr); } #else /* CONFIG_SMP */ static inline bool needs_other_cpu(struct task_struct *p, int cpu) @@ -2823,10 +2856,20 @@ static void task_running_tick(struct rq else rq->rq_time_slice = 0; } else if (rq->rq_time_slice >= RESCHED_US) - return; + return; + p = rq->curr; +#ifdef CONFIG_SCHED_SMT + if 
(rq->thread && !has_rt_policy(p)) { + /* There is no expiration of timeslices on threads, they run until they + * no longer want CPU or get preempted by better static prio tasks. + * Avoid false positives for soft lockups. */ + if (!(jiffies % HZ)) + touch_softlockup_watchdog(); + return; + } +#endif /* p->time_slice < RESCHED_US. We only modify task_struct under grq lock */ - p = rq->curr; grq_lock(); requeue_task(p); set_tsk_need_resched(p); @@ -6694,6 +6737,8 @@ void __init sched_init_smp(void) for_each_online_cpu(cpu) { struct rq *rq = cpu_rq(cpu); + cpumask_set_cpu(cpu, &schedule_cores); + /* First check if this cpu is in the same node */ for_each_domain(cpu, sd) { if (sd->level > SD_LV_NODE) @@ -6718,8 +6763,16 @@ void __init sched_init_smp(void) rq->cache_idle = cache_cpu_idle; #endif #ifdef CONFIG_SCHED_SMT - for_each_cpu_mask(other_cpu, *thread_cpumask(cpu)) + for_each_cpu_mask(other_cpu, *thread_cpumask(cpu)) { rq->cpu_locality[other_cpu] = 1; + if (other_cpu < cpu) { + /* Flag this as an SMT sibling that should not be part + * of the scheduling cores but in the threads group */ + rq->thread = true; + cpumask_clear_cpu(cpu, &schedule_cores); + cpumask_set_cpu(cpu, &schedule_threads); + } + } if (cpus_weight(*thread_cpumask(cpu)) > 1) rq->siblings_idle = siblings_cpu_idle; #endif @@ -6771,6 +6824,8 @@ void __init sched_init(void) init_defrootdomain(); grq.qnr = grq.idle_cpus = 0; cpumask_clear(&grq.cpu_idle_map); + cpumask_clear(&schedule_cores); + cpumask_clear(&schedule_threads); #else uprq = &per_cpu(runqueues, 0); #endif Index: linux-3.15.5-ck1/kernel/sched/bfs_sched.h =================================================================== --- linux-3.15.5-ck1.orig/kernel/sched/bfs_sched.h 2014-07-18 11:21:26.345767043 +1000 +++ linux-3.15.5-ck1/kernel/sched/bfs_sched.h 2014-07-18 11:21:26.343767043 +1000 @@ -37,6 +37,7 @@ struct rq { #ifdef CONFIG_SCHED_SMT bool (*siblings_idle)(int cpu); /* See if all smt siblings are idle */ + bool thread; #endif /* 
CONFIG_SCHED_SMT */ #ifdef CONFIG_SCHED_MC bool (*cache_idle)(int cpu);