---
 arch/x86/Kconfig          |    2 
 include/linux/init_task.h |    6 
 kernel/sched/bfs.c        |  351 ++++++++++++++++++++++++++++++++--------------
 kernel/sched/bfs_sched.h  |   10 +
 4 files changed, 265 insertions(+), 104 deletions(-)

Index: linux-4.3-bfs/kernel/sched/bfs.c
===================================================================
--- linux-4.3-bfs.orig/kernel/sched/bfs.c	2015-11-12 13:27:12.848609689 +1100
+++ linux-4.3-bfs/kernel/sched/bfs.c	2015-11-12 16:38:10.277799685 +1100
@@ -72,6 +72,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -134,7 +135,7 @@
 
 void print_scheduler_version(void)
 {
-	printk(KERN_INFO "BFS CPU scheduler v0.464 by Con Kolivas.\n");
+	printk(KERN_INFO "BFS CPU scheduler v0.465 by Con Kolivas.\n");
 }
 
 /*
@@ -228,6 +229,9 @@
 static struct global_rq grq;
 
 static DEFINE_MUTEX(sched_hotcpu_mutex);
 
+/* cpus with isolated domains */
+cpumask_var_t cpu_isolated_map;
+
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 #ifdef CONFIG_SMP
 struct rq *cpu_rq(int cpu)
@@ -240,7 +244,7 @@ struct rq *cpu_rq(int cpu)
  * sched_domains_mutex serialises calls to init_sched_domains,
  * detach_destroy_domains and partition_sched_domains.
  */
-static DEFINE_MUTEX(sched_domains_mutex);
+DEFINE_MUTEX(sched_domains_mutex);
 
 /*
  * By default the system creates a single root-domain with all cpus as
@@ -990,7 +994,7 @@ void set_task_cpu(struct task_struct *p,
 	if (task_cpu(p) == cpu)
 		return;
 	trace_sched_migrate_task(p, cpu);
-	perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
+	perf_event_task_migrate(p);
 
 	/*
 	 * After ->cpu is set up to a new value, task_grq_lock(p, ...) can be
@@ -1396,6 +1400,8 @@ static void try_preempt(struct task_stru
 			resched_curr(highest_prio_rq);
 	}
 }
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+				  const struct cpumask *new_mask, bool check);
 #else /* CONFIG_SMP */
 static inline bool needs_other_cpu(struct task_struct *p, int cpu)
 {
@@ -1409,6 +1415,12 @@ static void try_preempt(struct task_stru
 	if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline))
 		resched_curr(uprq);
 }
+
+static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+					 const struct cpumask *new_mask, bool check)
+{
+	return set_cpus_allowed_ptr(p, new_mask);
+}
 #endif /* CONFIG_SMP */
 
 static void
@@ -1491,7 +1503,7 @@ static inline void ttwu_activate(struct
 static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
 					bool success)
 {
-	trace_sched_wakeup(p, success);
+	trace_sched_wakeup(p);
 	p->state = TASK_RUNNING;
 
 	/*
@@ -1554,6 +1566,8 @@ static bool try_to_wake_up(struct task_s
 	if (!((unsigned int)p->state & state))
 		goto out_unlock;
 
+	trace_sched_waking(p);
+
 	if (task_queued(p) || task_running(p))
 		goto out_running;
 
@@ -1590,6 +1604,8 @@ static void try_to_wake_up_local(struct
 	if (!(p->state & TASK_NORMAL))
 		return;
 
+	trace_sched_waking(p);
+
 	if (!task_queued(p)) {
 		if (likely(!task_running(p))) {
 			schedstat_inc(rq, ttwu_count);
@@ -1676,7 +1692,7 @@ int sched_fork(unsigned long __maybe_unu
 	}
 
 	INIT_LIST_HEAD(&p->run_list);
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+#ifdef CONFIG_SCHED_INFO
 	if (unlikely(sched_info_on()))
 		memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
@@ -1721,7 +1737,7 @@ void wake_up_new_task(struct task_struct
 		p->prio = rq->curr->normal_prio;
 
 	activate_task(p, rq);
-	trace_sched_wakeup_new(p, 1);
+	trace_sched_wakeup_new(p);
 	if (unlikely(p->policy == SCHED_FIFO))
 		goto after_ts_init;
 
@@ -1766,12 +1782,29 @@ after_ts_init:
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 
+static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
+
+void preempt_notifier_inc(void)
+{
+	static_key_slow_inc(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_inc);
+
+void preempt_notifier_dec(void)
+{
+	static_key_slow_dec(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_dec);
+
 /**
  * preempt_notifier_register - tell me when current is being preempted & rescheduled
  * @notifier: notifier struct to register
  */
 void preempt_notifier_register(struct preempt_notifier *notifier)
 {
+	if (!static_key_false(&preempt_notifier_key))
+		WARN(1, "registering preempt_notifier while notifiers disabled\n");
+
 	hlist_add_head(&notifier->link, &current->preempt_notifiers);
 }
 EXPORT_SYMBOL_GPL(preempt_notifier_register);
@@ -1780,7 +1813,7 @@ EXPORT_SYMBOL_GPL(preempt_notifier_regis
  * preempt_notifier_unregister - no longer interested in preemption notifications
  * @notifier: notifier struct to unregister
  *
- * This is safe to call from within a preemption notifier.
+ * This is *not* safe to call from within a preemption notifier.
  */
 void preempt_notifier_unregister(struct preempt_notifier *notifier)
 {
@@ -1788,7 +1821,7 @@ void preempt_notifier_unregister(struct
 }
 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
 
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
 {
 	struct preempt_notifier *notifier;
 
@@ -1796,8 +1829,14 @@ static void fire_sched_in_preempt_notifi
 		notifier->ops->sched_in(notifier, raw_smp_processor_id());
 }
 
+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+	if (static_key_false(&preempt_notifier_key))
+		__fire_sched_in_preempt_notifiers(curr);
+}
+
 static void
-fire_sched_out_preempt_notifiers(struct task_struct *curr,
+__fire_sched_out_preempt_notifiers(struct task_struct *curr,
 				 struct task_struct *next)
 {
 	struct preempt_notifier *notifier;
 
@@ -1806,13 +1845,21 @@ fire_sched_out_preempt_notifiers(struct
 		notifier->ops->sched_out(notifier, next);
 }
 
+static __always_inline void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+				 struct task_struct *next)
+{
+	if (static_key_false(&preempt_notifier_key))
+		__fire_sched_out_preempt_notifiers(curr, next);
+}
+
 #else /* !CONFIG_PREEMPT_NOTIFIERS */
 
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
 {
 }
 
-static void
+static inline void
 fire_sched_out_preempt_notifiers(struct task_struct *curr,
 				 struct task_struct *next)
 {
@@ -1878,15 +1925,14 @@ static struct rq *finish_task_switch(str
 	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
 	 * schedule one last time. The schedule call will never return, and
 	 * the scheduled task must drop that reference.
-	 * The test for TASK_DEAD must occur while the runqueue locks are
-	 * still held, otherwise prev could be scheduled on another cpu, die
-	 * there before we look at prev->state, and then the reference would
-	 * be dropped twice.
-	 *		Manfred Spraul
+	 *
+	 * We must observe prev->state before clearing prev->on_cpu (in
+	 * finish_lock_switch), otherwise a concurrent wakeup can get prev
+	 * running on another CPU and we could race with its RUNNING -> DEAD
+	 * transition, resulting in a double drop.
 	 */
 	prev_state = prev->state;
 	vtime_task_switch(prev);
-	finish_arch_switch(prev);
 	perf_event_task_sched_in(prev, current);
 	finish_lock_switch(rq, prev);
 	finish_arch_post_lock_switch();
@@ -1963,9 +2009,7 @@ context_switch(struct rq *rq, struct tas
 	spin_release(&grq.lock.dep_map, 1, _THIS_IP_);
 
 	/* Here we just switch the register state and the stack. */
-	context_tracking_task_switch(prev, next);
 	switch_to(prev, next, prev);
-	barrier();
 
 	return finish_task_switch(prev);
 
@@ -1999,6 +2043,16 @@ static unsigned long nr_uninterruptible(
 
 /*
  * Check if only the current task is running on the cpu.
+ *
+ * Caution: this function does not check that the caller has disabled
+ * preemption, thus the result might have a time-of-check-to-time-of-use
+ * race. The caller is responsible to use it correctly, for example:
+ *
+ * - from a non-preemptable section (of course)
+ *
+ * - from a thread that is bound to a single CPU
+ *
+ * - in a loop with very short iterations (e.g. a polling loop)
  */
 bool single_task_running(void)
 {
@@ -2045,10 +2099,10 @@ unsigned long nr_active(void)
  * set to this cpu as being the CPU they're more likely to run on.
  */
 void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
 {
-	struct rq *this = this_rq();
+	struct rq *rq = this_rq();
 
-	*nr_waiters = atomic_read(&this->nr_iowait);
-	*load = this->soft_affined;
+	*nr_waiters = atomic_read(&rq->nr_iowait);
+	*load = rq->soft_affined;
 }
 
 /* Variables and functions for calc_load */
@@ -3353,21 +3407,16 @@ static void wake_smt_siblings(int __mayb
  *      - return from syscall or exception to user-space
  *      - return from interrupt-handler to user-space
  *
- * WARNING: all callers must re-check need_resched() afterward and reschedule
- * accordingly in case an event triggered the need for rescheduling (such as
- * an interrupt waking up a task) while preemption was disabled in __schedule().
+ * WARNING: must be called with preemption disabled!
  */
 static void __sched __schedule(void)
 {
 	struct task_struct *prev, *next, *idle;
 	unsigned long *switch_count;
-	bool deactivate;
+	bool deactivate = false;
 	struct rq *rq;
 	int cpu;
 
-need_resched:
-	deactivate = false;
-	preempt_disable();
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
 	rcu_note_context_switch();
@@ -3413,17 +3462,6 @@ need_resched:
 		switch_count = &prev->nvcsw;
 	}
 
-	/*
-	 * If we are going to sleep and we have plugged IO queued, make
-	 * sure to submit it to avoid deadlocks.
-	 */
-	if (unlikely(deactivate && blk_needs_flush_plug(prev))) {
-		grq_unlock_irq();
-		preempt_enable_no_resched();
-		blk_schedule_flush_plug(prev);
-		goto need_resched;
-	}
-
 	update_clocks(rq);
 	update_cpu_clock_switch(rq, prev);
 	if (rq->clock - rq->last_tick > HALF_JIFFY_NS)
@@ -3511,13 +3549,33 @@ need_resched:
 	}
 
 rerun_prev_unlocked:
-	sched_preempt_enable_no_resched();
+	return;
+}
+
+static inline void sched_submit_work(struct task_struct *tsk)
+{
+	if (!tsk->state || tsk_is_pi_blocked(tsk) ||
+	    (preempt_count() & PREEMPT_ACTIVE) ||
+	    signal_pending_state(tsk->state, tsk))
+		return;
+
+	/*
+	 * If we are going to sleep and we have plugged IO queued,
+	 * make sure to submit it to avoid deadlocks.
+	 */
+	if (blk_needs_flush_plug(tsk))
+		blk_schedule_flush_plug(tsk);
+}
 
 asmlinkage __visible void __sched schedule(void)
 {
+	struct task_struct *tsk = current;
+
+	sched_submit_work(tsk);
 	do {
+		preempt_disable();
 		__schedule();
+		sched_preempt_enable_no_resched();
 	} while (need_resched());
 }
@@ -3557,15 +3615,14 @@ void __sched schedule_preempt_disabled(v
 static void __sched notrace preempt_schedule_common(void)
 {
 	do {
-		__preempt_count_add(PREEMPT_ACTIVE);
+		preempt_active_enter();
 		__schedule();
-		__preempt_count_sub(PREEMPT_ACTIVE);
+		preempt_active_exit();
 
 		/*
 		 * Check again in case we missed a preemption opportunity
 		 * between schedule and now.
 		 */
-		barrier();
 	} while (need_resched());
 }
@@ -3589,9 +3646,8 @@ asmlinkage __visible void __sched notrac
 NOKPROBE_SYMBOL(preempt_schedule);
 EXPORT_SYMBOL(preempt_schedule);
 
-#ifdef CONFIG_CONTEXT_TRACKING
 /**
- * preempt_schedule_context - preempt_schedule called by tracing
+ * preempt_schedule_notrace - preempt_schedule called by tracing
  *
  * The tracing infrastructure uses preempt_enable_notrace to prevent
  * recursion and tracing preempt enabling caused by the tracing
@@ -3604,7 +3660,7 @@ EXPORT_SYMBOL(preempt_schedule);
  * instead of preempt_schedule() to exit user context if needed before
  * calling the scheduler.
  */
-asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
 {
 	enum ctx_state prev_ctx;
 
@@ -3612,7 +3668,13 @@ asmlinkage __visible void __sched notrac
 		return;
 
 	do {
-		__preempt_count_add(PREEMPT_ACTIVE);
+		/*
+		 * Use raw __preempt_count() ops that don't call function.
+		 * We can't call functions before disabling preemption which
+		 * will disarm preemption tracing recursions.
+		 */
+		__preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
+		barrier();
 		/*
 		 * Needs preempt disabled in case user_exit() is traced
 		 * and the tracer calls preempt_enable_notrace() causing
@@ -3622,12 +3684,11 @@ asmlinkage __visible void __sched notrac
 		__schedule();
 		exception_exit(prev_ctx);
 
-		__preempt_count_sub(PREEMPT_ACTIVE);
 		barrier();
+		__preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
 	} while (need_resched());
 }
-EXPORT_SYMBOL_GPL(preempt_schedule_context);
-#endif /* CONFIG_CONTEXT_TRACKING */
+EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
 
 #endif /* CONFIG_PREEMPT */
@@ -3647,17 +3708,11 @@ asmlinkage __visible void __sched preemp
 	prev_state = exception_enter();
 
 	do {
-		__preempt_count_add(PREEMPT_ACTIVE);
+		preempt_active_enter();
 		local_irq_enable();
-		schedule();
+		__schedule();
 		local_irq_disable();
-		__preempt_count_sub(PREEMPT_ACTIVE);
-
-		/*
-		 * Check again in case we missed a preemption opportunity
-		 * between schedule and now.
-		 */
-		barrier();
+		preempt_active_exit();
 	} while (need_resched());
 
 	exception_exit(prev_state);
@@ -3911,9 +3966,16 @@ static void __setscheduler(struct task_s
 	 * Keep a potential priority boosting if called from
 	 * sched_setscheduler().
 	 */
-	if (keep_boost)
+	if (keep_boost) {
+		/*
+		 * Take priority boosted tasks into account. If the new
+		 * effective priority is unchanged, we just store the new
+		 * normal parameters and do not touch the scheduler class and
+		 * the runqueue. This will be done when the task deboosts
+		 * itself.
+		 */
 		p->prio = rt_mutex_get_effective_prio(p, p->normal_prio);
-	else
+	} else
 		p->prio = p->normal_prio;
 	if (task_running(p)) {
 		reset_rq_task(rq, p);
@@ -3939,8 +4001,9 @@ static bool check_same_owner(struct task
 	return match;
 }
 
-static int __sched_setscheduler(struct task_struct *p, int policy,
-				const struct sched_param *param, bool user)
+static int
+__sched_setscheduler(struct task_struct *p, int policy,
+		     const struct sched_param *param, bool user, bool pi)
 {
 	struct sched_param zero_param = { .sched_priority = 0 };
 	int queued, retval, oldpolicy = -1;
@@ -4095,7 +4158,7 @@ recheck:
 	queued = task_queued(p);
 	if (queued)
 		dequeue_task(p);
-	__setscheduler(p, rq, policy, param->sched_priority, true);
+	__setscheduler(p, rq, policy, param->sched_priority, pi);
 	if (queued) {
 		enqueue_task(p, rq);
 		try_preempt(p, rq);
@@ -4103,7 +4166,8 @@ recheck:
 	__task_grq_unlock();
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
-	rt_mutex_adjust_pi(p);
+	if (pi)
+		rt_mutex_adjust_pi(p);
 out:
 	return 0;
 }
@@ -4121,7 +4185,7 @@ out:
 int sched_setscheduler(struct task_struct *p, int policy,
 		       const struct sched_param *param)
 {
-	return __sched_setscheduler(p, policy, param, true);
+	return __sched_setscheduler(p, policy, param, true, true);
 }
 EXPORT_SYMBOL_GPL(sched_setscheduler);
@@ -4131,7 +4195,7 @@ int sched_setattr(struct task_struct *p,
 	const struct sched_param param = { .sched_priority = attr->sched_priority };
 	int policy = attr->sched_policy;
 
-	return __sched_setscheduler(p, policy, &param, true);
+	return __sched_setscheduler(p, policy, &param, true, true);
 }
 EXPORT_SYMBOL_GPL(sched_setattr);
@@ -4151,7 +4215,7 @@ EXPORT_SYMBOL_GPL(sched_setattr);
 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
 			       const struct sched_param *param)
 {
-	return __sched_setscheduler(p, policy, param, false);
+	return __sched_setscheduler(p, policy, param, false, true);
 }
 
 static int
@@ -4521,7 +4585,7 @@ long sched_setaffinity(pid_t pid, const
 	cpuset_cpus_allowed(p, cpus_allowed);
 	cpumask_and(new_mask, in_mask, cpus_allowed);
 again:
-	retval = set_cpus_allowed_ptr(p, new_mask);
+	retval = __set_cpus_allowed_ptr(p, new_mask, true);
 
 	if (!retval) {
 		cpuset_cpus_allowed(p, cpus_allowed);
@@ -4680,7 +4744,7 @@ SYSCALL_DEFINE0(sched_yield)
 
 int __sched _cond_resched(void)
 {
-	if (should_resched()) {
+	if (should_resched(0)) {
 		preempt_schedule_common();
 		return 1;
 	}
@@ -4698,7 +4762,7 @@ EXPORT_SYMBOL(_cond_resched);
  */
 int __cond_resched_lock(spinlock_t *lock)
 {
-	int resched = should_resched();
+	int resched = should_resched(PREEMPT_LOCK_OFFSET);
 	int ret = 0;
 
 	lockdep_assert_held(lock);
@@ -4720,7 +4784,7 @@ int __sched __cond_resched_softirq(void)
 {
 	BUG_ON(!in_softirq());
 
-	if (should_resched()) {
+	if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
 		local_bh_enable();
 		preempt_schedule_common();
 		local_bh_disable();
@@ -5012,6 +5076,12 @@ void dump_cpu_task(int cpu)
 }
 
 #ifdef CONFIG_SMP
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
+{
+	cpumask_copy(&p->cpus_allowed, new_mask);
+	p->nr_cpus_allowed = cpumask_weight(new_mask);
+}
+
 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
 	cpumask_copy(tsk_cpus_allowed(p), new_mask);
@@ -5031,7 +5101,8 @@ void init_idle(struct task_struct *idle,
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 
-	time_grq_lock(rq, &flags);
+	raw_spin_lock_irqsave(&idle->pi_lock, flags);
+	time_lock_grq(rq);
 	idle->last_ran = rq->clock_task;
 	idle->state = TASK_RUNNING;
 	/* Setting prio to illegal value shouldn't matter when never queued */
@@ -5047,13 +5118,14 @@ void init_idle(struct task_struct *idle,
 	rcu_read_unlock();
 
 	rq->curr = rq->idle = idle;
 	idle->on_cpu = 1;
-	grq_unlock_irqrestore(&flags);
+	grq_unlock();
+	raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
 
 	/* Set the preempt count _outside_ the spinlocks! */
 	init_idle_preempt_count(idle, cpu);
 
 	ftrace_graph_init_idle_task(idle, cpu);
-#if defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
 	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
 #endif
 }
@@ -5084,6 +5156,52 @@ int task_can_attach(struct task_struct *
 	return ret;
 }
 
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+	struct wake_q_node *node = &task->wake_q;
+
+	/*
+	 * Atomically grab the task, if ->wake_q is !nil already it means
+	 * it's already queued (either by us or someone else) and will get the
+	 * wakeup due to that.
+	 *
+	 * This cmpxchg() implies a full barrier, which pairs with the write
+	 * barrier implied by the wakeup in wake_up_list().
+	 */
+	if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
+		return;
+
+	get_task_struct(task);
+
+	/*
+	 * The head is context local, there can be no concurrency.
+	 */
+	*head->lastp = node;
+	head->lastp = &node->next;
+}
+
+void wake_up_q(struct wake_q_head *head)
+{
+	struct wake_q_node *node = head->first;
+
+	while (node != WAKE_Q_TAIL) {
+		struct task_struct *task;
+
+		task = container_of(node, struct task_struct, wake_q);
+		BUG_ON(!task);
+		/* task can safely be re-inserted now */
+		node = node->next;
+		task->wake_q.next = NULL;
+
+		/*
+		 * wake_up_process() implies a wmb() to pair with the queueing
+		 * in wake_q_add() so as not to miss wakeups.
+		 */
+		wake_up_process(task);
+		put_task_struct(task);
+	}
+}
+
 void resched_cpu(int cpu)
 {
 	unsigned long flags;
@@ -5149,24 +5267,26 @@ static inline struct sched_domain *lowes
  * selecting an idle cpu will add more delays to the timers than intended
  * (as that cpu's timer base may not be uptodate wrt jiffies etc).
  */
-int get_nohz_timer_target(int pinned)
+int get_nohz_timer_target(void)
 {
-	int cpu = smp_processor_id();
-	int i;
+	int i, cpu = smp_processor_id();
 	struct sched_domain *sd;
 
-	if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
+	if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
 		return cpu;
 
 	rcu_read_lock();
 	for_each_domain(cpu, sd) {
 		for_each_cpu(i, sched_domain_span(sd)) {
-			if (!idle_cpu(i)) {
+			if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) {
 				cpu = i;
 				goto unlock;
 			}
 		}
 	}
+
+	if (!is_housekeeping_cpu(cpu))
+		cpu = housekeeping_any_cpu();
 unlock:
 	rcu_read_unlock();
 	return cpu;
@@ -5206,7 +5326,8 @@ void wake_up_nohz_cpu(int cpu)
  * task must not exit() & deallocate itself prematurely. The
  * call is not atomic; no spinlocks may be held.
  */
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+				  const struct cpumask *new_mask, bool check)
 {
 	bool running_wrong = false;
 	bool queued = false;
@@ -5216,6 +5337,15 @@ int set_cpus_allowed_ptr(struct task_str
 
 	rq = task_grq_lock(p, &flags);
 
+	/*
+	 * Must re-check here, to close a race against __kthread_bind(),
+	 * sched_setaffinity() is not guaranteed to observe the flag.
+	 */
+	if (check && (p->flags & PF_NO_SETAFFINITY)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
 	if (cpumask_equal(tsk_cpus_allowed(p), new_mask))
 		goto out;
@@ -5252,23 +5382,26 @@ out:
 
 	return ret;
 }
+
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+	return __set_cpus_allowed_ptr(p, new_mask, false);
+}
 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 
 #ifdef CONFIG_HOTPLUG_CPU
-extern struct task_struct *cpu_stopper_task;
 /* Run through task list and find tasks affined to the dead cpu, then remove
  * that cpu from the list, enable cpu0 and set the zerobound flag. */
 static void bind_zero(int src_cpu)
 {
-	struct task_struct *p, *t, *stopper;
+	struct task_struct *p, *t;
 	int bound = 0;
 
 	if (src_cpu == 0)
 		return;
 
-	stopper = per_cpu(cpu_stopper_task, src_cpu);
 	do_each_thread(t, p) {
-		if (p != stopper && cpumask_test_cpu(src_cpu, tsk_cpus_allowed(p))) {
+		if (cpumask_test_cpu(src_cpu, tsk_cpus_allowed(p))) {
 			cpumask_clear_cpu(src_cpu, tsk_cpus_allowed(p));
 			cpumask_set_cpu(0, tsk_cpus_allowed(p));
 			p->zerobound = true;
@@ -5522,20 +5655,19 @@ static void register_sched_domain_sysctl
 /* may be called multiple times per register */
 static void unregister_sched_domain_sysctl(void)
 {
-	if (sd_sysctl_header)
-		unregister_sysctl_table(sd_sysctl_header);
+	unregister_sysctl_table(sd_sysctl_header);
 	sd_sysctl_header = NULL;
 	if (sd_ctl_dir[0].child)
 		sd_free_ctl_entry(&sd_ctl_dir[0].child);
 }
-#else
+#else /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
 static void register_sched_domain_sysctl(void)
 {
 }
 static void unregister_sched_domain_sysctl(void)
 {
 }
-#endif
+#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
 
 static void set_rq_online(struct rq *rq)
 {
@@ -5623,6 +5755,16 @@ static int sched_cpu_active(struct notif
 				      unsigned long action, void *hcpu)
 {
 	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_STARTING:
+		return NOTIFY_OK;
+	case CPU_ONLINE:
+		/*
+		 * At this point a starting CPU has marked itself as online via
+		 * set_cpu_online(). But it might not yet have marked itself
+		 * as active, which is essential from here on.
+		 *
+		 * Thus, fall-through and help the starting CPU along.
+		 */
 	case CPU_DOWN_FAILED:
 		set_cpu_active((long)hcpu, true);
 		return NOTIFY_OK;
@@ -5661,9 +5803,6 @@ int __init migration_init(void)
 	return 0;
 }
 early_initcall(migration_init);
-#endif
-
-#ifdef CONFIG_SMP
 
 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
@@ -5941,9 +6080,6 @@ cpu_attach_domain(struct sched_domain *s
 		destroy_sched_domains(tmp, cpu);
 }
 
-/* cpus with isolated domains */
-cpumask_var_t cpu_isolated_map;
-
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
 {
@@ -6451,7 +6587,7 @@ static int __sdt_alloc(const struct cpum
 		for_each_cpu(j, cpu_map) {
 			struct sched_domain *sd;
 
-			sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+			sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
 					GFP_KERNEL, cpu_to_node(j));
 			if (!sd)
 				return -ENOMEM;
@@ -6835,7 +6971,7 @@ static bool sole_cpu_idle(int cpu)
 #ifdef CONFIG_SCHED_SMT
 static const cpumask_t *thread_cpumask(int cpu)
 {
-	return topology_thread_cpumask(cpu);
+	return topology_sibling_cpumask(cpu);
 }
 /* All this CPU's SMT siblings are idle */
 static bool siblings_cpu_idle(int cpu)
@@ -6899,6 +7035,7 @@ void __init sched_init_smp(void)
 		BUG();
 	free_cpumask_var(non_isolated_cpus);
 
+	mutex_lock(&sched_domains_mutex);
 	grq_lock_irq();
 	/*
 	 * Set up the relative cache distance of each online cpu from each
@@ -6943,6 +7080,7 @@ void __init sched_init_smp(void)
 #endif
 	}
 	grq_unlock_irq();
+	mutex_unlock(&sched_domains_mutex);
 
 	for_each_online_cpu(cpu) {
 		struct rq *rq = cpu_rq(cpu);
@@ -6959,8 +7097,6 @@ void __init sched_init_smp(void)
 }
 #endif /* CONFIG_SMP */
 
-unsigned int sysctl_timer_migration = 1;
-
 int in_sched_functions(unsigned long addr)
 {
 	return in_lock_functions(addr) ||
@@ -7137,7 +7273,7 @@ EXPORT_SYMBOL(___might_sleep);
 #endif
 
 #ifdef CONFIG_MAGIC_SYSRQ
-void normalize_rt_tasks(void)
+static inline void normalise_rt_tasks(void)
 {
 	struct task_struct *g, *p;
 	unsigned long flags;
@@ -7146,6 +7282,12 @@ void normalize_rt_tasks(void)
 
 	read_lock(&tasklist_lock);
 	for_each_process_thread(g, p) {
+		/*
+		 * Only normalize user tasks:
+		 */
+		if (p->flags & PF_KTHREAD)
+			continue;
+
 		if (!rt_task(p) && !iso_task(p))
 			continue;
@@ -7163,6 +7305,11 @@ void normalize_rt_tasks(void)
 	}
 	read_unlock(&tasklist_lock);
 }
+
+void normalize_rt_tasks(void)
+{
+	normalise_rt_tasks();
+}
 #endif /* CONFIG_MAGIC_SYSRQ */
 
 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
@@ -7308,7 +7455,7 @@ drop_precision:
  * runtime accounting.
  */
 static void cputime_adjust(struct task_cputime *curr,
-			   struct cputime *prev,
+			   struct prev_cputime *prev,
 			   cputime_t *ut, cputime_t *st)
 {
 	cputime_t rtime, stime, utime, total;

Index: linux-4.3-bfs/kernel/sched/bfs_sched.h
===================================================================
--- linux-4.3-bfs.orig/kernel/sched/bfs_sched.h	2015-11-12 13:27:12.849609684 +1100
+++ linux-4.3-bfs/kernel/sched/bfs_sched.h	2015-11-12 15:13:49.242155354 +1100
@@ -110,7 +110,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq
 
 static inline u64 __rq_clock_broken(struct rq *rq)
 {
-	return ACCESS_ONCE(rq->clock);
+	return READ_ONCE(rq->clock);
 }
 
 static inline u64 rq_clock(struct rq *rq)
@@ -125,6 +125,8 @@ static inline u64 rq_clock_task(struct r
 	return rq->clock_task;
 }
 
+extern struct mutex sched_domains_mutex;
+
 #define rcu_dereference_check_sched_domain(p) \
 	rcu_dereference_check((p), \
 			      lockdep_is_held(&sched_domains_mutex))
@@ -146,6 +148,12 @@ static inline int task_on_rq_queued(stru
 	return p->on_rq;
 }
 
+#ifdef CONFIG_SMP
+
+extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
+
+#endif
+
 #ifdef CONFIG_CPU_IDLE
 static inline void idle_set_state(struct rq *rq,
 				  struct cpuidle_state *idle_state)

Index: linux-4.3-bfs/arch/x86/Kconfig
===================================================================
--- linux-4.3-bfs.orig/arch/x86/Kconfig	2015-11-12 13:27:12.849609684 +1100
+++ linux-4.3-bfs/arch/x86/Kconfig	2015-11-12 15:23:15.342061860 +1100
@@ -883,7 +883,7 @@ config SCHED_SMT
 
 config SMT_NICE
 	bool "SMT (Hyperthreading) aware nice priority and policy support"
-	depends on X86_HT && SCHED_BFS && SCHED_SMT
+	depends on SCHED_BFS && SCHED_SMT
 	default y
 	---help---
 	  Enabling Hyperthreading on Intel CPUs decreases the effectiveness

Index: linux-4.3-bfs/include/linux/init_task.h
===================================================================
--- linux-4.3-bfs.orig/include/linux/init_task.h	2015-11-12 13:27:12.843609716 +1100
+++ linux-4.3-bfs/include/linux/init_task.h	2015-11-12 16:16:01.620447464 +1100
@@ -254,6 +254,12 @@ extern struct task_group root_task_group
 	INIT_FTRACE_GRAPH						\
 	INIT_TRACE_RECURSION						\
 	INIT_TASK_RCU_PREEMPT(tsk)					\
+	INIT_TASK_RCU_TASKS(tsk)					\
+	INIT_CPUSET_SEQ(tsk)						\
+	INIT_RT_MUTEXES(tsk)						\
+	INIT_PREV_CPUTIME(tsk)						\
+	INIT_VTIME(tsk)							\
+	INIT_NUMA_BALANCING(tsk)					\
 	INIT_KASAN(tsk)							\
 }
 #else /* CONFIG_SCHED_BFS */
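The wake_q_add()/wake_up_q() pair that this patch brings into bfs.c implements lockless wake-queues: tasks are queued for wakeup while a lock is held, and the actual wakeups are issued only after the lock is dropped. Below is a minimal sketch of the intended calling pattern, modelled on the upstream futex/rtmutex callers; the example_* names and the surrounding waiter list are illustrative assumptions, not part of this patch.

	/* Sketch only: deferring wakeups with the lockless wake-queue helpers. */
	#include <linux/list.h>
	#include <linux/sched.h>
	#include <linux/spinlock.h>

	struct example_waiter {
		struct list_head list;
		struct task_struct *task;
	};

	static DEFINE_SPINLOCK(example_lock);
	static LIST_HEAD(example_waiters);

	static void example_wake_all(void)
	{
		WAKE_Q(wake_q);			/* on-stack, context-local queue head */
		struct example_waiter *w, *tmp;

		spin_lock(&example_lock);
		list_for_each_entry_safe(w, tmp, &example_waiters, list) {
			/* Queues the task and takes a reference; no wakeup yet. */
			wake_q_add(&wake_q, w->task);
			list_del(&w->list);
		}
		spin_unlock(&example_lock);

		/* Issue the deferred wakeups only after the lock is dropped. */
		wake_up_q(&wake_q);
	}

Keeping wake_up_process() calls out of the locked region is the whole point: a woken task that immediately tries to take example_lock no longer bounces off the waker.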
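The new comment on single_task_running() stresses that the answer can be stale unless the caller is non-preemptible, CPU-bound, or simply tolerant of the race. A hedged sketch of the "short polling loop" case it mentions (roughly how kvm_vcpu_block() uses it upstream); example_work_done() and example_poll_hw() are hypothetical placeholders, not kernel APIs.

	/* Sketch only: tolerating a stale single_task_running() answer while polling. */
	#include <linux/sched.h>

	static bool example_work_done(void) { return true; }	/* placeholder */
	static void example_poll_hw(void) { }			/* placeholder */

	static void example_poll(void)
	{
		while (!example_work_done()) {
			if (single_task_running())
				example_poll_hw();	/* CPU otherwise idle: keep spinning */
			else
				cond_resched();		/* another task appeared: yield */
		}
	}

A wrong answer here costs at most one extra poll or one unnecessary reschedule, which is why the comment allows this usage without pinning or disabling preemption.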
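With the static-key gate added to the preempt-notifier paths, fire_sched_{in,out}_preempt_notifiers() stay effectively free on the context-switch fast path until someone arms the key via preempt_notifier_inc(). The sketch below shows how a user would plausibly pair inc/dec with register/unregister, mirroring what upstream did for KVM; the example_* ops and attach/detach helpers are illustrative only, and preempt_notifier_register() must be called on the task being watched (current).

	/* Sketch only: arming the static key around a registered preempt notifier. */
	#include <linux/preempt.h>
	#include <linux/sched.h>

	static void example_sched_in(struct preempt_notifier *pn, int cpu)
	{
		/* current is being scheduled back in on 'cpu' */
	}

	static void example_sched_out(struct preempt_notifier *pn,
				      struct task_struct *next)
	{
		/* current is being preempted in favour of 'next' */
	}

	static struct preempt_ops example_ops = {
		.sched_in	= example_sched_in,
		.sched_out	= example_sched_out,
	};

	static void example_attach(struct preempt_notifier *pn)
	{
		preempt_notifier_inc();			/* arm the static-key gate */
		preempt_notifier_init(pn, &example_ops);
		preempt_notifier_register(pn);		/* registers against current */
	}

	static void example_detach(struct preempt_notifier *pn)
	{
		preempt_notifier_unregister(pn);
		preempt_notifier_dec();			/* disarm when the last user goes away */
	}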