---
 include/linux/sched.h |    3 -
 kernel/sched/bfs.c    |   89 ++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 76 insertions(+), 16 deletions(-)

Index: linux-3.17.2-bfsdev/include/linux/sched.h
===================================================================
--- linux-3.17.2-bfsdev.orig/include/linux/sched.h	2014-11-11 10:14:40.900721059 +1100
+++ linux-3.17.2-bfsdev/include/linux/sched.h	2014-11-11 10:35:54.776798652 +1100
@@ -1237,10 +1237,7 @@ struct task_struct {
 	int wake_cpu;
 #endif
 
-#ifndef CONFIG_SCHED_BFS
 	int on_rq;
-#endif
-
 	int prio, static_prio, normal_prio;
 	unsigned int rt_priority;
 #ifdef CONFIG_SCHED_BFS
Index: linux-3.17.2-bfsdev/kernel/sched/bfs.c
===================================================================
--- linux-3.17.2-bfsdev.orig/kernel/sched/bfs.c	2014-11-11 10:14:40.903721059 +1100
+++ linux-3.17.2-bfsdev/kernel/sched/bfs.c	2014-11-11 11:12:16.466931540 +1100
@@ -372,6 +372,8 @@ static inline void update_rq_clock(struc
 {
 	s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
 
+	if (unlikely(delta < 0))
+		return;
 	rq->clock += delta;
 	update_rq_clock_task(rq, delta);
 }
@@ -669,6 +671,13 @@ static inline int task_timeslice(struct
 	return (rr_interval * task_prio_ratio(p) / 128);
 }
 
+static void resched_task(struct task_struct *p);
+
+static inline void resched_curr(struct rq *rq)
+{
+	resched_task(rq->curr);
+}
+
 #ifdef CONFIG_SMP
 /*
  * qnr is the "queued but not running" count which is the total number of
@@ -726,7 +735,6 @@ static bool suitable_idle_cpus(struct ta
 #define CPUIDLE_THROTTLED	(32)
 #define CPUIDLE_DIFF_NODE	(64)
 
-static void resched_task(struct task_struct *p);
 static inline bool scaling_rq(struct rq *rq);
 
 /*
@@ -796,7 +804,7 @@ out:
 static void resched_best_mask(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
 {
 	best_cpu = best_mask_cpu(best_cpu, rq, tmpmask);
-	resched_task(cpu_rq(best_cpu)->curr);
+	resched_curr(cpu_rq(best_cpu));
 }
 
 bool cpus_share_cache(int this_cpu, int that_cpu)
@@ -885,7 +893,7 @@ static bool resched_best_idle(struct tas
 	if (!smt_should_schedule(p, best_cpu))
 		return false;
 #endif
-	resched_task(cpu_rq(best_cpu)->curr);
+	resched_curr(cpu_rq(best_cpu));
 	return true;
 }
 
@@ -1281,7 +1289,7 @@ unsigned long wait_task_inactive(struct
 		rq = task_grq_lock(p, &flags);
 		trace_sched_wait_task(p);
 		running = task_running(p);
-		on_rq = task_queued(p);
+		on_rq = p->on_rq;
 		ncsw = 0;
 		if (!match_state || p->state == match_state)
 			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
@@ -1464,7 +1472,7 @@ static void try_preempt(struct task_stru
 			return;
 #endif
 		if (can_preempt(p, highest_prio, highest_prio_rq->rq_deadline))
-			resched_task(highest_prio_rq->curr);
+			resched_curr(highest_prio_rq);
 	}
 }
 #else /* CONFIG_SMP */
@@ -1478,7 +1486,7 @@ static void try_preempt(struct task_stru
 	if (p->policy == SCHED_IDLEPRIO)
 		return;
 	if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline))
-		resched_task(uprq->curr);
+		resched_curr(uprq);
 }
 #endif /* CONFIG_SMP */
 
@@ -1528,6 +1536,7 @@ static inline void ttwu_activate(struct
 				 bool is_sync)
 {
 	activate_task(p, rq);
+	p->on_rq = 1;
 
 	/*
 	 * Sync wakeups (i.e. those types of wakeups where the waker
@@ -1696,6 +1705,7 @@ int sched_fork(unsigned long __maybe_unu
 	 */
 
 	/* Should be reset in fork.c but done here for ease of bfs patching */
+	p->on_rq =
 	p->utime =
 	p->stime =
 	p->utimescaled =
@@ -1771,6 +1781,7 @@ void wake_up_new_task(struct task_struct
 		p->prio = rq->curr->normal_prio;
 
 	activate_task(p, rq);
+	p->on_rq = 1;
 	trace_sched_wakeup_new(p, 1);
 	if (unlikely(p->policy == SCHED_FIFO))
 		goto after_ts_init;
@@ -2079,10 +2090,17 @@ unsigned long nr_active(void)
 }
 
 /* Beyond a task running on this CPU, load is equal everywhere on BFS */
-unsigned long this_cpu_load(void)
+static inline unsigned long cpu_load(struct rq *rq)
 {
-	return this_rq()->rq_running +
-		((queued_notrunning() + nr_uninterruptible()) / grq.noc);
+	return rq->rq_running + ((queued_notrunning() + nr_uninterruptible()) / grq.noc);
+}
+
+void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
+{
+	struct rq *this = this_rq();
+
+	*nr_waiters = atomic_read(&this->nr_iowait);
+	*load = cpu_load(this);
 }
 
 /* Variables and functions for calc_load */
@@ -2598,7 +2616,12 @@ static u64 do_task_delta_exec(struct tas
 {
 	u64 ns = 0;
 
-	if (p == rq->curr) {
+	/*
+	 * Must be ->curr _and_ ->on_rq.  If dequeued, we would
+	 * project cycles that may never be accounted to this
+	 * thread, breaking clock_gettime().
+	 */
+	if (p == rq->curr && p->on_rq) {
 		update_clocks(rq);
 		ns = rq->clock_task - rq->rq_last_ran;
 		if (unlikely((s64)ns < 0))
@@ -2633,6 +2656,22 @@ unsigned long long do_task_sched_runtime
 	struct rq *rq;
 	u64 ns;
 
+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
+	/*
+	 * 64-bit doesn't need locks to atomically read a 64bit value.
+	 * So we have a optimisation chance when the task's delta_exec is 0.
+	 * Reading ->on_cpu is racy, but this is ok.
+	 *
+	 * If we race with it leaving cpu, we'll take a lock. So we're correct.
+	 * If we race with it entering cpu, unaccounted time is 0. This is
+	 * indistinguishable from the read occurring a few cycles earlier.
+	 * If we see ->on_cpu without ->on_rq, the task is leaving, and has
+	 * been accounted, so we're correct here as well.
+	 */
+	if (!p->on_cpu || !p->on_rq)
+		return p->sched_time;
+#endif
+
 	rq = task_rq(p);
 	ns = p->sched_time + do_task_delta_exec(p,rq);
 
@@ -2660,8 +2699,10 @@ unsigned long long task_sched_runtime(st
 	 * If we race with it leaving cpu, we'll take a lock. So we're correct.
 	 * If we race with it entering cpu, unaccounted time is 0. This is
	 * indistinguishable from the read occurring a few cycles earlier.
+	 * If we see ->on_cpu without ->on_rq, the task is leaving, and has
+	 * been accounted, so we're correct here as well.
 	 */
-	if (!p->on_cpu)
+	if (!p->on_cpu || !p->on_rq)
 		return tsk_seruntime(p);
 #endif
 
@@ -3437,6 +3478,8 @@ need_resched:
 			prev->state = TASK_RUNNING;
 		} else {
 			deactivate = true;
+			prev->on_rq = 0;
+
 			/*
 			 * If a worker is going to sleep, notify and
 			 * ask workqueue whether it wants to wake up a
@@ -4254,6 +4297,12 @@ asmlinkage long sys_sched_setscheduler(p
 	return do_sched_setscheduler(pid, policy, param);
 }
 
+/*
+ * sched_setparam() passes in -1 for its policy, to let the functions
+ * it calls know not to change it.
+ */
+#define SETPARAM_POLICY	-1
+
 /**
  * sys_sched_setparam - set/change the RT priority of a thread
 * @pid: the pid in question.
@@ -4263,7 +4312,7 @@ asmlinkage long sys_sched_setscheduler(p
 */
 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
 {
-	return do_sched_setscheduler(pid, -1, param);
+	return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
 }
 
 /**
@@ -4785,7 +4834,7 @@ int __sched yield_to(struct task_struct
 	if (p->time_slice > timeslice())
 		p->time_slice = timeslice();
 	if (preempt && rq != rq)
-		resched_task(p_rq->curr);
+		resched_curr(p_rq);
 
 out_unlock:
 	grq_unlock_irqrestore(&flags);
@@ -6468,6 +6517,20 @@ struct sched_domain *build_sched_domain(
 		sched_domain_level_max = max(sched_domain_level_max, sd->level);
 		child->parent = sd;
 		sd->child = child;
+
+		if (!cpumask_subset(sched_domain_span(child),
+				    sched_domain_span(sd))) {
+			pr_err("BUG: arch topology borken\n");
+#ifdef CONFIG_SCHED_DEBUG
+			pr_err("     the %s domain not a subset of the %s domain\n",
+					child->name, sd->name);
+#endif
+			/* Fixup, ensure @sd has at least @child cpus. */
+			cpumask_or(sched_domain_span(sd),
+				   sched_domain_span(sd),
+				   sched_domain_span(child));
+		}
+
 	}
 	set_domain_attribute(sd, attr);