Index: linux-4.7-bfs504/kernel/sched/bfs.c =================================================================== --- linux-4.7-bfs504.orig/kernel/sched/bfs.c 2016-09-27 14:28:18.223173282 +1000 +++ linux-4.7-bfs504/kernel/sched/bfs.c 2016-10-02 03:30:00.888158740 +1100 @@ -115,7 +115,7 @@ #define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) -#define ISO_PERIOD ((5 * HZ * grq.noc) + 1) +#define ISO_PERIOD ((5 * HZ * num_online_cpus()) + 1) #define SCHED_PRIO(p) ((p) + MAX_RT_PRIO) #define STOP_PRIO (MAX_RT_PRIO - 1) @@ -137,7 +137,7 @@ void print_scheduler_version(void) { - printk(KERN_INFO "BFS CPU scheduler v0.502 by Con Kolivas.\n"); + printk(KERN_INFO "MuQSS CPU scheduler v0.102 by Con Kolivas.\n"); } /* @@ -174,30 +174,21 @@ static inline int timeslice(void) } /* - * The global runqueue data that all CPUs work off. Data is protected either - * by the global grq lock, or the discrete lock that precedes the data in this - * struct. + * The global runqueue data that all CPUs work off. Contains either atomic + * variables or iso variables protected by iso_lock. */ struct global_rq { - raw_spinlock_t lock; - unsigned long nr_running; - unsigned long nr_uninterruptible; - unsigned long long nr_switches; - unsigned long qnr; /* queued not running */ + atomic_t nr_running; + atomic_t nr_uninterruptible; + atomic64_t nr_switches; + atomic_t qnr; /* queued not running */ #ifdef CONFIG_SMP cpumask_t cpu_idle_map; bool idle_cpus; #endif - int noc; /* num_online_cpus stored and updated when it changes */ - u64 niffies; /* Nanosecond jiffies */ - unsigned long last_jiffy; /* Last jiffy we updated niffies */ - raw_spinlock_t iso_lock; int iso_ticks; bool iso_refractory; - - skiplist_node node; - skiplist *sl; }; #ifdef CONFIG_SMP @@ -296,10 +287,16 @@ static inline int cpu_of(struct rq *rq) { return rq->cpu; } +#else /* CONFIG_SMP */ +static inline int cpu_of(struct rq *rq) +{ + return 0; +} +#endif /* * Niffies are a globally increasing nanosecond counter. Whenever a runqueue - * clock is updated with the grq.lock held, it is an opportunity to update the + * clock is updated with the rq->lock held, it is an opportunity to update the * niffies value. Any CPU can update it by adding how much its clock has * increased since it last updated niffies, minus any added niffies by other * CPUs. @@ -311,35 +308,12 @@ static inline void update_clocks(struct update_rq_clock(rq); ndiff = rq->clock - rq->old_clock; - /* old_clock is only updated when we are updating niffies */ - rq->old_clock = rq->clock; - ndiff -= grq.niffies - rq->last_niffy; - jdiff = jiffies - grq.last_jiffy; - niffy_diff(&ndiff, jdiff); - grq.last_jiffy += jdiff; - grq.niffies += ndiff; - rq->last_niffy = grq.niffies; -} -#else /* CONFIG_SMP */ -static inline int cpu_of(struct rq *rq) -{ - return 0; -} - -static inline void update_clocks(struct rq *rq) -{ - s64 ndiff; - long jdiff; - - update_rq_clock(rq); - ndiff = rq->clock - rq->old_clock; rq->old_clock = rq->clock; - jdiff = jiffies - grq.last_jiffy; + jdiff = jiffies - rq->last_jiffy; niffy_diff(&ndiff, jdiff); - grq.last_jiffy += jdiff; - grq.niffies += ndiff; + rq->last_jiffy += jdiff; + rq->niffies += ndiff; } -#endif #include "stats.h" @@ -354,10 +328,10 @@ static inline void update_clocks(struct #endif /* - * All common locking functions performed on grq.lock. rq->clock is local to + * All common locking functions performed on rq->lock. rq->clock is local to * the CPU accessing it so it can be modified just with interrupts disabled * when we're not updating niffies. 
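/*
 * Editor's illustrative sketch (not part of the patch): a userspace model of
 * the per-runqueue niffies bookkeeping that the update_clocks() hunk above
 * introduces once grq.niffies/grq.last_jiffy move into struct rq.  The clamp
 * below merely stands in for what niffy_diff() does in the real code; the
 * toy struct, TOY_HZ and the nanoseconds-per-jiffy constant are assumptions
 * of this example, not values taken from the patch.
 */
#include <stdint.h>
#include <stdio.h>

#define TOY_HZ		250
#define TOY_JIFFY_NS	(1000000000ULL / TOY_HZ)

struct toy_rq {
	uint64_t clock;		/* latest sched clock reading, ns */
	uint64_t old_clock;	/* clock value at the last niffies update */
	uint64_t niffies;	/* monotonic per-rq nanosecond counter */
	unsigned long last_jiffy;
};

static void toy_update_clocks(struct toy_rq *rq, uint64_t now_ns,
			      unsigned long now_jiffies)
{
	int64_t ndiff = (int64_t)(now_ns - rq->old_clock);
	long jdiff = (long)(now_jiffies - rq->last_jiffy);
	uint64_t cap;

	rq->clock = rq->old_clock = now_ns;
	if (jdiff < 0)
		jdiff = 0;
	/* Keep the nanosecond delta roughly consistent with elapsed jiffies,
	 * in the spirit of niffy_diff(), so a wild clock cannot run away. */
	cap = (uint64_t)(jdiff + 1) * TOY_JIFFY_NS;
	if (ndiff < 0)
		ndiff = 0;
	if ((uint64_t)ndiff > cap)
		ndiff = (int64_t)cap;
	rq->last_jiffy += jdiff;
	rq->niffies += ndiff;
}

int main(void)
{
	struct toy_rq rq = { 0 };

	toy_update_clocks(&rq, 4000000ULL, 1);	/* 4ms elapsed, one jiffy */
	printf("niffies advanced to %llu ns\n", (unsigned long long)rq.niffies);
	return 0;
}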
- * Looking up task_rq must be done under grq.lock to be safe. + * Looking up task_rq must be done under rq->lock to be safe. */ static void update_rq_clock_task(struct rq *rq, s64 delta); @@ -376,129 +350,246 @@ static inline bool task_running(struct t return p->on_cpu; } -static inline void grq_lock(void) - __acquires(grq.lock) +static inline void rq_lock(struct rq *rq) + __acquires(rq->lock) { - raw_spin_lock(&grq.lock); + raw_spin_lock(&rq->lock); } -static inline void grq_unlock(void) - __releases(grq.lock) +static inline int rq_trylock(struct rq *rq) + __acquires(rq->lock) { - raw_spin_unlock(&grq.lock); + return raw_spin_trylock(&rq->lock); } -static inline void grq_lock_irq(void) - __acquires(grq.lock) +static inline void rq_unlock(struct rq *rq) + __releases(rq->lock) { - raw_spin_lock_irq(&grq.lock); + raw_spin_unlock(&rq->lock); } -static inline void time_lock_grq(struct rq *rq) - __acquires(grq.lock) +static inline struct rq *this_rq_lock(void) + __acquires(rq->lock) { - grq_lock(); - update_clocks(rq); + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + raw_spin_lock(&rq->lock); + + return rq; } -static inline void grq_unlock_irq(void) - __releases(grq.lock) +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ + +/* For when we know rq1 != rq2 */ +static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) { - raw_spin_unlock_irq(&grq.lock); + if (rq1 < rq2) { + raw_spin_lock(&rq1->lock); + raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); + } else { + raw_spin_lock(&rq2->lock); + raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); + } } -static inline void grq_lock_irqsave(unsigned long *flags) - __acquires(grq.lock) +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) { - raw_spin_lock_irqsave(&grq.lock, *flags); + BUG_ON(!irqs_disabled()); + if (rq1 == rq2) { + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ + } else + __double_rq_lock(rq1, rq2); } -static inline void grq_unlock_irqrestore(unsigned long *flags) - __releases(grq.lock) +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. 
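/*
 * Editor's illustrative sketch (not part of the patch): a runnable userspace
 * analogue of the ordering trick in __double_rq_lock() above.  When two
 * runqueue locks must be held at once, the lower-addressed one is always
 * taken first, so two CPUs locking the same pair in opposite argument order
 * cannot ABBA-deadlock.  pthread mutexes stand in for raw spinlocks here;
 * struct toy_rq is an assumption of the example.
 */
#include <pthread.h>
#include <stdio.h>

struct toy_rq {
	pthread_mutex_t lock;
	int cpu;
};

static void toy_double_lock(struct toy_rq *rq1, struct toy_rq *rq2)
{
	if (rq1 == rq2) {		/* same rq: one lock is enough */
		pthread_mutex_lock(&rq1->lock);
		return;
	}
	if (rq1 < rq2) {		/* consistent global order: by address */
		pthread_mutex_lock(&rq1->lock);
		pthread_mutex_lock(&rq2->lock);
	} else {
		pthread_mutex_lock(&rq2->lock);
		pthread_mutex_lock(&rq1->lock);
	}
}

static void toy_double_unlock(struct toy_rq *rq1, struct toy_rq *rq2)
{
	pthread_mutex_unlock(&rq1->lock);
	if (rq1 != rq2)
		pthread_mutex_unlock(&rq2->lock);
}

int main(void)
{
	struct toy_rq a = { PTHREAD_MUTEX_INITIALIZER, 0 };
	struct toy_rq b = { PTHREAD_MUTEX_INITIALIZER, 1 };

	/* Both argument orders acquire the locks in the same (address) order. */
	toy_double_lock(&a, &b);
	toy_double_unlock(&a, &b);
	toy_double_lock(&b, &a);
	toy_double_unlock(&b, &a);
	printf("locked both orders without deadlock\n");
	return 0;
}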
+ */ +static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) { - raw_spin_unlock_irqrestore(&grq.lock, *flags); + raw_spin_unlock(&rq1->lock); + if (rq1 != rq2) + raw_spin_unlock(&rq2->lock); + else + __release(rq2->lock); } -static inline struct rq -*task_grq_lock(struct task_struct *p, unsigned long *flags) - __acquires(grq.lock) +/* Must be sure rq1 != rq2 and irqs are disabled */ +static inline void lock_second_rq(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __acquires(rq1->lock) + __acquires(rq2->lock) { - grq_lock_irqsave(flags); - return task_rq(p); + BUG_ON(!irqs_disabled()); + if (unlikely(!raw_spin_trylock(&rq2->lock))) { + raw_spin_unlock(&rq1->lock); + __double_rq_lock(rq1, rq2); + } } -static inline struct rq -*time_task_grq_lock(struct task_struct *p, unsigned long *flags) - __acquires(grq.lock) +static inline void lock_all_rqs(void) { - struct rq *rq = task_grq_lock(p, flags); - update_clocks(rq); - return rq; + int cpu; + + preempt_disable(); + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + do_raw_spin_lock(&rq->lock); + } } -static inline struct rq *task_grq_lock_irq(struct task_struct *p) - __acquires(grq.lock) +static inline void unlock_all_rqs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + do_raw_spin_unlock(&rq->lock); + } + preempt_enable(); +} + +/* + * Lock this_rq and as many rqs as we can grab with trylock, returning which + * rqs are locked in a bitmask. + */ +static inline void lock_rqs(struct rq *this_rq, cpumask_t *mask) { - grq_lock_irq(); - return task_rq(p); + int cpu; + + cpumask_clear(mask); + + for_each_online_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + if (rq != this_rq) { + if (!do_raw_spin_trylock(&rq->lock)) + continue; + spin_acquire(&rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); + } + cpumask_set_cpu(cpu, mask); + } } -static inline void time_task_grq_lock_irq(struct task_struct *p) - __acquires(grq.lock) +/* Unlock all rqs in a CPU bitmask */ +static inline void unlock_rqs(struct rq *this_rq, cpumask_t *mask) { - struct rq *rq = task_grq_lock_irq(p); + int cpu; + + cpumask_clear_cpu(this_rq->cpu, mask); + + for_each_cpu(cpu, mask) { + struct rq *rq = cpu_rq(cpu); + + spin_release(&rq->lock.dep_map, 1, _RET_IP_); + do_raw_spin_unlock(&rq->lock); + } +} + +static inline void rq_lock_irq(struct rq *rq) + __acquires(rq->lock) +{ + raw_spin_lock_irq(&rq->lock); +} + +static inline void time_lock_rq(struct rq *rq) +{ + rq_lock(rq); update_clocks(rq); } -static inline void task_grq_unlock_irq(void) - __releases(grq.lock) +static inline void rq_unlock_irq(struct rq *rq) + __releases(rq->lock) { - grq_unlock_irq(); + raw_spin_unlock_irq(&rq->lock); } -static inline void task_grq_unlock(unsigned long *flags) - __releases(grq.lock) +static inline void rq_lock_irqsave(struct rq *rq, unsigned long *flags) + __acquires(rq->lock) { - grq_unlock_irqrestore(flags); + raw_spin_lock_irqsave(&rq->lock, *flags); } -/** - * grunqueue_is_locked - * - * Returns true if the global runqueue is locked. - * This interface allows printk to be called with the runqueue lock - * held and know whether or not it is OK to wake up the klogd. 
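/*
 * Editor's illustrative sketch (not part of the patch): the opportunistic
 * locking pattern used by lock_rqs()/unlock_rqs() above, modelled in
 * userspace with pthreads.  The local runqueue lock is taken unconditionally,
 * every other one only via trylock, and a bitmask records which locks were
 * actually obtained so only those runqueues are examined and later released.
 * NR_TOY_CPUS and the uint32_t mask are assumptions of the example.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define NR_TOY_CPUS 8

static pthread_mutex_t toy_rq_lock[NR_TOY_CPUS];

static uint32_t toy_lock_rqs(int this_cpu)
{
	uint32_t locked = 0;
	int cpu;

	pthread_mutex_lock(&toy_rq_lock[this_cpu]);	/* own rq: always taken */
	locked |= 1u << this_cpu;

	for (cpu = 0; cpu < NR_TOY_CPUS; cpu++) {
		if (cpu == this_cpu)
			continue;
		/* Contended locks are simply skipped, never waited on. */
		if (pthread_mutex_trylock(&toy_rq_lock[cpu]) == 0)
			locked |= 1u << cpu;
	}
	return locked;
}

static void toy_unlock_rqs(uint32_t locked)
{
	int cpu;

	for (cpu = 0; cpu < NR_TOY_CPUS; cpu++)
		if (locked & (1u << cpu))
			pthread_mutex_unlock(&toy_rq_lock[cpu]);
}

int main(void)
{
	uint32_t mask;
	int cpu;

	for (cpu = 0; cpu < NR_TOY_CPUS; cpu++)
		pthread_mutex_init(&toy_rq_lock[cpu], NULL);

	mask = toy_lock_rqs(0);
	printf("locked rq mask: 0x%x\n", mask);
	toy_unlock_rqs(mask);
	return 0;
}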
- */ -bool grunqueue_is_locked(void) +static inline void rq_unlock_irqrestore(struct rq *rq, unsigned long *flags) + __releases(rq->lock) { - return raw_spin_is_locked(&grq.lock); + raw_spin_unlock_irqrestore(&rq->lock, *flags); } -void grq_unlock_wait(void) - __releases(grq.lock) +static inline struct rq +*task_rq_lock(struct task_struct *p, unsigned long *flags) + __acquires(p->pi_lock) + __acquires(rq->lock) { - smp_mb(); /* spin-unlock-wait is not a full memory barrier */ - raw_spin_unlock_wait(&grq.lock); + struct rq *rq; + + while (42) { + raw_spin_lock_irqsave(&p->pi_lock, *flags); + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + break; + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); + } + return rq; } -static inline void time_grq_lock(struct rq *rq, unsigned long *flags) - __acquires(grq.lock) +static inline struct rq +*time_task_rq_lock(struct task_struct *p, unsigned long *flags) { - local_irq_save(*flags); - time_lock_grq(rq); + struct rq *rq = task_rq_lock(p, flags); + + update_clocks(rq); + return rq; } -static inline struct rq *__task_grq_lock(struct task_struct *p) - __acquires(grq.lock) +static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) + __releases(rq->lock) + __releases(p->pi_lock) { - grq_lock(); - return task_rq(p); + rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); } -static inline void __task_grq_unlock(void) - __releases(grq.lock) +static inline struct rq *__task_rq_lock(struct task_struct *p) + __acquires(rq->lock) { - grq_unlock(); + struct rq *rq; + + lockdep_assert_held(&p->pi_lock); + + while (42) { + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + break; + raw_spin_unlock(&rq->lock); + } + return rq; +} + +static inline void __task_rq_unlock(struct rq *rq) +{ + rq_unlock(rq); } static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) @@ -509,16 +600,16 @@ static inline void finish_lock_switch(st { #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ - grq.lock.owner = current; + rq->lock.owner = current; #endif /* * If we are tracking spinlock dependencies then we have to * fix up the runqueue lock - which gets 'carried over' from * prev into current: */ - spin_acquire(&grq.lock.dep_map, 0, 0, _THIS_IP_); + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - grq_unlock_irq(); + raw_spin_unlock_irq(&rq->lock); } static inline bool deadline_before(u64 deadline, u64 time) @@ -532,6 +623,40 @@ static inline bool deadline_after(u64 de } /* + * Deadline is "now" in niffies + (offset by priority). Setting the deadline + * is the key to everything. It distributes cpu fairly amongst tasks of the + * same nice value, it proportions cpu according to nice level, it means the + * task that last woke up the longest ago has the earliest deadline, thus + * ensuring that interactive tasks get low latency on wake up. The CPU + * proportion works out to the square of the virtual deadline difference, so + * this equation will give nice 19 3% CPU compared to nice 0. 
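/*
 * Editor's illustrative sketch (not part of the patch): the lock-and-
 * revalidate loop in task_rq_lock() above.  Because a task may migrate
 * between reading task_rq(p) and acquiring that runqueue's lock, the lock is
 * taken and the runqueue re-read; only if it still matches is the lock kept,
 * otherwise both locks are dropped and the lookup retried.  This is a
 * userspace analogue with pthreads; toy_task/toy_rq are assumptions of the
 * example, not types from the patch.
 */
#include <pthread.h>
#include <stdio.h>

struct toy_rq {
	pthread_mutex_t lock;
};

struct toy_task {
	pthread_mutex_t pi_lock;
	struct toy_rq *volatile rq;	/* changed under both locks on migration */
};

static struct toy_rq *toy_task_rq_lock(struct toy_task *p)
{
	struct toy_rq *rq;

	for (;;) {
		pthread_mutex_lock(&p->pi_lock);
		rq = p->rq;
		pthread_mutex_lock(&rq->lock);
		if (rq == p->rq)
			return rq;	/* still on the rq we locked */
		/* The task migrated between the read and the lock: retry. */
		pthread_mutex_unlock(&rq->lock);
		pthread_mutex_unlock(&p->pi_lock);
	}
}

static void toy_task_rq_unlock(struct toy_task *p, struct toy_rq *rq)
{
	pthread_mutex_unlock(&rq->lock);
	pthread_mutex_unlock(&p->pi_lock);
}

int main(void)
{
	struct toy_rq rq = { PTHREAD_MUTEX_INITIALIZER };
	struct toy_task t = { PTHREAD_MUTEX_INITIALIZER, &rq };
	struct toy_rq *locked = toy_task_rq_lock(&t);

	printf("task's runqueue locked: %p\n", (void *)locked);
	toy_task_rq_unlock(&t, locked);
	return 0;
}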
+ */ +static inline u64 prio_deadline_diff(int user_prio) +{ + return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); +} + +static inline u64 task_deadline_diff(struct task_struct *p) +{ + return prio_deadline_diff(TASK_USER_PRIO(p)); +} + +static inline u64 static_deadline_diff(int static_prio) +{ + return prio_deadline_diff(USER_PRIO(static_prio)); +} + +static inline int longest_deadline_diff(void) +{ + return prio_deadline_diff(39); +} + +static inline int ms_longest_deadline_diff(void) +{ + return NS_TO_MS(longest_deadline_diff()); +} + +/* * A task that is not running or queued will not have a node set. * A task that is queued but not running will have a node set. * A task that is currently running will have ->on_cpu set but no node set. @@ -541,17 +666,53 @@ static inline bool task_queued(struct ta return !skiplist_node_empty(&p->node); } +static unsigned long rq_load_avg(struct rq *rq) +{ + return rq->sl->entries * SCHED_CAPACITY_SCALE; +} + /* - * Removing from the global runqueue. Enter with grq locked. Deleting a task + * Update the load average for feeding into cpu frequency governors. Use a + * rough estimate of a rolling average with ~ time constant of 32ms. + * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144 + */ +static void update_load_avg(struct rq *rq) +{ + /* rq clock can go backwards so skip update if that happens */ + if (likely(rq->clock > rq->load_update)) { + unsigned long us_interval = (rq->clock - rq->load_update) >> 10; + long load; + + load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); + if (unlikely(load < 0)) + load = 0; + load += rq->sl->entries * rq_load_avg(rq) * us_interval * 5 / 262144; + rq->load_avg = load; + } + rq->load_update = rq->clock; +} + +/* + * Removing from the runqueue. Enter with rq locked. Deleting a task * from the skip list is done via the stored node reference in the task struct * and does not require a full look up. Thus it occurs in O(k) time where k - * is the "level" of the list the task was stored at - usually < 4, max 16. + * is the "level" of the list the task was stored at - usually < 4, max 8. */ -static void dequeue_task(struct task_struct *p) +static void dequeue_task(struct task_struct *p, struct rq *rq) { - skiplist_delete(grq.sl, &p->node); + skiplist_delete(rq->sl, &p->node); sched_info_dequeued(task_rq(p), p); + update_load_avg(rq); +} + +#ifdef CONFIG_PREEMPT_RCU +static bool rcu_read_critical(struct task_struct *p) +{ + return p->rcu_read_unlock_special.b.blocked; } +#else /* CONFIG_PREEMPT_RCU */ +#define rcu_read_critical(p) (false) +#endif /* CONFIG_PREEMPT_RCU */ /* * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as @@ -559,8 +720,8 @@ static void dequeue_task(struct task_str */ static bool idleprio_suitable(struct task_struct *p) { - return (!freezing(p) && !signal_pending(p) && - !(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING))); + return (!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)) && + !signal_pending(p) && !rcu_read_critical(p) && !freezing(p)); } /* @@ -573,7 +734,18 @@ static bool isoprio_suitable(void) } /* - * Adding to the global runqueue. Enter with grq locked. + * Check to see if p can run on cpu, and if not, whether there are any online + * CPUs it can run on instead. + */ +static inline bool needs_other_cpu(struct task_struct *p, int cpu) +{ + if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed))) + return true; + return false; +} + +/* + * Adding to the runqueue. Enter with rq locked. 
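/*
 * Editor's illustrative sketch (not part of the patch): how the virtual
 * deadline offset described above scales with nice level.  prio_ratios[] is
 * rebuilt here the same way sched_init() does further down (each level ~10%
 * longer than the previous); the base value of 128 and an rr_interval of 6ms
 * are assumptions of this example.  Per the comment above, CPU share scales
 * as the square of the deadline difference, which is where the "nice 19 gets
 * roughly 3% of nice 0" figure comes from.
 */
#include <stdio.h>

#define NICE_WIDTH	40
#define MS_TO_NS(x)	((x) * 1000000ULL)

static unsigned long prio_ratios[NICE_WIDTH];
static const int rr_interval = 6;	/* assumed default, in ms */

static unsigned long long toy_prio_deadline_diff(int user_prio)
{
	return prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128);
}

int main(void)
{
	int i;

	prio_ratios[0] = 128;
	for (i = 1; i < NICE_WIDTH; i++)
		prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;

	/* user_prio 20 is nice 0, user_prio 39 is nice 19 */
	printf("nice  0 deadline offset: %llu ns\n", toy_prio_deadline_diff(20));
	printf("nice 19 deadline offset: %llu ns\n", toy_prio_deadline_diff(39));
	return 0;
}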
*/ static void enqueue_task(struct task_struct *p, struct rq *rq) { @@ -604,17 +776,22 @@ static void enqueue_task(struct task_str sl_id = p->prio; else { sl_id = p->deadline; - /* Set it to cope with 4 left shifts with locality_diff */ - if (p->prio == IDLE_PRIO) - sl_id |= 0x0F00000000000000; + if (idleprio_task(p)) { + /* Set it to cope with 4 left shifts with locality_diff */ + if (p->prio == IDLE_PRIO) + sl_id |= 0x00FF000000000000; + else + sl_id += longest_deadline_diff(); + } } /* * Some architectures don't have better than microsecond resolution * so mask out ~microseconds as the random seed for skiplist insertion. */ - randseed = (grq.niffies >> 10) & 0xFFFFFFFF; - skiplist_insert(grq.sl, &p->node, sl_id, p, randseed); + randseed = (rq->niffies >> 10) & 0xFFFFFFFF; + skiplist_insert(rq->sl, &p->node, sl_id, p, randseed); sched_info_queued(rq, p); + update_load_avg(rq); } static inline void requeue_task(struct task_struct *p) @@ -643,7 +820,7 @@ static inline int task_timeslice(struct static void resched_task(struct task_struct *p); -static inline void resched_curr(struct rq *rq) +static void resched_curr(struct rq *rq) { resched_task(rq->curr); } @@ -655,22 +832,17 @@ static inline void resched_curr(struct r */ static inline void inc_qnr(void) { - grq.qnr++; + atomic_inc(&grq.qnr); } static inline void dec_qnr(void) { - grq.qnr--; + atomic_dec(&grq.qnr); } static inline int queued_notrunning(void) { - return grq.qnr; -} - -static unsigned long rq_load_avg(struct rq *rq) -{ - return rq->soft_affined * SCHED_CAPACITY_SCALE; + return atomic_read(&grq.qnr); } #ifdef CONFIG_SMT_NICE @@ -749,20 +921,33 @@ static bool smt_should_schedule(struct t #define smt_schedule(p, this_rq) (true) #endif /* CONFIG_SMT_NICE */ #ifdef CONFIG_SMP + +static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask) +{ + set_bit(cpu, (volatile unsigned long *)cpumask); +} + /* * The cpu_idle_map stores a bitmap of all the CPUs currently idle to * allow easy lookup of whether any suitable idle CPUs are available. * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the - * idle_cpus variable than to do a full bitmask check when we are busy. + * idle_cpus variable than to do a full bitmask check when we are busy. The + * bits are set atomically but read locklessly as occasional false positive / + * negative is harmless. */ static inline void set_cpuidle_map(int cpu) { if (likely(cpu_online(cpu))) { - cpumask_set_cpu(cpu, &grq.cpu_idle_map); + atomic_set_cpu(cpu, &grq.cpu_idle_map); grq.idle_cpus = true; } } +static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask) +{ + clear_bit(cpu, (volatile unsigned long *)cpumask); +} + static inline void clear_cpuidle_map(int cpu) { cpumask_clear_cpu(cpu, &grq.cpu_idle_map); @@ -932,28 +1117,7 @@ static int effective_prio(struct task_st } /* - * Update the load average for feeding into cpu frequency governors. Use a - * rough estimate of a rolling average with ~ time constant of 32ms. - * 80/128 ~ 0.63. 
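/*
 * Editor's illustrative sketch (not part of the patch): the decay arithmetic
 * behind update_load_avg().  Per microsecond the old load loses a 5/262144
 * fraction, which works out to 80/128 (~0.63) over 32768us, i.e. a rolling
 * average with a time constant of roughly 32ms.  The new-load contribution
 * is simplified here to one capacity-scaled unit per queued task; the
 * capacity constant and the sampling values in main() are assumptions of the
 * example.
 */
#include <stdio.h>

#define TOY_CAPACITY_SCALE 1024L

static long toy_decay_load(long load, unsigned long us_interval,
			   unsigned long entries)
{
	/* Decay the old average: ~63% gone after 32768us. */
	load -= load * (long)us_interval * 5 / 262144;
	if (load < 0)
		load = 0;
	/* Add the contribution of currently queued tasks. */
	load += (long)entries * TOY_CAPACITY_SCALE * (long)us_interval * 5 / 262144;
	return load;
}

int main(void)
{
	long load = 0;
	int i;

	/* Two runnable tasks, sampled every 1000us for 100ms. */
	for (i = 0; i < 100; i++)
		load = toy_decay_load(load, 1000, 2);
	printf("load after 100ms: %ld (full scale per task = %ld)\n",
	       load, TOY_CAPACITY_SCALE);
	return 0;
}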
* 80 / 32768 / 128 == * 5 / 262144 - */ -static void update_load_avg(struct rq *rq) -{ - /* rq clock can go backwards so skip update if that happens */ - if (likely(rq->clock > rq->load_update)) { - unsigned long us_interval = (rq->clock - rq->load_update) >> 10; - long load; - - load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); - if (unlikely(load < 0)) - load = 0; - load += rq->soft_affined * rq_load_avg(rq) * us_interval * 5 / 262144; - rq->load_avg = load; - } - rq->load_update = rq->clock; -} - -/* - * activate_task - move a task to the runqueue. Enter with grq locked. + * activate_task - move a task to the runqueue. Enter with rq locked. */ static void activate_task(struct task_struct *p, struct rq *rq) { @@ -972,79 +1136,80 @@ static void activate_task(struct task_st p->prio = effective_prio(p); if (task_contributes_to_load(p)) - grq.nr_uninterruptible--; + atomic_dec(&grq.nr_uninterruptible); enqueue_task(p, rq); - rq->soft_affined++; p->on_rq = 1; - grq.nr_running++; + atomic_inc(&grq.nr_running); inc_qnr(); update_load_avg(rq); - cpufreq_trigger(grq.niffies, rq->load_avg); + cpufreq_trigger(rq->niffies, rq->load_avg); } /* - * deactivate_task - If it's running, it's not on the grq and we can just - * decrement the nr_running. Enter with grq locked. + * deactivate_task - If it's running, it's not on the runqueue and we can just + * decrement the nr_running. Enter with rq locked. */ static inline void deactivate_task(struct task_struct *p, struct rq *rq) { if (task_contributes_to_load(p)) - grq.nr_uninterruptible++; - rq->soft_affined--; + atomic_inc(&grq.nr_uninterruptible); + p->on_rq = 0; - grq.nr_running--; + atomic_dec(&grq.nr_running); update_load_avg(rq); - cpufreq_trigger(grq.niffies, rq->load_avg); + cpufreq_trigger(rq->niffies, rq->load_avg); } #ifdef CONFIG_SMP void set_task_cpu(struct task_struct *p, unsigned int cpu) { - unsigned int tcpu; + struct rq *rq = task_rq(p); + bool queued; #ifdef CONFIG_LOCKDEP /* - * The caller should hold grq lock. + * The caller should hold either p->pi_lock or rq->lock, when changing + * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. + * + * Furthermore, all task_rq users should acquire both locks, see + * task_rq_lock(). */ - WARN_ON_ONCE(debug_locks && !lockdep_is_held(&grq.lock)); + WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || + lockdep_is_held(&task_rq(p)->lock))); #endif - if ((tcpu = task_cpu(p)) == cpu) + if (task_cpu(p) == cpu) return; trace_sched_migrate_task(p, cpu); perf_event_task_migrate(p); /* - * After ->cpu is set up to a new value, task_grq_lock(p, ...) can be + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be * successfully executed on another CPU. We must ensure that updates of * per-task data have been completed by this moment. */ smp_wmb(); - if (p->on_rq) { - struct rq *rq = task_rq(p); - rq->soft_affined--; - update_load_avg(rq); - rq = cpu_rq(cpu); - rq->soft_affined++; - update_load_avg(rq); - } + if ((queued = task_queued(p))) + dequeue_task(p, rq); task_thread_info(p)->cpu = cpu; + if (queued) + enqueue_task(p, cpu_rq(cpu)); } #endif /* CONFIG_SMP */ /* - * Move a task off the global queue and take it to a cpu for it will + * Move a task off the runqueue and take it to a cpu for it will * become the running task. 
*/ -static inline void take_task(int cpu, struct task_struct *p) +static inline void take_task(struct rq *rq, int cpu, struct task_struct *p) { + dequeue_task(p, task_rq(p)); set_task_cpu(p, cpu); - dequeue_task(p); dec_qnr(); } /* - * Returns a descheduling task to the grq runqueue unless it is being + * Returns a descheduling task to the runqueue unless it is being * deactivated. */ static inline void return_task(struct task_struct *p, struct rq *rq, bool deactivate) @@ -1057,7 +1222,7 @@ static inline void return_task(struct ta } } -/* Enter with grq lock held. We know p is on the local cpu */ +/* Enter with rq lock held. We know p is on the local cpu */ static inline void __set_tsk_resched(struct task_struct *p) { set_tsk_need_resched(p); @@ -1075,11 +1240,10 @@ void resched_task(struct task_struct *p) { int cpu; - lockdep_assert_held(&grq.lock); - if (test_tsk_need_resched(p)) return; + /* We're doing this without holding the rq lock if it's not task_rq */ set_tsk_need_resched(p); cpu = task_cpu(p); @@ -1151,14 +1315,14 @@ unsigned long wait_task_inactive(struct * lock now, to be *sure*. If we're wrong, we'll * just go back and repeat. */ - rq = task_grq_lock(p, &flags); + rq = task_rq_lock(p, &flags); trace_sched_wait_task(p); running = task_running(p); on_rq = p->on_rq; ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_grq_unlock(&flags); + task_rq_unlock(rq, p, &flags); /* * If it changed from the expected state, bail out now. @@ -1271,17 +1435,6 @@ static inline bool online_cpus(struct ta } #endif -/* - * Check to see if p can run on cpu, and if not, whether there are any online - * CPUs it can run on instead. - */ -static inline bool needs_other_cpu(struct task_struct *p, int cpu) -{ - if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed))) - return true; - return false; -} - static void try_preempt(struct task_struct *p, struct rq *this_rq) { int cpu, pcpu, highest_prio, highest_cpu; @@ -1307,6 +1460,8 @@ static void try_preempt(struct task_stru } cpumask_clear_cpu(pcpu, &tmp); } + if (!sched_interactive) + return; highest_prio = latest_deadline = 0; highest_prio_rq = NULL; @@ -1315,19 +1470,15 @@ static void try_preempt(struct task_stru for_each_cpu(cpu, &tmp) { struct rq *rq; int rq_prio; - u64 dl; rq = cpu_rq(cpu); rq_prio = rq->rq_prio; if (rq_prio < highest_prio) continue; - dl = rq->rq_deadline; - if (!sched_interactive && pcpu != cpu) - dl <<= locality_diff(pcpu, rq); if (rq_prio > highest_prio || - deadline_after(dl, latest_deadline)) { - latest_deadline = dl; + deadline_after(rq->rq_deadline, latest_deadline)) { + latest_deadline = rq->rq_deadline; highest_prio = rq_prio; highest_cpu = cpu; highest_prio_rq = rq; @@ -1338,15 +1489,8 @@ static void try_preempt(struct task_stru return; if (!smt_schedule(p, highest_prio_rq)) return; - if (can_preempt(p, highest_prio, latest_deadline)) { - /* - * If we have decided this task should preempt this CPU, - * set the task's CPU to match thereby speeding up matching - * this task in earliest_deadline_task. 
- */ - set_task_cpu(p, highest_cpu); + if (can_preempt(p, highest_prio, latest_deadline)) resched_curr(highest_prio_rq); - } } static int __set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask, bool check); @@ -1411,11 +1555,11 @@ void wake_up_if_idle(int cpu) if (!is_idle_task(rcu_dereference(rq->curr))) goto out; - grq_lock_irqsave(&flags); + rq_lock_irqsave(rq, &flags); if (likely(is_idle_task(rq->curr))) smp_send_reschedule(cpu); /* Else cpu is not in idle, do nothing here */ - grq_unlock_irqrestore(&flags); + rq_unlock_irqrestore(rq, &flags); out: rcu_read_unlock(); @@ -1493,8 +1637,6 @@ static bool try_to_wake_up(struct task_s struct rq *rq; int cpu; - get_cpu(); - /* * If we are going to wake up a thread waiting for CONDITION we * need to ensure that CONDITION=1 done by the caller can not be @@ -1507,7 +1649,7 @@ static bool try_to_wake_up(struct task_s * No need to do time_lock_grq as we only need to update the rq clock * if we activate the task */ - rq = task_grq_lock(p, &flags); + rq = task_rq_lock(p, &flags); cpu = task_cpu(p); /* state is a volatile long, どうして、分からない */ @@ -1525,13 +1667,11 @@ static bool try_to_wake_up(struct task_s out_running: ttwu_post_activation(p, rq, success); out_unlock: - task_grq_unlock(&flags); + task_rq_unlock(rq, p, &flags); if (schedstat_enabled()) ttwu_stat(p, cpu, wake_flags); - put_cpu(); - return success; } @@ -1548,10 +1688,26 @@ static void try_to_wake_up_local(struct struct rq *rq = task_rq(p); bool success = false; - lockdep_assert_held(&grq.lock); + if (WARN_ON_ONCE(rq != this_rq()) || + WARN_ON_ONCE(p == current)) + return; + + lockdep_assert_held(&rq->lock); + + if (!raw_spin_trylock(&p->pi_lock)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet picked a replacement task. + */ + raw_spin_unlock(&rq->lock); + raw_spin_lock(&p->pi_lock); + raw_spin_lock(&rq->lock); + } if (!(p->state & TASK_NORMAL)) - return; + goto out; trace_sched_waking(p); @@ -1566,6 +1722,8 @@ static void try_to_wake_up_local(struct success = true; } ttwu_post_activation(p, rq, success); +out: + raw_spin_unlock(&p->pi_lock); } /** @@ -1591,7 +1749,7 @@ int wake_up_state(struct task_struct *p, return try_to_wake_up(p, state, 0); } -static void time_slice_expired(struct task_struct *p); +static void time_slice_expired(struct task_struct *p, struct rq *rq); /* * Perform scheduler related setup for a newly forked process p. @@ -1599,6 +1757,9 @@ static void time_slice_expired(struct ta */ int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) { + unsigned long flags; + int cpu = get_cpu(); + #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif @@ -1641,12 +1802,21 @@ int sched_fork(unsigned long __maybe_unu p->sched_reset_on_fork = 0; } + /* + * Silence PROVE_RCU. 
+ */ + raw_spin_lock_irqsave(&p->pi_lock, flags); + set_task_cpu(p, cpu); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + #ifdef CONFIG_SCHED_INFO if (unlikely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif p->on_cpu = false; init_task_preempt_count(p); + + put_cpu(); return 0; } @@ -1736,12 +1906,17 @@ static inline void init_schedstats(void) */ void wake_up_new_task(struct task_struct *p) { - struct task_struct *parent; + struct task_struct *parent, *rq_curr; unsigned long flags; struct rq *rq; parent = p->parent; - rq = task_grq_lock(p, &flags); + + raw_spin_lock_irqsave(&p->pi_lock, flags); + if (unlikely(needs_other_cpu(p, task_cpu(p)))) + set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p))); + rq = __task_rq_lock(p); + rq_curr = rq->curr; /* * Reinit new task deadline as its creator deadline could have changed @@ -1750,21 +1925,12 @@ void wake_up_new_task(struct task_struct p->deadline = rq->rq_deadline; /* - * If the task is a new process, current and parent are the same. If - * the task is a new thread in the thread group, it will have much more - * in common with current than with the parent. - */ - set_task_cpu(p, task_cpu(rq->curr)); - - /* * Make sure we do not leak PI boosting priority to the child. */ - p->prio = rq->curr->normal_prio; + p->prio = rq_curr->normal_prio; activate_task(p, rq); trace_sched_wakeup_new(p); - if (unlikely(p->policy == SCHED_FIFO)) - goto after_ts_init; /* * Share the timeslice between parent and child, thus the @@ -1776,33 +1942,37 @@ void wake_up_new_task(struct task_struct * is always equal to current->deadline. */ p->last_ran = rq->rq_last_ran; - if (likely(rq->rq_time_slice >= RESCHED_US * 2)) { + if (likely(rq_curr->policy != SCHED_FIFO)) { rq->rq_time_slice /= 2; - p->time_slice = rq->rq_time_slice; -after_ts_init: - if (rq->curr == parent && !suitable_idle_cpus(p)) { + if (unlikely(rq->rq_time_slice < RESCHED_US)) { /* - * The VM isn't cloned, so we're in a good position to - * do child-runs-first in anticipation of an exec. This - * usually avoids a lot of COW overhead. + * Forking task has run out of timeslice. Reschedule it and + * start its child with a new time slice and deadline. The + * child will end up running first because its deadline will + * be slightly earlier. */ - __set_tsk_resched(parent); - } else - try_preempt(p, rq); - } else { - if (rq->curr == parent) { - /* - * Forking task has run out of timeslice. Reschedule it and - * start its child with a new time slice and deadline. The - * child will end up running first because its deadline will - * be slightly earlier. - */ rq->rq_time_slice = 0; - __set_tsk_resched(parent); + __set_tsk_resched(rq_curr); + time_slice_expired(p, rq); + if (suitable_idle_cpus(p)) + resched_best_idle(p); + } else { + p->time_slice = rq->rq_time_slice; + if (rq_curr == parent && !suitable_idle_cpus(p)) { + /* + * The VM isn't cloned, so we're in a good position to + * do child-runs-first in anticipation of an exec. This + * usually avoids a lot of COW overhead. + */ + __set_tsk_resched(rq_curr); + } else + try_preempt(p, rq); } - time_slice_expired(p); + } else { + time_slice_expired(p, rq); + try_preempt(p, rq); } - task_grq_unlock(&flags); + task_rq_unlock(rq, p, &flags); } #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -1936,7 +2106,7 @@ prepare_task_switch(struct rq *rq, struc * because prev may have moved to another CPU. 
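/*
 * Editor's illustrative sketch (not part of the patch): the timeslice split
 * done by wake_up_new_task() above.  The parent's remaining slice is halved
 * and shared with the child so the total pending timeslice in the system is
 * unchanged; if what remains is below the rescheduling granularity the
 * parent is marked for reschedule and the child starts with a fresh slice
 * and deadline (and, as the comment above notes, ends up running first).
 * The RESCHED_US-style threshold, the struct and the microsecond values are
 * assumptions of this example.
 */
#include <stdbool.h>
#include <stdio.h>

#define TOY_RESCHED_US 128

struct toy_split {
	int parent_slice_us;
	int child_slice_us;
	bool resched_parent;
	bool child_fresh_deadline;
};

static struct toy_split toy_fork_split(int parent_slice_us, int full_slice_us)
{
	struct toy_split r = { 0 };

	r.parent_slice_us = parent_slice_us / 2;
	if (r.parent_slice_us < TOY_RESCHED_US) {
		/* Parent effectively out of slice: reschedule it, give the
		 * child a full new slice and deadline. */
		r.parent_slice_us = 0;
		r.resched_parent = true;
		r.child_slice_us = full_slice_us;
		r.child_fresh_deadline = true;
	} else {
		r.child_slice_us = r.parent_slice_us;
	}
	return r;
}

int main(void)
{
	struct toy_split a = toy_fork_split(6000, 6000);
	struct toy_split b = toy_fork_split(100, 6000);

	printf("plenty left : parent %dus, child %dus\n",
	       a.parent_slice_us, a.child_slice_us);
	printf("nearly empty: parent %dus (resched=%d), child %dus fresh\n",
	       b.parent_slice_us, b.resched_parent, b.child_slice_us);
	return 0;
}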
*/ static struct rq *finish_task_switch(struct task_struct *prev) - __releases(grq.lock) + __releases(rq->lock) { struct rq *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; @@ -1996,7 +2166,7 @@ static struct rq *finish_task_switch(str * @prev: the thread we just switched away from. */ asmlinkage __visible void schedule_tail(struct task_struct *prev) - __releases(grq.lock) + __releases(rq->lock) { struct rq *rq; @@ -2053,7 +2223,7 @@ context_switch(struct rq *rq, struct tas * of the scheduler it's an obvious special-case), so we * do an early lockdep release here: */ - spin_release(&grq.lock.dep_map, 1, _THIS_IP_); + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); @@ -2066,26 +2236,16 @@ context_switch(struct rq *rq, struct tas * nr_running, nr_uninterruptible and nr_context_switches: * * externally visible scheduler statistics: current number of runnable - * threads, total number of context switches performed since bootup. All are - * measured without grabbing the grq lock but the occasional inaccurate result - * doesn't matter so long as it's positive. + * threads, total number of context switches performed since bootup. */ unsigned long nr_running(void) { - long nr = grq.nr_running; - - if (unlikely(nr < 0)) - nr = 0; - return (unsigned long)nr; + return atomic_read(&grq.nr_running); } static unsigned long nr_uninterruptible(void) { - long nu = grq.nr_uninterruptible; - - if (unlikely(nu < 0)) - nu = 0; - return nu; + return atomic_read(&grq.nr_uninterruptible); } /* @@ -2103,7 +2263,7 @@ static unsigned long nr_uninterruptible( */ bool single_task_running(void) { - if (cpu_rq(smp_processor_id())->soft_affined == 1) + if (cpu_rq(smp_processor_id())->sl->entries == 1) return true; else return false; @@ -2112,12 +2272,7 @@ EXPORT_SYMBOL(single_task_running); unsigned long long nr_context_switches(void) { - long long ns = grq.nr_switches; - - /* This is of course impossible */ - if (unlikely(ns < 0)) - ns = 1; - return (unsigned long long)ns; + return (unsigned long long)atomic64_read(&grq.nr_switches); } unsigned long nr_iowait(void) @@ -2149,7 +2304,7 @@ void get_iowait_load(unsigned long *nr_w struct rq *rq = this_rq(); *nr_waiters = atomic_read(&rq->nr_iowait); - *load = rq->soft_affined; + *load = rq->sl->entries; } /* Variables and functions for calc_load */ @@ -2665,7 +2820,7 @@ ts_account: * Return any ns on the sched_clock that have not yet been accounted in * @p in case that task is currently running. * - * Called with task_grq_lock() held. + * Called with task_rq_lock(p) held. */ static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) { @@ -2714,9 +2869,9 @@ unsigned long long task_sched_runtime(st return tsk_seruntime(p); #endif - rq = task_grq_lock(p, &flags); + rq = task_rq_lock(p, &flags); ns = p->sched_time + do_task_delta_exec(p, rq); - task_grq_unlock(&flags); + task_rq_unlock(rq, p, &flags); return ns; } @@ -2965,19 +3120,17 @@ static void task_running_tick(struct rq } else if (rq->rq_time_slice >= RESCHED_US) return; - /* p->time_slice < RESCHED_US. We only modify task_struct under grq lock */ p = rq->curr; - grq_lock(); + rq_lock(rq); requeue_task(p); __set_tsk_resched(p); - grq_unlock(); + rq_unlock(rq); } /* * This function gets called by the timer code, with HZ frequency. - * We call it with interrupts disabled. The data modified is all - * local to struct rq so we don't need to grab grq lock. + * We call it with interrupts disabled. 
*/ void scheduler_tick(void) { @@ -2985,11 +3138,10 @@ void scheduler_tick(void) struct rq *rq = cpu_rq(cpu); sched_clock_tick(); - /* grq lock not grabbed, so only update rq clock */ update_rq_clock(rq); update_cpu_clock_tick(rq, rq->curr); update_load_avg(rq); - cpufreq_trigger(grq.niffies, rq->load_avg); + cpufreq_trigger(rq->niffies, rq->load_avg); if (!rq_idle(rq)) task_running_tick(rq); else @@ -3075,47 +3227,13 @@ static inline void preempt_latency_stop( #endif /* - * Deadline is "now" in niffies + (offset by priority). Setting the deadline - * is the key to everything. It distributes cpu fairly amongst tasks of the - * same nice value, it proportions cpu according to nice level, it means the - * task that last woke up the longest ago has the earliest deadline, thus - * ensuring that interactive tasks get low latency on wake up. The CPU - * proportion works out to the square of the virtual deadline difference, so - * this equation will give nice 19 3% CPU compared to nice 0. - */ -static inline u64 prio_deadline_diff(int user_prio) -{ - return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); -} - -static inline u64 task_deadline_diff(struct task_struct *p) -{ - return prio_deadline_diff(TASK_USER_PRIO(p)); -} - -static inline u64 static_deadline_diff(int static_prio) -{ - return prio_deadline_diff(USER_PRIO(static_prio)); -} - -static inline int longest_deadline_diff(void) -{ - return prio_deadline_diff(39); -} - -static inline int ms_longest_deadline_diff(void) -{ - return NS_TO_MS(longest_deadline_diff()); -} - -/* * The time_slice is only refilled when it is empty and that is when we set a * new deadline. */ -static void time_slice_expired(struct task_struct *p) +static void time_slice_expired(struct task_struct *p, struct rq *rq) { p->time_slice = timeslice(); - p->deadline = grq.niffies + task_deadline_diff(p); + p->deadline = rq->niffies + task_deadline_diff(p); #ifdef CONFIG_SMT_NICE if (!p->mm) p->smt_bias = 0; @@ -3142,10 +3260,10 @@ static void time_slice_expired(struct ta * SCHED_NORMAL tasks. */ -static inline void check_deadline(struct task_struct *p) +static inline void check_deadline(struct task_struct *p, struct rq *rq) { if (p->time_slice < RESCHED_US || batch_task(p)) - time_slice_expired(p); + time_slice_expired(p, rq); } #define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) @@ -3202,46 +3320,60 @@ found_middle: * task in the sorted list, an O(1) operation. The only time it takes longer * is if tasks do not have suitable affinity and then we iterate over entries * till we find the first that does. Worst case here is no tasks with suitable - * affinity and taking O(n). + * affinity and taking O(k) where k is number of processors. + * + * As many runqueues as can be locked without contention are grabbed via + * lock_rqs and only those runqueues are examined. All balancing between CPUs + * is thus done here in an extremely simple first come best fit manner. + * + * This iterates over runqueues in cache locality order. In interactive mode + * it iterates over all CPUs and finds the task with the earliest deadline. + * In non-interactive mode it grabs the first task it finds, being the closest + * to the current CPU in cache locality. 
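/*
 * Editor's illustrative sketch (not part of the patch): the selection logic
 * described in the comment above, stripped of locking and skiplists.  Walk
 * runqueues in cache-locality order; in non-interactive mode take the first
 * eligible task (closest cache), in interactive mode keep scanning for the
 * globally earliest deadline.  toy_rq/toy_task and the "eligible" flag
 * (standing in for the affinity and SMT checks) are assumptions of the
 * example.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_task {
	uint64_t deadline;
	bool eligible;		/* stands in for affinity/SMT suitability */
};

struct toy_rq {
	struct toy_task *head;	/* lowest-keyed task on this rq, or NULL */
};

static struct toy_task *toy_pick(struct toy_rq **rq_order, int nr_rqs,
				 bool interactive)
{
	struct toy_task *best = NULL;
	uint64_t best_deadline = UINT64_MAX;
	int i;

	for (i = 0; i < nr_rqs; i++) {	/* rq_order[0] is the local rq */
		struct toy_task *p = rq_order[i]->head;

		if (!p || !p->eligible)
			continue;
		if (!interactive)
			return p;	/* first fit, best cache locality */
		if (p->deadline < best_deadline) {
			best_deadline = p->deadline;
			best = p;
		}
	}
	return best;
}

int main(void)
{
	struct toy_task t0 = { 500, true }, t1 = { 100, true };
	struct toy_rq rq0 = { &t0 }, rq1 = { &t1 };
	struct toy_rq *order[] = { &rq0, &rq1 };	/* rq0 is "local" */

	printf("non-interactive picks deadline %llu\n",
	       (unsigned long long)toy_pick(order, 2, false)->deadline);
	printf("interactive picks deadline %llu\n",
	       (unsigned long long)toy_pick(order, 2, true)->deadline);
	return 0;
}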
*/ static inline struct task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) { struct task_struct *edt = idle; - skiplist_node *node = &grq.node; u64 earliest_deadline = ~0ULL; + cpumask_t locked; + int i; - while ((node = node->next[0]) != &grq.node) { - struct task_struct *p = node->value; - int tcpu; + lock_rqs(rq, &locked); - /* Make sure affinity is ok */ - if (needs_other_cpu(p, cpu)) + for (i = 0; i < num_possible_cpus(); i++) { + struct rq *other_rq = rq->rq_order[i]; + struct task_struct *p; + skiplist_node *node; + + if (!cpumask_test_cpu(other_rq->cpu, &locked)) continue; + if ((node = other_rq->node.next[0]) == &other_rq->node) + continue; + p = node->value; if (!smt_schedule(p, rq)) continue; - if (!sched_interactive && (tcpu = task_cpu(p)) != cpu) { - u64 dl = p->deadline << locality_diff(tcpu, rq); + /* Make sure affinity is ok */ + if (rq != other_rq && needs_other_cpu(p, cpu)) + continue; - if (unlikely(!deadline_before(dl, earliest_deadline))) - continue; - earliest_deadline = dl; + if (!sched_interactive) { edt = p; - /* We continue even though we've found the earliest - * deadline task as the locality offset means there - * may be a better candidate after it. */ - continue; + break; } - /* This wouldn't happen if we encountered a better deadline from - * another CPU and have already set edt. */ - if (likely(p->deadline < earliest_deadline)) - edt = p; - break; + + if (!deadline_before(p->deadline, earliest_deadline)) + continue; + earliest_deadline = p->deadline; + edt = p; } + if (likely(edt != idle)) - take_task(cpu, edt); + take_task(rq, cpu, edt); + unlock_rqs(rq, &locked); + return edt; } @@ -3294,8 +3426,7 @@ static inline void schedule_debug(struct /* * The currently running task's information is all stored in rq local data - * which is only modified by the local CPU, thereby allowing the data to be - * changed without grabbing the grq lock. + * which is only modified by the local CPU. */ static inline void set_rq_task(struct rq *rq, struct task_struct *p) { @@ -3451,7 +3582,7 @@ static void __sched notrace __schedule(b * done by the caller to avoid the race with signal_wake_up(). */ smp_mb__before_spinlock(); - grq_lock(); + rq_lock(rq); switch_count = &prev->nivcsw; if (!preempt && prev->state) { @@ -3498,7 +3629,7 @@ static void __sched notrace __schedule(b /* Update all the information stored on struct rq */ prev->time_slice = rq->rq_time_slice; prev->deadline = rq->rq_deadline; - check_deadline(prev); + check_deadline(prev, rq); prev->last_ran = rq->clock_task; return_task(prev, rq, deactivate); } @@ -3530,19 +3661,17 @@ static void __sched notrace __schedule(b check_siblings(rq); else wake_siblings(rq); - grq.nr_switches++; + atomic64_inc(&grq.nr_switches); prev->on_cpu = false; next->on_cpu = true; rq->curr = next; ++*switch_count; trace_sched_switch(preempt, prev, next); - rq = context_switch(rq, prev, next); /* unlocks the grq */ - cpu = cpu_of(rq); - idle = rq->idle; + rq = context_switch(rq, prev, next); /* unlocks the rq */ } else { check_siblings(rq); - grq_unlock_irq(); + rq_unlock_irq(rq); } } @@ -3757,13 +3886,12 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - unsigned long flags; - int queued, oldprio; struct rq *rq; + int oldprio; BUG_ON(prio < 0 || prio > MAX_PRIO); - rq = task_grq_lock(p, &flags); + rq = __task_rq_lock(p); /* * Idle task boosting is a nono in general. 
There is one @@ -3785,19 +3913,18 @@ void rt_mutex_setprio(struct task_struct trace_sched_pi_setprio(p, prio); oldprio = p->prio; - queued = task_queued(p); - if (queued) - dequeue_task(p); p->prio = prio; - if (task_running(p) && prio > oldprio) - resched_task(p); - if (queued) { + if (task_running(p)){ + if (prio > oldprio) + resched_task(p); + } else if (task_queued(p)) { + dequeue_task(p, rq); enqueue_task(p, rq); - try_preempt(p, rq); + if (prio < oldprio) + try_preempt(p, rq); } - out_unlock: - task_grq_unlock(&flags); + __task_rq_unlock(rq); } #endif @@ -3813,7 +3940,7 @@ static inline void adjust_deadline(struc void set_user_nice(struct task_struct *p, long nice) { - int queued, new_static, old_static; + int new_static, old_static; unsigned long flags; struct rq *rq; @@ -3824,7 +3951,7 @@ void set_user_nice(struct task_struct *p * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. */ - rq = time_task_grq_lock(p, &flags); + rq = time_task_rq_lock(p, &flags); /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -3835,16 +3962,14 @@ void set_user_nice(struct task_struct *p p->static_prio = new_static; goto out_unlock; } - queued = task_queued(p); - if (queued) - dequeue_task(p); adjust_deadline(p, new_static); old_static = p->static_prio; p->static_prio = new_static; p->prio = effective_prio(p); - if (queued) { + if (task_queued(p)) { + dequeue_task(p, rq); enqueue_task(p, rq); if (new_static < old_static) try_preempt(p, rq); @@ -3854,7 +3979,7 @@ void set_user_nice(struct task_struct *p resched_task(p); } out_unlock: - task_grq_unlock(&flags); + task_rq_unlock(rq, p, &flags); } EXPORT_SYMBOL(set_user_nice); @@ -3925,7 +4050,7 @@ int task_prio(const struct task_struct * goto out; /* Convert to ms to avoid overflows */ - delta = NS_TO_MS(p->deadline - grq.niffies); + delta = NS_TO_MS(p->deadline - task_rq(p)->niffies); delta = delta * 40 / ms_longest_deadline_diff(); if (delta > 0 && delta <= 80) prio += delta; @@ -3968,7 +4093,7 @@ static inline struct task_struct *find_p return pid ? find_task_by_vpid(pid) : current; } -/* Actually do priority change: must hold grq lock. */ +/* Actually do priority change: must hold rq lock. 
*/ static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, int prio, bool keep_boost) { @@ -3994,11 +4119,17 @@ static void __setscheduler(struct task_s p->prio = rt_mutex_get_effective_prio(p, p->normal_prio); } else p->prio = p->normal_prio; + if (task_running(p)) { reset_rq_task(rq, p); /* Resched only if we might now be preempted */ - if (p->prio > oldprio || p->rt_priority > oldrtprio) + if (p->prio > oldprio || p->rt_priority < oldrtprio) resched_task(p); + } else if (task_queued(p)) { + dequeue_task(p, rq); + enqueue_task(p, rq); + if (p->prio < oldprio || p->rt_priority > oldrtprio) + try_preempt(p, rq); } } @@ -4023,8 +4154,8 @@ __sched_setscheduler(struct task_struct const struct sched_param *param, bool user, bool pi) { struct sched_param zero_param = { .sched_priority = 0 }; - int queued, retval, oldpolicy = -1; unsigned long flags, rlim_rtprio = 0; + int retval, oldpolicy = -1; int reset_on_fork; struct rq *rq; @@ -4134,20 +4265,17 @@ recheck: /* * make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: - */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - /* - * To be able to change p->policy safely, the grunqueue lock must be + * + * To be able to change p->policy safely, the runqueue lock must be * held. */ - rq = __task_grq_lock(p); + rq = task_rq_lock(p, &flags); /* * Changing the policy of the stop threads its a very bad idea */ if (p == rq->stop) { - __task_grq_unlock(); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); return -EINVAL; } @@ -4156,32 +4284,21 @@ recheck: */ if (unlikely(policy == p->policy && (!is_rt_policy(policy) || param->sched_priority == p->rt_priority))) { - - __task_grq_unlock(); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); return 0; } /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; - __task_grq_unlock(); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); goto recheck; } update_clocks(rq); p->sched_reset_on_fork = reset_on_fork; - queued = task_queued(p); - if (queued) - dequeue_task(p); __setscheduler(p, rq, policy, param->sched_priority, pi); - if (queued) { - enqueue_task(p, rq); - try_preempt(p, rq); - } - __task_grq_unlock(); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); if (pi) rt_mutex_adjust_pi(p); @@ -4681,9 +4798,9 @@ long sched_getaffinity(pid_t pid, cpumas if (retval) goto out_unlock; - grq_lock_irqsave(&flags); + raw_spin_lock_irqsave(&p->pi_lock, flags); cpumask_and(mask, tsk_cpus_allowed(p), cpu_active_mask); - grq_unlock_irqrestore(&flags); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: rcu_read_unlock(); @@ -4740,9 +4857,10 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t SYSCALL_DEFINE0(sched_yield) { struct task_struct *p; + struct rq *rq; p = current; - grq_lock_irq(); + rq = this_rq_lock(); schedstat_inc(task_rq(p), yld_count); requeue_task(p); @@ -4750,9 +4868,9 @@ SYSCALL_DEFINE0(sched_yield) * Since we are going to call schedule() anyway, there's * no need to preempt or enable interrupts: */ - __release(grq.lock); - spin_release(&grq.lock.dep_map, 1, _THIS_IP_); - do_raw_spin_unlock(&grq.lock); + __release(rq->lock); + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); + do_raw_spin_unlock(&rq->lock); sched_preempt_enable_no_resched(); schedule(); @@ -4862,14 +4980,26 @@ int __sched yield_to(struct task_struct unsigned long flags; int yielded = 0; + 
local_irq_save(flags); rq = this_rq(); - grq_lock_irqsave(&flags); + +again: + p_rq = task_rq(p); + /* + * If we're the only runnable task on the rq and target rq also + * has only one task, there's absolutely no point in yielding. + */ if (task_running(p) || p->state) { yielded = -ESRCH; - goto out_unlock; + goto out_irq; + } + + double_rq_lock(rq, p_rq); + if (task_rq(p) != p_rq) { + double_rq_unlock(rq, p_rq); + goto again; } - p_rq = task_rq(p); yielded = 1; if (p->deadline > rq->rq_deadline) p->deadline = rq->rq_deadline; @@ -4878,9 +5008,10 @@ int __sched yield_to(struct task_struct if (p->time_slice > timeslice()) p->time_slice = timeslice(); if (preempt && rq != p_rq) - resched_curr(p_rq); -out_unlock: - grq_unlock_irqrestore(&flags); + resched_task(p_rq->curr); + double_rq_unlock(rq, p_rq); +out_irq: + local_irq_restore(flags); if (yielded > 0) schedule(); @@ -4986,8 +5117,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, p struct task_struct *p; unsigned int time_slice; unsigned long flags; - int retval; struct timespec t; + struct rq *rq; + int retval; if (pid < 0) return -EINVAL; @@ -5002,9 +5134,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, p if (retval) goto out_unlock; - grq_lock_irqsave(&flags); + rq = task_rq_lock(p, &flags); time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p)); - grq_unlock_irqrestore(&flags); + task_rq_unlock(rq, p, &flags); rcu_read_unlock(); t = ns_to_timespec(time_slice); @@ -5104,7 +5236,21 @@ void set_cpus_allowed_common(struct task void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { + struct rq *rq = task_rq(p); + + lockdep_assert_held(&p->pi_lock); + cpumask_copy(tsk_cpus_allowed(p), new_mask); + + if (task_queued(p)) { + /* + * Because __kthread_bind() calls this on blocked tasks without + * holding rq->lock. + */ + lockdep_assert_held(&rq->lock); + } + if (needs_other_cpu(p, task_cpu(p))) + set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p))); } #endif @@ -5122,7 +5268,7 @@ void init_idle(struct task_struct *idle, unsigned long flags; raw_spin_lock_irqsave(&idle->pi_lock, flags); - time_lock_grq(rq); + raw_spin_lock(&rq->lock); idle->last_ran = rq->clock_task; idle->state = TASK_RUNNING; /* Setting prio to illegal value shouldn't matter when never queued */ @@ -5151,7 +5297,7 @@ void init_idle(struct task_struct *idle, rq->curr = rq->idle = idle; idle->on_cpu = 1; - grq_unlock(); + raw_spin_unlock(&rq->lock); raw_spin_unlock_irqrestore(&idle->pi_lock, flags); /* Set the preempt count _outside_ the spinlocks! 
*/ @@ -5237,11 +5383,12 @@ void wake_up_q(struct wake_q_head *head) void resched_cpu(int cpu) { + struct rq *rq = cpu_rq(cpu); unsigned long flags; - grq_lock_irqsave(&flags); + rq_lock_irqsave(rq, &flags); resched_task(cpu_curr(cpu)); - grq_unlock_irqrestore(&flags); + rq_unlock_irqrestore(rq, &flags); } #ifdef CONFIG_SMP @@ -5368,12 +5515,13 @@ static int __set_cpus_allowed_ptr(struct { const struct cpumask *cpu_valid_mask = cpu_active_mask; bool running_wrong = false; + struct cpumask old_mask; bool queued = false; unsigned long flags; struct rq *rq; int ret = 0; - rq = task_grq_lock(p, &flags); + rq = task_rq_lock(p, &flags); if (p->flags & PF_KTHREAD) { /* @@ -5391,7 +5539,8 @@ static int __set_cpus_allowed_ptr(struct goto out; } - if (cpumask_equal(tsk_cpus_allowed(p), new_mask)) + cpumask_copy(&old_mask, tsk_cpus_allowed(p)); + if (cpumask_equal(&old_mask, new_mask)) goto out; if (!cpumask_intersects(new_mask, cpu_valid_mask)) { @@ -5424,13 +5573,18 @@ static int __set_cpus_allowed_ptr(struct running_wrong = true; } else resched_task(p); - } else - set_task_cpu(p, cpumask_any_and(cpu_valid_mask, new_mask)); + } else { + int dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); + struct rq *dest_rq = cpu_rq(dest_cpu); + lock_second_rq(rq, dest_rq); + set_task_cpu(p, cpumask_any_and(cpu_valid_mask, new_mask)); + rq_unlock(dest_rq); + } out: - if (queued) + if (queued && !cpumask_subset(new_mask, &old_mask)) try_preempt(p, rq); - task_grq_unlock(&flags); + task_rq_unlock(rq, p, &flags); if (running_wrong) preempt_schedule_common(); @@ -5447,8 +5601,11 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); static bool sched_smp_initialized __read_mostly; #ifdef CONFIG_HOTPLUG_CPU -/* Run through task list and find tasks affined to the dead cpu, then remove - * that cpu from the list, enable cpu0 and set the zerobound flag. */ +/* + * Run through task list and find tasks affined to the dead cpu, then remove + * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold + * cpu 0 and src_cpu's runqueue locks. + */ static void bind_zero(int src_cpu) { struct task_struct *p, *t; @@ -5463,6 +5620,11 @@ static void bind_zero(int src_cpu) cpumask_set_cpu(0, tsk_cpus_allowed(p)); p->zerobound = true; bound++; + if (task_cpu(p) == src_cpu) { + set_task_cpu(p, 0); + if (task_running(p)) + resched_task(p); + } } } while_each_thread(t, p); @@ -5876,7 +6038,7 @@ static void rq_attach_root(struct rq *rq struct root_domain *old_rd = NULL; unsigned long flags; - grq_lock_irqsave(&flags); + rq_lock_irqsave(rq, &flags); if (rq->rd) { old_rd = rq->rd; @@ -5902,7 +6064,7 @@ static void rq_attach_root(struct rq *rq if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); - grq_unlock_irqrestore(&flags); + rq_unlock_irqrestore(rq, &flags); if (old_rd) call_rcu_sched(&old_rd->rcu, free_rootdomain); @@ -6881,14 +7043,13 @@ int sched_cpu_activate(unsigned int cpu) * 2) At runtime, if cpuset_cpu_active() fails to rebuild the * domains. 
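/*
 * Editor's illustrative sketch (not part of the patch): how the rq_order[]
 * array built in sched_init_smp() above ends up sorted.  CPUs are bucketed
 * by their cpu_locality value (0 = same CPU, up to 4 = a different NUMA
 * node), nearest buckets first, so the walk in earliest_deadline_task()
 * naturally prefers cache-hot runqueues.  The CPU count and locality table
 * here are made-up example data.
 */
#include <stdio.h>

#define NR_TOY_CPUS 4

/* locality[this][other]: 0 self, 1 SMT sibling, 2 shared cache, 4 other node */
static const int locality[NR_TOY_CPUS][NR_TOY_CPUS] = {
	{ 0, 1, 2, 4 },
	{ 1, 0, 2, 4 },
	{ 2, 2, 0, 4 },
	{ 4, 4, 4, 0 },
};

int main(void)
{
	int cpu, level, other, order[NR_TOY_CPUS][NR_TOY_CPUS];

	for (cpu = 0; cpu < NR_TOY_CPUS; cpu++) {
		int n = 0;

		for (level = 0; level <= 4; level++)
			for (other = 0; other < NR_TOY_CPUS; other++)
				if (locality[cpu][other] == level)
					order[cpu][n++] = other;

		printf("cpu%d scan order:", cpu);
		for (other = 0; other < NR_TOY_CPUS; other++)
			printf(" %d", order[cpu][other]);
		printf("\n");
	}
	return 0;
}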
*/ - grq_lock_irqsave(&flags); + rq_lock_irqsave(rq, &flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_online(rq); } unbind_zero(cpu); - grq.noc = num_online_cpus(); - grq_unlock_irqrestore(&flags); + rq_unlock_irqrestore(rq, &flags); return 0; } @@ -6936,14 +7097,15 @@ int sched_cpu_dying(unsigned int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - grq_lock_irqsave(&flags); + local_irq_save(flags); + double_rq_lock(rq, cpu_rq(0)); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } bind_zero(cpu); - grq.noc = num_online_cpus(); - grq_unlock_irqrestore(&flags); + double_rq_unlock(rq, cpu_rq(0)); + local_irq_restore(flags); return 0; } @@ -7000,8 +7162,8 @@ void __init sched_init_smp(void) #ifdef CONFIG_SCHED_SMT bool smt_threads = false; #endif - cpumask_var_t non_isolated_cpus; + struct rq *rq; alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); @@ -7026,7 +7188,8 @@ void __init sched_init_smp(void) free_cpumask_var(non_isolated_cpus); mutex_lock(&sched_domains_mutex); - grq_lock_irq(); + local_irq_disable(); + lock_all_rqs(); /* * Set up the relative cache distance of each online cpu from each * other in a simple array for quick lookup. Locality is determined @@ -7037,7 +7200,7 @@ void __init sched_init_smp(void) * nodes) are treated as very distant. */ for_each_online_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); + rq = cpu_rq(cpu); /* First check if this cpu is in the same node */ for_each_domain(cpu, sd) { @@ -7076,6 +7239,17 @@ void __init sched_init_smp(void) } #endif } + for_each_possible_cpu(cpu) { + int total_cpus = 0, locality; + + rq = cpu_rq(cpu); + for (locality = 0; locality <= 4; locality++) { + for_each_possible_cpu(other_cpu) { + if (rq->cpu_locality[other_cpu] == locality) + rq->rq_order[total_cpus++] = cpu_rq(other_cpu); + } + } + } #ifdef CONFIG_SMT_NICE if (smt_threads) { check_siblings = &check_smt_siblings; @@ -7083,11 +7257,13 @@ void __init sched_init_smp(void) smt_schedule = &smt_should_schedule; } #endif - grq_unlock_irq(); + unlock_all_rqs(); + local_irq_enable(); mutex_unlock(&sched_domains_mutex); for_each_online_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); + rq = cpu_rq(cpu); + for_each_online_cpu(other_cpu) { if (other_cpu <= cpu) continue; @@ -7145,21 +7321,18 @@ void __init sched_init(void) for (i = 1 ; i < NICE_WIDTH ; i++) prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; - raw_spin_lock_init(&grq.lock); - grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0; - grq.niffies = 0; - grq.last_jiffy = jiffies; + atomic_set(&grq.nr_running, 0); + atomic_set(&grq.nr_uninterruptible, 0); + atomic64_set(&grq.nr_switches, 0); raw_spin_lock_init(&grq.iso_lock); grq.iso_ticks = 0; grq.iso_refractory = false; - grq.noc = 1; - skiplist_init(&grq.node); - grq.sl = new_skiplist(&grq.node); skiplist_node_init(&init_task.node); #ifdef CONFIG_SMP init_defrootdomain(); - grq.qnr = grq.idle_cpus = 0; + atomic_set(&grq.qnr, 0); + grq.idle_cpus = 0; cpumask_clear(&grq.cpu_idle_map); #else uprq = &per_cpu(runqueues, 0); @@ -7174,12 +7347,15 @@ void __init sched_init(void) #endif /* CONFIG_CGROUP_SCHED */ for_each_possible_cpu(i) { rq = cpu_rq(i); - rq->grq_lock = &grq.lock; + skiplist_init(&rq->node); + rq->sl = new_skiplist(&rq->node); + raw_spin_lock_init(&rq->lock); + rq->niffies = 0; + rq->last_jiffy = jiffies; rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc = rq->iowait_pc = rq->idle_pc = 0; rq->dither = false; #ifdef CONFIG_SMP - rq->last_niffy = 0; 
rq->sd = NULL; rq->rd = NULL; rq->online = false; @@ -7212,6 +7388,10 @@ void __init sched_init(void) else rq->cpu_locality[j] = 4; } + rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); + rq->rq_order[0] = rq; + for (j = 1; j < cpu_ids; j++) + rq->rq_order[j] = cpu_rq(j); } #endif @@ -7315,7 +7495,6 @@ static inline void normalise_rt_tasks(vo struct task_struct *g, *p; unsigned long flags; struct rq *rq; - int queued; read_lock(&tasklist_lock); for_each_process_thread(g, p) { @@ -7328,17 +7507,9 @@ static inline void normalise_rt_tasks(vo if (!rt_task(p) && !iso_task(p)) continue; - rq = task_grq_lock(p, &flags); - queued = task_queued(p); - if (queued) - dequeue_task(p); + rq = task_rq_lock(p, &flags); __setscheduler(p, rq, SCHED_NORMAL, 0, false); - if (queued) { - enqueue_task(p, rq); - try_preempt(p, rq); - } - - task_grq_unlock(&flags); + task_rq_unlock(rq, p, &flags); } read_unlock(&tasklist_lock); } Index: linux-4.7-bfs504/include/linux/skip_lists.h =================================================================== --- linux-4.7-bfs504.orig/include/linux/skip_lists.h 2016-09-23 08:59:12.374546442 +1000 +++ linux-4.7-bfs504/include/linux/skip_lists.h 2016-09-27 14:57:30.913307009 +1000 @@ -9,8 +9,8 @@ struct nodeStructure { int level; /* Levels in this structure */ keyType key; valueType value; - skiplist_node *next[16]; - skiplist_node *prev[16]; + skiplist_node *next[8]; + skiplist_node *prev[8]; }; typedef struct listStructure { Index: linux-4.7-bfs504/kernel/sched/bfs_sched.h =================================================================== --- linux-4.7-bfs504.orig/kernel/sched/bfs_sched.h 2016-09-23 08:59:12.373546408 +1000 +++ linux-4.7-bfs504/kernel/sched/bfs_sched.h 2016-10-01 21:08:24.325453196 +1000 @@ -1,5 +1,6 @@ #include #include +#include #include #ifndef BFS_SCHED_H @@ -13,8 +14,7 @@ struct rq { struct task_struct *curr, *idle, *stop; struct mm_struct *prev_mm; - /* Pointer to grq spinlock */ - raw_spinlock_t *grq_lock; + raw_spinlock_t lock; /* Stored data about rq->curr to work outside grq lock */ u64 rq_deadline; @@ -23,7 +23,7 @@ struct rq { u64 rq_last_ran; int rq_prio; bool rq_running; /* There is a task running */ - int soft_affined; /* Running or queued tasks with this set as their rq */ + u64 load_update; /* When we last updated load */ unsigned long load_avg; /* Rolling load average */ #ifdef CONFIG_SMT_NICE @@ -36,6 +36,8 @@ struct rq { iowait_pc, idle_pc; atomic_t nr_iowait; + skiplist_node node; + skiplist *sl; #ifdef CONFIG_SMP int cpu; /* cpu of this runqueue */ bool online; @@ -43,6 +45,10 @@ struct rq { struct root_domain *rd; struct sched_domain *sd; int *cpu_locality; /* CPU relative cache distance */ + struct rq **rq_order; /* RQs ordered by relative cache distance */ + + unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */ + u64 niffies; /* Last time this RQ updated rq clock */ #ifdef CONFIG_SCHED_SMT cpumask_t thread_mask; bool (*siblings_idle)(struct rq *rq); @@ -53,7 +59,6 @@ struct rq { bool (*cache_idle)(struct rq *rq); /* See if all cache siblings are idle */ #endif /* CONFIG_SCHED_MC */ - u64 last_niffy; /* Last time this RQ updated grq.niffies */ #endif /* CONFIG_SMP */ #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; @@ -118,13 +123,13 @@ static inline u64 __rq_clock_broken(stru static inline u64 rq_clock(struct rq *rq) { - lockdep_assert_held(rq->grq_lock); + lockdep_assert_held(&rq->lock); return rq->clock; } static inline u64 rq_clock_task(struct rq *rq) { - lockdep_assert_held(rq->grq_lock); + 
lockdep_assert_held(&rq->lock); return rq->clock_task; } Index: linux-4.7-bfs504/kernel/skip_lists.c =================================================================== --- linux-4.7-bfs504.orig/kernel/skip_lists.c 2016-09-23 08:59:12.374546442 +1000 +++ linux-4.7-bfs504/kernel/skip_lists.c 2016-09-30 06:35:43.081468631 +1000 @@ -33,7 +33,7 @@ occurs in O(log n) time. delnode(slnode, l, node): deletes any binding of key from the l based on the actual node value. This operation occurs in O(k) time where k is the -number of levels of the node in question (max 16). The original delete +number of levels of the node in question (max 8). The original delete function occurred in O(log n) time and involved a search. BFS Notes: In this implementation of skiplists, there are bidirectional @@ -51,7 +51,7 @@ aid of prev<->next pointer manipulation #include #include -#define MaxNumberOfLevels 16 +#define MaxNumberOfLevels 8 #define MaxLevel (MaxNumberOfLevels - 1) void skiplist_init(skiplist_node *slnode) @@ -111,9 +111,7 @@ static inline unsigned int randomLevel(i { unsigned int mask; - if (entries > 31) - mask = 0xF; - else if (entries > 15) + if (entries > 15) mask = 0x7; else if (entries > 7) mask = 0x3; @@ -139,6 +137,8 @@ void skiplist_insert(skiplist *l, skipli } while (--k >= 0); k = randomLevel(++l->entries, randseed); + if (k > MaxLevel) + k = MaxLevel; if (k > l->level) { k = ++l->level; update[k] = l->header; Index: linux-4.7-bfs504/include/linux/sched.h =================================================================== --- linux-4.7-bfs504.orig/include/linux/sched.h 2016-09-23 08:59:12.367546205 +1000 +++ linux-4.7-bfs504/include/linux/sched.h 2016-10-01 10:20:37.000000000 +1000 @@ -1953,7 +1953,6 @@ extern int arch_task_struct_size __read_ #endif #ifdef CONFIG_SCHED_BFS -bool grunqueue_is_locked(void); void grq_unlock_wait(void); void cpu_scaling(int cpu); void cpu_nonscaling(int cpu); @@ -1964,11 +1963,6 @@ static inline void tsk_cpus_current(stru { } -static inline int runqueue_is_locked(int cpu) -{ - return grunqueue_is_locked(); -} - void print_scheduler_version(void); static inline bool iso_task(struct task_struct *p) @@ -1976,7 +1970,6 @@ static inline bool iso_task(struct task_ return (p->policy == SCHED_ISO); } #else /* CFS */ -extern int runqueue_is_locked(int cpu); static inline void cpu_scaling(int cpu) { }
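/*
 * Editor's illustrative sketch (not part of the patch): the occupancy-bounded
 * level choice the skiplist changes above aim at.  With per-runqueue lists,
 * 8 levels (MaxLevel 7) are enough, so the random level is masked down
 * according to how many entries the list currently holds and finally clamped
 * to MaxLevel.  The mask ladder below follows the visible hunk for larger
 * lists; the entries used for small lists are an assumed continuation, not a
 * verbatim copy of randomLevel().
 */
#include <stdio.h>

#define TOY_MAX_LEVEL 7		/* MaxNumberOfLevels 8, minus one */

static unsigned int toy_random_level(int entries, unsigned int randseed)
{
	unsigned int mask, k;

	if (entries > 15)
		mask = 0x7;
	else if (entries > 7)
		mask = 0x3;
	else if (entries > 3)	/* assumed tail of the ladder */
		mask = 0x1;
	else
		return 0;

	k = randseed & mask;
	if (k > TOY_MAX_LEVEL)	/* mirrors the new clamp in skiplist_insert() */
		k = TOY_MAX_LEVEL;
	return k;
}

int main(void)
{
	printf(" 4 entries, seed 0xff -> level %u\n", toy_random_level(4, 0xff));
	printf("20 entries, seed 0xff -> level %u\n", toy_random_level(20, 0xff));
	return 0;
}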