Index: linux-4.7-bfs504/kernel/sched/bfs.c =================================================================== --- linux-4.7-bfs504.orig/kernel/sched/bfs.c 2016-09-27 14:28:18.223173282 +1000 +++ linux-4.7-bfs504/kernel/sched/bfs.c 2016-10-02 03:30:00.888158740 +1100 @@ -115,7 +115,7 @@ #define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) -#define ISO_PERIOD ((5 * HZ * grq.noc) + 1) +#define ISO_PERIOD ((5 * HZ * num_online_cpus()) + 1) #define SCHED_PRIO(p) ((p) + MAX_RT_PRIO) #define STOP_PRIO (MAX_RT_PRIO - 1) @@ -137,7 +137,7 @@ void print_scheduler_version(void) { - printk(KERN_INFO "BFS CPU scheduler v0.502 by Con Kolivas.\n"); + printk(KERN_INFO "MuQSS CPU scheduler v0.102 by Con Kolivas.\n"); } /* @@ -174,30 +174,21 @@ static inline int timeslice(void) } /* - * The global runqueue data that all CPUs work off. Data is protected either - * by the global grq lock, or the discrete lock that precedes the data in this - * struct. + * The global runqueue data that all CPUs work off. Contains either atomic + * variables or iso variables protected by iso_lock. */ struct global_rq { - raw_spinlock_t lock; - unsigned long nr_running; - unsigned long nr_uninterruptible; - unsigned long long nr_switches; - unsigned long qnr; /* queued not running */ + atomic_t nr_running; + atomic_t nr_uninterruptible; + atomic64_t nr_switches; + atomic_t qnr; /* queued not running */ #ifdef CONFIG_SMP cpumask_t cpu_idle_map; bool idle_cpus; #endif - int noc; /* num_online_cpus stored and updated when it changes */ - u64 niffies; /* Nanosecond jiffies */ - unsigned long last_jiffy; /* Last jiffy we updated niffies */ - raw_spinlock_t iso_lock; int iso_ticks; bool iso_refractory; - - skiplist_node node; - skiplist *sl; }; #ifdef CONFIG_SMP @@ -296,10 +287,16 @@ static inline int cpu_of(struct rq *rq) { return rq->cpu; } +#else /* CONFIG_SMP */ +static inline int cpu_of(struct rq *rq) +{ + return 0; +} +#endif /* * Niffies are a globally increasing nanosecond counter. Whenever a runqueue - * clock is updated with the grq.lock held, it is an opportunity to update the + * clock is updated with the rq->lock held, it is an opportunity to update the * niffies value. Any CPU can update it by adding how much its clock has * increased since it last updated niffies, minus any added niffies by other * CPUs. @@ -311,35 +308,12 @@ static inline void update_clocks(struct update_rq_clock(rq); ndiff = rq->clock - rq->old_clock; - /* old_clock is only updated when we are updating niffies */ - rq->old_clock = rq->clock; - ndiff -= grq.niffies - rq->last_niffy; - jdiff = jiffies - grq.last_jiffy; - niffy_diff(&ndiff, jdiff); - grq.last_jiffy += jdiff; - grq.niffies += ndiff; - rq->last_niffy = grq.niffies; -} -#else /* CONFIG_SMP */ -static inline int cpu_of(struct rq *rq) -{ - return 0; -} - -static inline void update_clocks(struct rq *rq) -{ - s64 ndiff; - long jdiff; - - update_rq_clock(rq); - ndiff = rq->clock - rq->old_clock; rq->old_clock = rq->clock; - jdiff = jiffies - grq.last_jiffy; + jdiff = jiffies - rq->last_jiffy; niffy_diff(&ndiff, jdiff); - grq.last_jiffy += jdiff; - grq.niffies += ndiff; + rq->last_jiffy += jdiff; + rq->niffies += ndiff; } -#endif #include "stats.h" @@ -354,10 +328,10 @@ static inline void update_clocks(struct #endif /* - * All common locking functions performed on grq.lock. rq->clock is local to + * All common locking functions performed on rq->lock. rq->clock is local to * the CPU accessing it so it can be modified just with interrupts disabled * when we're not updating niffies. 
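/*
 * Editor's illustrative sketch (not part of the patch): a userspace model of
 * the per-runqueue niffies bookkeeping that the update_clocks() hunk above
 * introduces once grq.niffies/grq.last_jiffy move into struct rq.  The clamp
 * below merely stands in for what niffy_diff() does in the real code; the
 * toy struct, TOY_HZ and the nanoseconds-per-jiffy constant are assumptions
 * of this example, not values taken from the patch.
 */
#include <stdint.h>
#include <stdio.h>

#define TOY_HZ		250
#define TOY_JIFFY_NS	(1000000000ULL / TOY_HZ)

struct toy_rq {
	uint64_t clock;		/* latest sched clock reading, ns */
	uint64_t old_clock;	/* clock value at the last niffies update */
	uint64_t niffies;	/* monotonic per-rq nanosecond counter */
	unsigned long last_jiffy;
};

static void toy_update_clocks(struct toy_rq *rq, uint64_t now_ns,
			      unsigned long now_jiffies)
{
	int64_t ndiff = (int64_t)(now_ns - rq->old_clock);
	long jdiff = (long)(now_jiffies - rq->last_jiffy);
	uint64_t cap;

	rq->clock = rq->old_clock = now_ns;
	if (jdiff < 0)
		jdiff = 0;
	/* Keep the nanosecond delta roughly consistent with elapsed jiffies,
	 * in the spirit of niffy_diff(), so a wild clock cannot run away. */
	cap = (uint64_t)(jdiff + 1) * TOY_JIFFY_NS;
	if (ndiff < 0)
		ndiff = 0;
	if ((uint64_t)ndiff > cap)
		ndiff = (int64_t)cap;
	rq->last_jiffy += jdiff;
	rq->niffies += ndiff;
}

int main(void)
{
	struct toy_rq rq = { 0 };

	toy_update_clocks(&rq, 4000000ULL, 1);	/* 4ms elapsed, one jiffy */
	printf("niffies advanced to %llu ns\n", (unsigned long long)rq.niffies);
	return 0;
}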
- * Looking up task_rq must be done under grq.lock to be safe. + * Looking up task_rq must be done under rq->lock to be safe. */ static void update_rq_clock_task(struct rq *rq, s64 delta); @@ -376,129 +350,246 @@ static inline bool task_running(struct t return p->on_cpu; } -static inline void grq_lock(void) - __acquires(grq.lock) +static inline void rq_lock(struct rq *rq) + __acquires(rq->lock) { - raw_spin_lock(&grq.lock); + raw_spin_lock(&rq->lock); } -static inline void grq_unlock(void) - __releases(grq.lock) +static inline int rq_trylock(struct rq *rq) + __acquires(rq->lock) { - raw_spin_unlock(&grq.lock); + return raw_spin_trylock(&rq->lock); } -static inline void grq_lock_irq(void) - __acquires(grq.lock) +static inline void rq_unlock(struct rq *rq) + __releases(rq->lock) { - raw_spin_lock_irq(&grq.lock); + raw_spin_unlock(&rq->lock); } -static inline void time_lock_grq(struct rq *rq) - __acquires(grq.lock) +static inline struct rq *this_rq_lock(void) + __acquires(rq->lock) { - grq_lock(); - update_clocks(rq); + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + raw_spin_lock(&rq->lock); + + return rq; } -static inline void grq_unlock_irq(void) - __releases(grq.lock) +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ + +/* For when we know rq1 != rq2 */ +static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) { - raw_spin_unlock_irq(&grq.lock); + if (rq1 < rq2) { + raw_spin_lock(&rq1->lock); + raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); + } else { + raw_spin_lock(&rq2->lock); + raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); + } } -static inline void grq_lock_irqsave(unsigned long *flags) - __acquires(grq.lock) +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) { - raw_spin_lock_irqsave(&grq.lock, *flags); + BUG_ON(!irqs_disabled()); + if (rq1 == rq2) { + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ + } else + __double_rq_lock(rq1, rq2); } -static inline void grq_unlock_irqrestore(unsigned long *flags) - __releases(grq.lock) +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. 
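/*
 * Editor's illustrative sketch (not part of the patch): a runnable userspace
 * analogue of the ordering trick in __double_rq_lock() above.  When two
 * runqueue locks must be held at once, the lower-addressed one is always
 * taken first, so two CPUs locking the same pair in opposite argument order
 * cannot ABBA-deadlock.  pthread mutexes stand in for raw spinlocks here;
 * struct toy_rq is an assumption of the example.
 */
#include <pthread.h>
#include <stdio.h>

struct toy_rq {
	pthread_mutex_t lock;
	int cpu;
};

static void toy_double_lock(struct toy_rq *rq1, struct toy_rq *rq2)
{
	if (rq1 == rq2) {		/* same rq: one lock is enough */
		pthread_mutex_lock(&rq1->lock);
		return;
	}
	if (rq1 < rq2) {		/* consistent global order: by address */
		pthread_mutex_lock(&rq1->lock);
		pthread_mutex_lock(&rq2->lock);
	} else {
		pthread_mutex_lock(&rq2->lock);
		pthread_mutex_lock(&rq1->lock);
	}
}

static void toy_double_unlock(struct toy_rq *rq1, struct toy_rq *rq2)
{
	pthread_mutex_unlock(&rq1->lock);
	if (rq1 != rq2)
		pthread_mutex_unlock(&rq2->lock);
}

int main(void)
{
	struct toy_rq a = { PTHREAD_MUTEX_INITIALIZER, 0 };
	struct toy_rq b = { PTHREAD_MUTEX_INITIALIZER, 1 };

	/* Both argument orders acquire the locks in the same (address) order. */
	toy_double_lock(&a, &b);
	toy_double_unlock(&a, &b);
	toy_double_lock(&b, &a);
	toy_double_unlock(&b, &a);
	printf("locked both orders without deadlock\n");
	return 0;
}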
+ */ +static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) { - raw_spin_unlock_irqrestore(&grq.lock, *flags); + raw_spin_unlock(&rq1->lock); + if (rq1 != rq2) + raw_spin_unlock(&rq2->lock); + else + __release(rq2->lock); } -static inline struct rq -*task_grq_lock(struct task_struct *p, unsigned long *flags) - __acquires(grq.lock) +/* Must be sure rq1 != rq2 and irqs are disabled */ +static inline void lock_second_rq(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __acquires(rq1->lock) + __acquires(rq2->lock) { - grq_lock_irqsave(flags); - return task_rq(p); + BUG_ON(!irqs_disabled()); + if (unlikely(!raw_spin_trylock(&rq2->lock))) { + raw_spin_unlock(&rq1->lock); + __double_rq_lock(rq1, rq2); + } } -static inline struct rq -*time_task_grq_lock(struct task_struct *p, unsigned long *flags) - __acquires(grq.lock) +static inline void lock_all_rqs(void) { - struct rq *rq = task_grq_lock(p, flags); - update_clocks(rq); - return rq; + int cpu; + + preempt_disable(); + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + do_raw_spin_lock(&rq->lock); + } } -static inline struct rq *task_grq_lock_irq(struct task_struct *p) - __acquires(grq.lock) +static inline void unlock_all_rqs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + do_raw_spin_unlock(&rq->lock); + } + preempt_enable(); +} + +/* + * Lock this_rq and as many rqs as we can grab with trylock, returning which + * rqs are locked in a bitmask. + */ +static inline void lock_rqs(struct rq *this_rq, cpumask_t *mask) { - grq_lock_irq(); - return task_rq(p); + int cpu; + + cpumask_clear(mask); + + for_each_online_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + if (rq != this_rq) { + if (!do_raw_spin_trylock(&rq->lock)) + continue; + spin_acquire(&rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); + } + cpumask_set_cpu(cpu, mask); + } } -static inline void time_task_grq_lock_irq(struct task_struct *p) - __acquires(grq.lock) +/* Unlock all rqs in a CPU bitmask */ +static inline void unlock_rqs(struct rq *this_rq, cpumask_t *mask) { - struct rq *rq = task_grq_lock_irq(p); + int cpu; + + cpumask_clear_cpu(this_rq->cpu, mask); + + for_each_cpu(cpu, mask) { + struct rq *rq = cpu_rq(cpu); + + spin_release(&rq->lock.dep_map, 1, _RET_IP_); + do_raw_spin_unlock(&rq->lock); + } +} + +static inline void rq_lock_irq(struct rq *rq) + __acquires(rq->lock) +{ + raw_spin_lock_irq(&rq->lock); +} + +static inline void time_lock_rq(struct rq *rq) +{ + rq_lock(rq); update_clocks(rq); } -static inline void task_grq_unlock_irq(void) - __releases(grq.lock) +static inline void rq_unlock_irq(struct rq *rq) + __releases(rq->lock) { - grq_unlock_irq(); + raw_spin_unlock_irq(&rq->lock); } -static inline void task_grq_unlock(unsigned long *flags) - __releases(grq.lock) +static inline void rq_lock_irqsave(struct rq *rq, unsigned long *flags) + __acquires(rq->lock) { - grq_unlock_irqrestore(flags); + raw_spin_lock_irqsave(&rq->lock, *flags); } -/** - * grunqueue_is_locked - * - * Returns true if the global runqueue is locked. - * This interface allows printk to be called with the runqueue lock - * held and know whether or not it is OK to wake up the klogd. 
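/*
 * Editor's illustrative sketch (not part of the patch): the opportunistic
 * locking pattern used by lock_rqs()/unlock_rqs() above, modelled in
 * userspace with pthreads.  The local runqueue lock is taken unconditionally,
 * every other one only via trylock, and a bitmask records which locks were
 * actually obtained so only those runqueues are examined and later released.
 * NR_TOY_CPUS and the uint32_t mask are assumptions of the example.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define NR_TOY_CPUS 8

static pthread_mutex_t toy_rq_lock[NR_TOY_CPUS];

static uint32_t toy_lock_rqs(int this_cpu)
{
	uint32_t locked = 0;
	int cpu;

	pthread_mutex_lock(&toy_rq_lock[this_cpu]);	/* own rq: always taken */
	locked |= 1u << this_cpu;

	for (cpu = 0; cpu < NR_TOY_CPUS; cpu++) {
		if (cpu == this_cpu)
			continue;
		/* Contended locks are simply skipped, never waited on. */
		if (pthread_mutex_trylock(&toy_rq_lock[cpu]) == 0)
			locked |= 1u << cpu;
	}
	return locked;
}

static void toy_unlock_rqs(uint32_t locked)
{
	int cpu;

	for (cpu = 0; cpu < NR_TOY_CPUS; cpu++)
		if (locked & (1u << cpu))
			pthread_mutex_unlock(&toy_rq_lock[cpu]);
}

int main(void)
{
	uint32_t mask;
	int cpu;

	for (cpu = 0; cpu < NR_TOY_CPUS; cpu++)
		pthread_mutex_init(&toy_rq_lock[cpu], NULL);

	mask = toy_lock_rqs(0);
	printf("locked rq mask: 0x%x\n", mask);
	toy_unlock_rqs(mask);
	return 0;
}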
- */ -bool grunqueue_is_locked(void) +static inline void rq_unlock_irqrestore(struct rq *rq, unsigned long *flags) + __releases(rq->lock) { - return raw_spin_is_locked(&grq.lock); + raw_spin_unlock_irqrestore(&rq->lock, *flags); } -void grq_unlock_wait(void) - __releases(grq.lock) +static inline struct rq +*task_rq_lock(struct task_struct *p, unsigned long *flags) + __acquires(p->pi_lock) + __acquires(rq->lock) { - smp_mb(); /* spin-unlock-wait is not a full memory barrier */ - raw_spin_unlock_wait(&grq.lock); + struct rq *rq; + + while (42) { + raw_spin_lock_irqsave(&p->pi_lock, *flags); + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + break; + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); + } + return rq; } -static inline void time_grq_lock(struct rq *rq, unsigned long *flags) - __acquires(grq.lock) +static inline struct rq +*time_task_rq_lock(struct task_struct *p, unsigned long *flags) { - local_irq_save(*flags); - time_lock_grq(rq); + struct rq *rq = task_rq_lock(p, flags); + + update_clocks(rq); + return rq; } -static inline struct rq *__task_grq_lock(struct task_struct *p) - __acquires(grq.lock) +static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) + __releases(rq->lock) + __releases(p->pi_lock) { - grq_lock(); - return task_rq(p); + rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); } -static inline void __task_grq_unlock(void) - __releases(grq.lock) +static inline struct rq *__task_rq_lock(struct task_struct *p) + __acquires(rq->lock) { - grq_unlock(); + struct rq *rq; + + lockdep_assert_held(&p->pi_lock); + + while (42) { + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + break; + raw_spin_unlock(&rq->lock); + } + return rq; +} + +static inline void __task_rq_unlock(struct rq *rq) +{ + rq_unlock(rq); } static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) @@ -509,16 +600,16 @@ static inline void finish_lock_switch(st { #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ - grq.lock.owner = current; + rq->lock.owner = current; #endif /* * If we are tracking spinlock dependencies then we have to * fix up the runqueue lock - which gets 'carried over' from * prev into current: */ - spin_acquire(&grq.lock.dep_map, 0, 0, _THIS_IP_); + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - grq_unlock_irq(); + raw_spin_unlock_irq(&rq->lock); } static inline bool deadline_before(u64 deadline, u64 time) @@ -532,6 +623,40 @@ static inline bool deadline_after(u64 de } /* + * Deadline is "now" in niffies + (offset by priority). Setting the deadline + * is the key to everything. It distributes cpu fairly amongst tasks of the + * same nice value, it proportions cpu according to nice level, it means the + * task that last woke up the longest ago has the earliest deadline, thus + * ensuring that interactive tasks get low latency on wake up. The CPU + * proportion works out to the square of the virtual deadline difference, so + * this equation will give nice 19 3% CPU compared to nice 0. 
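/*
 * Editor's illustrative sketch (not part of the patch): the lock-and-
 * revalidate loop in task_rq_lock() above.  Because a task may migrate
 * between reading task_rq(p) and acquiring that runqueue's lock, the lock is
 * taken and the runqueue re-read; only if it still matches is the lock kept,
 * otherwise both locks are dropped and the lookup retried.  This is a
 * userspace analogue with pthreads; toy_task/toy_rq are assumptions of the
 * example, not types from the patch.
 */
#include <pthread.h>
#include <stdio.h>

struct toy_rq {
	pthread_mutex_t lock;
};

struct toy_task {
	pthread_mutex_t pi_lock;
	struct toy_rq *volatile rq;	/* changed under both locks on migration */
};

static struct toy_rq *toy_task_rq_lock(struct toy_task *p)
{
	struct toy_rq *rq;

	for (;;) {
		pthread_mutex_lock(&p->pi_lock);
		rq = p->rq;
		pthread_mutex_lock(&rq->lock);
		if (rq == p->rq)
			return rq;	/* still on the rq we locked */
		/* The task migrated between the read and the lock: retry. */
		pthread_mutex_unlock(&rq->lock);
		pthread_mutex_unlock(&p->pi_lock);
	}
}

static void toy_task_rq_unlock(struct toy_task *p, struct toy_rq *rq)
{
	pthread_mutex_unlock(&rq->lock);
	pthread_mutex_unlock(&p->pi_lock);
}

int main(void)
{
	struct toy_rq rq = { PTHREAD_MUTEX_INITIALIZER };
	struct toy_task t = { PTHREAD_MUTEX_INITIALIZER, &rq };
	struct toy_rq *locked = toy_task_rq_lock(&t);

	printf("task's runqueue locked: %p\n", (void *)locked);
	toy_task_rq_unlock(&t, locked);
	return 0;
}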
+ */ +static inline u64 prio_deadline_diff(int user_prio) +{ + return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); +} + +static inline u64 task_deadline_diff(struct task_struct *p) +{ + return prio_deadline_diff(TASK_USER_PRIO(p)); +} + +static inline u64 static_deadline_diff(int static_prio) +{ + return prio_deadline_diff(USER_PRIO(static_prio)); +} + +static inline int longest_deadline_diff(void) +{ + return prio_deadline_diff(39); +} + +static inline int ms_longest_deadline_diff(void) +{ + return NS_TO_MS(longest_deadline_diff()); +} + +/* * A task that is not running or queued will not have a node set. * A task that is queued but not running will have a node set. * A task that is currently running will have ->on_cpu set but no node set. @@ -541,17 +666,53 @@ static inline bool task_queued(struct ta return !skiplist_node_empty(&p->node); } +static unsigned long rq_load_avg(struct rq *rq) +{ + return rq->sl->entries * SCHED_CAPACITY_SCALE; +} + /* - * Removing from the global runqueue. Enter with grq locked. Deleting a task + * Update the load average for feeding into cpu frequency governors. Use a + * rough estimate of a rolling average with ~ time constant of 32ms. + * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144 + */ +static void update_load_avg(struct rq *rq) +{ + /* rq clock can go backwards so skip update if that happens */ + if (likely(rq->clock > rq->load_update)) { + unsigned long us_interval = (rq->clock - rq->load_update) >> 10; + long load; + + load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); + if (unlikely(load < 0)) + load = 0; + load += rq->sl->entries * rq_load_avg(rq) * us_interval * 5 / 262144; + rq->load_avg = load; + } + rq->load_update = rq->clock; +} + +/* + * Removing from the runqueue. Enter with rq locked. Deleting a task * from the skip list is done via the stored node reference in the task struct * and does not require a full look up. Thus it occurs in O(k) time where k - * is the "level" of the list the task was stored at - usually < 4, max 16. + * is the "level" of the list the task was stored at - usually < 4, max 8. */ -static void dequeue_task(struct task_struct *p) +static void dequeue_task(struct task_struct *p, struct rq *rq) { - skiplist_delete(grq.sl, &p->node); + skiplist_delete(rq->sl, &p->node); sched_info_dequeued(task_rq(p), p); + update_load_avg(rq); +} + +#ifdef CONFIG_PREEMPT_RCU +static bool rcu_read_critical(struct task_struct *p) +{ + return p->rcu_read_unlock_special.b.blocked; } +#else /* CONFIG_PREEMPT_RCU */ +#define rcu_read_critical(p) (false) +#endif /* CONFIG_PREEMPT_RCU */ /* * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as @@ -559,8 +720,8 @@ static void dequeue_task(struct task_str */ static bool idleprio_suitable(struct task_struct *p) { - return (!freezing(p) && !signal_pending(p) && - !(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING))); + return (!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)) && + !signal_pending(p) && !rcu_read_critical(p) && !freezing(p)); } /* @@ -573,7 +734,18 @@ static bool isoprio_suitable(void) } /* - * Adding to the global runqueue. Enter with grq locked. + * Check to see if p can run on cpu, and if not, whether there are any online + * CPUs it can run on instead. + */ +static inline bool needs_other_cpu(struct task_struct *p, int cpu) +{ + if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed))) + return true; + return false; +} + +/* + * Adding to the runqueue. Enter with rq locked. 
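/*
 * Editor's illustrative sketch (not part of the patch): how the virtual
 * deadline offset described above scales with nice level.  prio_ratios[] is
 * rebuilt here the same way sched_init() does further down (each level ~10%
 * longer than the previous); the base value of 128 and an rr_interval of 6ms
 * are assumptions of this example.  Per the comment above, CPU share scales
 * as the square of the deadline difference, which is where the "nice 19 gets
 * roughly 3% of nice 0" figure comes from.
 */
#include <stdio.h>

#define NICE_WIDTH	40
#define MS_TO_NS(x)	((x) * 1000000ULL)

static unsigned long prio_ratios[NICE_WIDTH];
static const int rr_interval = 6;	/* assumed default, in ms */

static unsigned long long toy_prio_deadline_diff(int user_prio)
{
	return prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128);
}

int main(void)
{
	int i;

	prio_ratios[0] = 128;
	for (i = 1; i < NICE_WIDTH; i++)
		prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;

	/* user_prio 20 is nice 0, user_prio 39 is nice 19 */
	printf("nice  0 deadline offset: %llu ns\n", toy_prio_deadline_diff(20));
	printf("nice 19 deadline offset: %llu ns\n", toy_prio_deadline_diff(39));
	return 0;
}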
*/ static void enqueue_task(struct task_struct *p, struct rq *rq) { @@ -604,17 +776,22 @@ static void enqueue_task(struct task_str sl_id = p->prio; else { sl_id = p->deadline; - /* Set it to cope with 4 left shifts with locality_diff */ - if (p->prio == IDLE_PRIO) - sl_id |= 0x0F00000000000000; + if (idleprio_task(p)) { + /* Set it to cope with 4 left shifts with locality_diff */ + if (p->prio == IDLE_PRIO) + sl_id |= 0x00FF000000000000; + else + sl_id += longest_deadline_diff(); + } } /* * Some architectures don't have better than microsecond resolution * so mask out ~microseconds as the random seed for skiplist insertion. */ - randseed = (grq.niffies >> 10) & 0xFFFFFFFF; - skiplist_insert(grq.sl, &p->node, sl_id, p, randseed); + randseed = (rq->niffies >> 10) & 0xFFFFFFFF; + skiplist_insert(rq->sl, &p->node, sl_id, p, randseed); sched_info_queued(rq, p); + update_load_avg(rq); } static inline void requeue_task(struct task_struct *p) @@ -643,7 +820,7 @@ static inline int task_timeslice(struct static void resched_task(struct task_struct *p); -static inline void resched_curr(struct rq *rq) +static void resched_curr(struct rq *rq) { resched_task(rq->curr); } @@ -655,22 +832,17 @@ static inline void resched_curr(struct r */ static inline void inc_qnr(void) { - grq.qnr++; + atomic_inc(&grq.qnr); } static inline void dec_qnr(void) { - grq.qnr--; + atomic_dec(&grq.qnr); } static inline int queued_notrunning(void) { - return grq.qnr; -} - -static unsigned long rq_load_avg(struct rq *rq) -{ - return rq->soft_affined * SCHED_CAPACITY_SCALE; + return atomic_read(&grq.qnr); } #ifdef CONFIG_SMT_NICE @@ -749,20 +921,33 @@ static bool smt_should_schedule(struct t #define smt_schedule(p, this_rq) (true) #endif /* CONFIG_SMT_NICE */ #ifdef CONFIG_SMP + +static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask) +{ + set_bit(cpu, (volatile unsigned long *)cpumask); +} + /* * The cpu_idle_map stores a bitmap of all the CPUs currently idle to * allow easy lookup of whether any suitable idle CPUs are available. * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the - * idle_cpus variable than to do a full bitmask check when we are busy. + * idle_cpus variable than to do a full bitmask check when we are busy. The + * bits are set atomically but read locklessly as occasional false positive / + * negative is harmless. */ static inline void set_cpuidle_map(int cpu) { if (likely(cpu_online(cpu))) { - cpumask_set_cpu(cpu, &grq.cpu_idle_map); + atomic_set_cpu(cpu, &grq.cpu_idle_map); grq.idle_cpus = true; } } +static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask) +{ + clear_bit(cpu, (volatile unsigned long *)cpumask); +} + static inline void clear_cpuidle_map(int cpu) { cpumask_clear_cpu(cpu, &grq.cpu_idle_map); @@ -932,28 +1117,7 @@ static int effective_prio(struct task_st } /* - * Update the load average for feeding into cpu frequency governors. Use a - * rough estimate of a rolling average with ~ time constant of 32ms. - * 80/128 ~ 0.63. 
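/*
 * Editor's illustrative sketch (not part of the patch): the decay arithmetic
 * behind update_load_avg().  Per microsecond the old load loses a 5/262144
 * fraction, which works out to 80/128 (~0.63) over 32768us, i.e. a rolling
 * average with a time constant of roughly 32ms.  The new-load contribution
 * is simplified here to one capacity-scaled unit per queued task; the
 * capacity constant and the sampling values in main() are assumptions of the
 * example.
 */
#include <stdio.h>

#define TOY_CAPACITY_SCALE 1024L

static long toy_decay_load(long load, unsigned long us_interval,
			   unsigned long entries)
{
	/* Decay the old average: ~63% gone after 32768us. */
	load -= load * (long)us_interval * 5 / 262144;
	if (load < 0)
		load = 0;
	/* Add the contribution of currently queued tasks. */
	load += (long)entries * TOY_CAPACITY_SCALE * (long)us_interval * 5 / 262144;
	return load;
}

int main(void)
{
	long load = 0;
	int i;

	/* Two runnable tasks, sampled every 1000us for 100ms. */
	for (i = 0; i < 100; i++)
		load = toy_decay_load(load, 1000, 2);
	printf("load after 100ms: %ld (full scale per task = %ld)\n",
	       load, TOY_CAPACITY_SCALE);
	return 0;
}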
* 80 / 32768 / 128 == * 5 / 262144 - */ -static void update_load_avg(struct rq *rq) -{ - /* rq clock can go backwards so skip update if that happens */ - if (likely(rq->clock > rq->load_update)) { - unsigned long us_interval = (rq->clock - rq->load_update) >> 10; - long load; - - load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); - if (unlikely(load < 0)) - load = 0; - load += rq->soft_affined * rq_load_avg(rq) * us_interval * 5 / 262144; - rq->load_avg = load; - } - rq->load_update = rq->clock; -} - -/* - * activate_task - move a task to the runqueue. Enter with grq locked. + * activate_task - move a task to the runqueue. Enter with rq locked. */ static void activate_task(struct task_struct *p, struct rq *rq) { @@ -972,79 +1136,80 @@ static void activate_task(struct task_st p->prio = effective_prio(p); if (task_contributes_to_load(p)) - grq.nr_uninterruptible--; + atomic_dec(&grq.nr_uninterruptible); enqueue_task(p, rq); - rq->soft_affined++; p->on_rq = 1; - grq.nr_running++; + atomic_inc(&grq.nr_running); inc_qnr(); update_load_avg(rq); - cpufreq_trigger(grq.niffies, rq->load_avg); + cpufreq_trigger(rq->niffies, rq->load_avg); } /* - * deactivate_task - If it's running, it's not on the grq and we can just - * decrement the nr_running. Enter with grq locked. + * deactivate_task - If it's running, it's not on the runqueue and we can just + * decrement the nr_running. Enter with rq locked. */ static inline void deactivate_task(struct task_struct *p, struct rq *rq) { if (task_contributes_to_load(p)) - grq.nr_uninterruptible++; - rq->soft_affined--; + atomic_inc(&grq.nr_uninterruptible); + p->on_rq = 0; - grq.nr_running--; + atomic_dec(&grq.nr_running); update_load_avg(rq); - cpufreq_trigger(grq.niffies, rq->load_avg); + cpufreq_trigger(rq->niffies, rq->load_avg); } #ifdef CONFIG_SMP void set_task_cpu(struct task_struct *p, unsigned int cpu) { - unsigned int tcpu; + struct rq *rq = task_rq(p); + bool queued; #ifdef CONFIG_LOCKDEP /* - * The caller should hold grq lock. + * The caller should hold either p->pi_lock or rq->lock, when changing + * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. + * + * Furthermore, all task_rq users should acquire both locks, see + * task_rq_lock(). */ - WARN_ON_ONCE(debug_locks && !lockdep_is_held(&grq.lock)); + WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || + lockdep_is_held(&task_rq(p)->lock))); #endif - if ((tcpu = task_cpu(p)) == cpu) + if (task_cpu(p) == cpu) return; trace_sched_migrate_task(p, cpu); perf_event_task_migrate(p); /* - * After ->cpu is set up to a new value, task_grq_lock(p, ...) can be + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be * successfully executed on another CPU. We must ensure that updates of * per-task data have been completed by this moment. */ smp_wmb(); - if (p->on_rq) { - struct rq *rq = task_rq(p); - rq->soft_affined--; - update_load_avg(rq); - rq = cpu_rq(cpu); - rq->soft_affined++; - update_load_avg(rq); - } + if ((queued = task_queued(p))) + dequeue_task(p, rq); task_thread_info(p)->cpu = cpu; + if (queued) + enqueue_task(p, cpu_rq(cpu)); } #endif /* CONFIG_SMP */ /* - * Move a task off the global queue and take it to a cpu for it will + * Move a task off the runqueue and take it to a cpu for it will * become the running task. 
*/ -static inline void take_task(int cpu, struct task_struct *p) +static inline void take_task(struct rq *rq, int cpu, struct task_struct *p) { + dequeue_task(p, task_rq(p)); set_task_cpu(p, cpu); - dequeue_task(p); dec_qnr(); } /* - * Returns a descheduling task to the grq runqueue unless it is being + * Returns a descheduling task to the runqueue unless it is being * deactivated. */ static inline void return_task(struct task_struct *p, struct rq *rq, bool deactivate) @@ -1057,7 +1222,7 @@ static inline void return_task(struct ta } } -/* Enter with grq lock held. We know p is on the local cpu */ +/* Enter with rq lock held. We know p is on the local cpu */ static inline void __set_tsk_resched(struct task_struct *p) { set_tsk_need_resched(p); @@ -1075,11 +1240,10 @@ void resched_task(struct task_struct *p) { int cpu; - lockdep_assert_held(&grq.lock); - if (test_tsk_need_resched(p)) return; + /* We're doing this without holding the rq lock if it's not task_rq */ set_tsk_need_resched(p); cpu = task_cpu(p); @@ -1151,14 +1315,14 @@ unsigned long wait_task_inactive(struct * lock now, to be *sure*. If we're wrong, we'll * just go back and repeat. */ - rq = task_grq_lock(p, &flags); + rq = task_rq_lock(p, &flags); trace_sched_wait_task(p); running = task_running(p); on_rq = p->on_rq; ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_grq_unlock(&flags); + task_rq_unlock(rq, p, &flags); /* * If it changed from the expected state, bail out now. @@ -1271,17 +1435,6 @@ static inline bool online_cpus(struct ta } #endif -/* - * Check to see if p can run on cpu, and if not, whether there are any online - * CPUs it can run on instead. - */ -static inline bool needs_other_cpu(struct task_struct *p, int cpu) -{ - if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed))) - return true; - return false; -} - static void try_preempt(struct task_struct *p, struct rq *this_rq) { int cpu, pcpu, highest_prio, highest_cpu; @@ -1307,6 +1460,8 @@ static void try_preempt(struct task_stru } cpumask_clear_cpu(pcpu, &tmp); } + if (!sched_interactive) + return; highest_prio = latest_deadline = 0; highest_prio_rq = NULL; @@ -1315,19 +1470,15 @@ static void try_preempt(struct task_stru for_each_cpu(cpu, &tmp) { struct rq *rq; int rq_prio; - u64 dl; rq = cpu_rq(cpu); rq_prio = rq->rq_prio; if (rq_prio < highest_prio) continue; - dl = rq->rq_deadline; - if (!sched_interactive && pcpu != cpu) - dl <<= locality_diff(pcpu, rq); if (rq_prio > highest_prio || - deadline_after(dl, latest_deadline)) { - latest_deadline = dl; + deadline_after(rq->rq_deadline, latest_deadline)) { + latest_deadline = rq->rq_deadline; highest_prio = rq_prio; highest_cpu = cpu; highest_prio_rq = rq; @@ -1338,15 +1489,8 @@ static void try_preempt(struct task_stru return; if (!smt_schedule(p, highest_prio_rq)) return; - if (can_preempt(p, highest_prio, latest_deadline)) { - /* - * If we have decided this task should preempt this CPU, - * set the task's CPU to match thereby speeding up matching - * this task in earliest_deadline_task. 
- */ - set_task_cpu(p, highest_cpu); + if (can_preempt(p, highest_prio, latest_deadline)) resched_curr(highest_prio_rq); - } } static int __set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask, bool check); @@ -1411,11 +1555,11 @@ void wake_up_if_idle(int cpu) if (!is_idle_task(rcu_dereference(rq->curr))) goto out; - grq_lock_irqsave(&flags); + rq_lock_irqsave(rq, &flags); if (likely(is_idle_task(rq->curr))) smp_send_reschedule(cpu); /* Else cpu is not in idle, do nothing here */ - grq_unlock_irqrestore(&flags); + rq_unlock_irqrestore(rq, &flags); out: rcu_read_unlock(); @@ -1493,8 +1637,6 @@ static bool try_to_wake_up(struct task_s struct rq *rq; int cpu; - get_cpu(); - /* * If we are going to wake up a thread waiting for CONDITION we * need to ensure that CONDITION=1 done by the caller can not be @@ -1507,7 +1649,7 @@ static bool try_to_wake_up(struct task_s * No need to do time_lock_grq as we only need to update the rq clock * if we activate the task */ - rq = task_grq_lock(p, &flags); + rq = task_rq_lock(p, &flags); cpu = task_cpu(p); /* state is a volatile long, どうして、分からない */ @@ -1525,13 +1667,11 @@ static bool try_to_wake_up(struct task_s out_running: ttwu_post_activation(p, rq, success); out_unlock: - task_grq_unlock(&flags); + task_rq_unlock(rq, p, &flags); if (schedstat_enabled()) ttwu_stat(p, cpu, wake_flags); - put_cpu(); - return success; } @@ -1548,10 +1688,26 @@ static void try_to_wake_up_local(struct struct rq *rq = task_rq(p); bool success = false; - lockdep_assert_held(&grq.lock); + if (WARN_ON_ONCE(rq != this_rq()) || + WARN_ON_ONCE(p == current)) + return; + + lockdep_assert_held(&rq->lock); + + if (!raw_spin_trylock(&p->pi_lock)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet picked a replacement task. + */ + raw_spin_unlock(&rq->lock); + raw_spin_lock(&p->pi_lock); + raw_spin_lock(&rq->lock); + } if (!(p->state & TASK_NORMAL)) - return; + goto out; trace_sched_waking(p); @@ -1566,6 +1722,8 @@ static void try_to_wake_up_local(struct success = true; } ttwu_post_activation(p, rq, success); +out: + raw_spin_unlock(&p->pi_lock); } /** @@ -1591,7 +1749,7 @@ int wake_up_state(struct task_struct *p, return try_to_wake_up(p, state, 0); } -static void time_slice_expired(struct task_struct *p); +static void time_slice_expired(struct task_struct *p, struct rq *rq); /* * Perform scheduler related setup for a newly forked process p. @@ -1599,6 +1757,9 @@ static void time_slice_expired(struct ta */ int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) { + unsigned long flags; + int cpu = get_cpu(); + #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif @@ -1641,12 +1802,21 @@ int sched_fork(unsigned long __maybe_unu p->sched_reset_on_fork = 0; } + /* + * Silence PROVE_RCU. 
+ */ + raw_spin_lock_irqsave(&p->pi_lock, flags); + set_task_cpu(p, cpu); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + #ifdef CONFIG_SCHED_INFO if (unlikely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif p->on_cpu = false; init_task_preempt_count(p); + + put_cpu(); return 0; } @@ -1736,12 +1906,17 @@ static inline void init_schedstats(void) */ void wake_up_new_task(struct task_struct *p) { - struct task_struct *parent; + struct task_struct *parent, *rq_curr; unsigned long flags; struct rq *rq; parent = p->parent; - rq = task_grq_lock(p, &flags); + + raw_spin_lock_irqsave(&p->pi_lock, flags); + if (unlikely(needs_other_cpu(p, task_cpu(p)))) + set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p))); + rq = __task_rq_lock(p); + rq_curr = rq->curr; /* * Reinit new task deadline as its creator deadline could have changed @@ -1750,21 +1925,12 @@ void wake_up_new_task(struct task_struct p->deadline = rq->rq_deadline; /* - * If the task is a new process, current and parent are the same. If - * the task is a new thread in the thread group, it will have much more - * in common with current than with the parent. - */ - set_task_cpu(p, task_cpu(rq->curr)); - - /* * Make sure we do not leak PI boosting priority to the child. */ - p->prio = rq->curr->normal_prio; + p->prio = rq_curr->normal_prio; activate_task(p, rq); trace_sched_wakeup_new(p); - if (unlikely(p->policy == SCHED_FIFO)) - goto after_ts_init; /* * Share the timeslice between parent and child, thus the @@ -1776,33 +1942,37 @@ void wake_up_new_task(struct task_struct * is always equal to current->deadline. */ p->last_ran = rq->rq_last_ran; - if (likely(rq->rq_time_slice >= RESCHED_US * 2)) { + if (likely(rq_curr->policy != SCHED_FIFO)) { rq->rq_time_slice /= 2; - p->time_slice = rq->rq_time_slice; -after_ts_init: - if (rq->curr == parent && !suitable_idle_cpus(p)) { + if (unlikely(rq->rq_time_slice < RESCHED_US)) { /* - * The VM isn't cloned, so we're in a good position to - * do child-runs-first in anticipation of an exec. This - * usually avoids a lot of COW overhead. + * Forking task has run out of timeslice. Reschedule it and + * start its child with a new time slice and deadline. The + * child will end up running first because its deadline will + * be slightly earlier. */ - __set_tsk_resched(parent); - } else - try_preempt(p, rq); - } else { - if (rq->curr == parent) { - /* - * Forking task has run out of timeslice. Reschedule it and - * start its child with a new time slice and deadline. The - * child will end up running first because its deadline will - * be slightly earlier. - */ rq->rq_time_slice = 0; - __set_tsk_resched(parent); + __set_tsk_resched(rq_curr); + time_slice_expired(p, rq); + if (suitable_idle_cpus(p)) + resched_best_idle(p); + } else { + p->time_slice = rq->rq_time_slice; + if (rq_curr == parent && !suitable_idle_cpus(p)) { + /* + * The VM isn't cloned, so we're in a good position to + * do child-runs-first in anticipation of an exec. This + * usually avoids a lot of COW overhead. + */ + __set_tsk_resched(rq_curr); + } else + try_preempt(p, rq); } - time_slice_expired(p); + } else { + time_slice_expired(p, rq); + try_preempt(p, rq); } - task_grq_unlock(&flags); + task_rq_unlock(rq, p, &flags); } #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -1936,7 +2106,7 @@ prepare_task_switch(struct rq *rq, struc * because prev may have moved to another CPU. 
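/*
 * Editor's illustrative sketch (not part of the patch): the timeslice split
 * done by wake_up_new_task() above.  The parent's remaining slice is halved
 * and shared with the child so the total pending timeslice in the system is
 * unchanged; if what remains is below the rescheduling granularity the
 * parent is marked for reschedule and the child starts with a fresh slice
 * and deadline (and, as the comment above notes, ends up running first).
 * The RESCHED_US-style threshold, the struct and the microsecond values are
 * assumptions of this example.
 */
#include <stdbool.h>
#include <stdio.h>

#define TOY_RESCHED_US 128

struct toy_split {
	int parent_slice_us;
	int child_slice_us;
	bool resched_parent;
	bool child_fresh_deadline;
};

static struct toy_split toy_fork_split(int parent_slice_us, int full_slice_us)
{
	struct toy_split r = { 0 };

	r.parent_slice_us = parent_slice_us / 2;
	if (r.parent_slice_us < TOY_RESCHED_US) {
		/* Parent effectively out of slice: reschedule it, give the
		 * child a full new slice and deadline. */
		r.parent_slice_us = 0;
		r.resched_parent = true;
		r.child_slice_us = full_slice_us;
		r.child_fresh_deadline = true;
	} else {
		r.child_slice_us = r.parent_slice_us;
	}
	return r;
}

int main(void)
{
	struct toy_split a = toy_fork_split(6000, 6000);
	struct toy_split b = toy_fork_split(100, 6000);

	printf("plenty left : parent %dus, child %dus\n",
	       a.parent_slice_us, a.child_slice_us);
	printf("nearly empty: parent %dus (resched=%d), child %dus fresh\n",
	       b.parent_slice_us, b.resched_parent, b.child_slice_us);
	return 0;
}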
*/ static struct rq *finish_task_switch(struct task_struct *prev) - __releases(grq.lock) + __releases(rq->lock) { struct rq *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; @@ -1996,7 +2166,7 @@ static struct rq *finish_task_switch(str * @prev: the thread we just switched away from. */ asmlinkage __visible void schedule_tail(struct task_struct *prev) - __releases(grq.lock) + __releases(rq->lock) { struct rq *rq; @@ -2053,7 +2223,7 @@ context_switch(struct rq *rq, struct tas * of the scheduler it's an obvious special-case), so we * do an early lockdep release here: */ - spin_release(&grq.lock.dep_map, 1, _THIS_IP_); + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); @@ -2066,26 +2236,16 @@ context_switch(struct rq *rq, struct tas * nr_running, nr_uninterruptible and nr_context_switches: * * externally visible scheduler statistics: current number of runnable - * threads, total number of context switches performed since bootup. All are - * measured without grabbing the grq lock but the occasional inaccurate result - * doesn't matter so long as it's positive. + * threads, total number of context switches performed since bootup. */ unsigned long nr_running(void) { - long nr = grq.nr_running; - - if (unlikely(nr < 0)) - nr = 0; - return (unsigned long)nr; + return atomic_read(&grq.nr_running); } static unsigned long nr_uninterruptible(void) { - long nu = grq.nr_uninterruptible; - - if (unlikely(nu < 0)) - nu = 0; - return nu; + return atomic_read(&grq.nr_uninterruptible); } /* @@ -2103,7 +2263,7 @@ static unsigned long nr_uninterruptible( */ bool single_task_running(void) { - if (cpu_rq(smp_processor_id())->soft_affined == 1) + if (cpu_rq(smp_processor_id())->sl->entries == 1) return true; else return false; @@ -2112,12 +2272,7 @@ EXPORT_SYMBOL(single_task_running); unsigned long long nr_context_switches(void) { - long long ns = grq.nr_switches; - - /* This is of course impossible */ - if (unlikely(ns < 0)) - ns = 1; - return (unsigned long long)ns; + return (unsigned long long)atomic64_read(&grq.nr_switches); } unsigned long nr_iowait(void) @@ -2149,7 +2304,7 @@ void get_iowait_load(unsigned long *nr_w struct rq *rq = this_rq(); *nr_waiters = atomic_read(&rq->nr_iowait); - *load = rq->soft_affined; + *load = rq->sl->entries; } /* Variables and functions for calc_load */ @@ -2665,7 +2820,7 @@ ts_account: * Return any ns on the sched_clock that have not yet been accounted in * @p in case that task is currently running. * - * Called with task_grq_lock() held. + * Called with task_rq_lock(p) held. */ static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) { @@ -2714,9 +2869,9 @@ unsigned long long task_sched_runtime(st return tsk_seruntime(p); #endif - rq = task_grq_lock(p, &flags); + rq = task_rq_lock(p, &flags); ns = p->sched_time + do_task_delta_exec(p, rq); - task_grq_unlock(&flags); + task_rq_unlock(rq, p, &flags); return ns; } @@ -2965,19 +3120,17 @@ static void task_running_tick(struct rq } else if (rq->rq_time_slice >= RESCHED_US) return; - /* p->time_slice < RESCHED_US. We only modify task_struct under grq lock */ p = rq->curr; - grq_lock(); + rq_lock(rq); requeue_task(p); __set_tsk_resched(p); - grq_unlock(); + rq_unlock(rq); } /* * This function gets called by the timer code, with HZ frequency. - * We call it with interrupts disabled. The data modified is all - * local to struct rq so we don't need to grab grq lock. + * We call it with interrupts disabled. 
*/ void scheduler_tick(void) { @@ -2985,11 +3138,10 @@ void scheduler_tick(void) struct rq *rq = cpu_rq(cpu); sched_clock_tick(); - /* grq lock not grabbed, so only update rq clock */ update_rq_clock(rq); update_cpu_clock_tick(rq, rq->curr); update_load_avg(rq); - cpufreq_trigger(grq.niffies, rq->load_avg); + cpufreq_trigger(rq->niffies, rq->load_avg); if (!rq_idle(rq)) task_running_tick(rq); else @@ -3075,47 +3227,13 @@ static inline void preempt_latency_stop( #endif /* - * Deadline is "now" in niffies + (offset by priority). Setting the deadline - * is the key to everything. It distributes cpu fairly amongst tasks of the - * same nice value, it proportions cpu according to nice level, it means the - * task that last woke up the longest ago has the earliest deadline, thus - * ensuring that interactive tasks get low latency on wake up. The CPU - * proportion works out to the square of the virtual deadline difference, so - * this equation will give nice 19 3% CPU compared to nice 0. - */ -static inline u64 prio_deadline_diff(int user_prio) -{ - return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); -} - -static inline u64 task_deadline_diff(struct task_struct *p) -{ - return prio_deadline_diff(TASK_USER_PRIO(p)); -} - -static inline u64 static_deadline_diff(int static_prio) -{ - return prio_deadline_diff(USER_PRIO(static_prio)); -} - -static inline int longest_deadline_diff(void) -{ - return prio_deadline_diff(39); -} - -static inline int ms_longest_deadline_diff(void) -{ - return NS_TO_MS(longest_deadline_diff()); -} - -/* * The time_slice is only refilled when it is empty and that is when we set a * new deadline. */ -static void time_slice_expired(struct task_struct *p) +static void time_slice_expired(struct task_struct *p, struct rq *rq) { p->time_slice = timeslice(); - p->deadline = grq.niffies + task_deadline_diff(p); + p->deadline = rq->niffies + task_deadline_diff(p); #ifdef CONFIG_SMT_NICE if (!p->mm) p->smt_bias = 0; @@ -3142,10 +3260,10 @@ static void time_slice_expired(struct ta * SCHED_NORMAL tasks. */ -static inline void check_deadline(struct task_struct *p) +static inline void check_deadline(struct task_struct *p, struct rq *rq) { if (p->time_slice < RESCHED_US || batch_task(p)) - time_slice_expired(p); + time_slice_expired(p, rq); } #define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) @@ -3202,46 +3320,60 @@ found_middle: * task in the sorted list, an O(1) operation. The only time it takes longer * is if tasks do not have suitable affinity and then we iterate over entries * till we find the first that does. Worst case here is no tasks with suitable - * affinity and taking O(n). + * affinity and taking O(k) where k is number of processors. + * + * As many runqueues as can be locked without contention are grabbed via + * lock_rqs and only those runqueues are examined. All balancing between CPUs + * is thus done here in an extremely simple first come best fit manner. + * + * This iterates over runqueues in cache locality order. In interactive mode + * it iterates over all CPUs and finds the task with the earliest deadline. + * In non-interactive mode it grabs the first task it finds, being the closest + * to the current CPU in cache locality. 
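/*
 * Editor's illustrative sketch (not part of the patch): the selection logic
 * described in the comment above, stripped of locking and skiplists.  Walk
 * runqueues in cache-locality order; in non-interactive mode take the first
 * eligible task (closest cache), in interactive mode keep scanning for the
 * globally earliest deadline.  toy_rq/toy_task and the "eligible" flag
 * (standing in for the affinity and SMT checks) are assumptions of the
 * example.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_task {
	uint64_t deadline;
	bool eligible;		/* stands in for affinity/SMT suitability */
};

struct toy_rq {
	struct toy_task *head;	/* lowest-keyed task on this rq, or NULL */
};

static struct toy_task *toy_pick(struct toy_rq **rq_order, int nr_rqs,
				 bool interactive)
{
	struct toy_task *best = NULL;
	uint64_t best_deadline = UINT64_MAX;
	int i;

	for (i = 0; i < nr_rqs; i++) {	/* rq_order[0] is the local rq */
		struct toy_task *p = rq_order[i]->head;

		if (!p || !p->eligible)
			continue;
		if (!interactive)
			return p;	/* first fit, best cache locality */
		if (p->deadline < best_deadline) {
			best_deadline = p->deadline;
			best = p;
		}
	}
	return best;
}

int main(void)
{
	struct toy_task t0 = { 500, true }, t1 = { 100, true };
	struct toy_rq rq0 = { &t0 }, rq1 = { &t1 };
	struct toy_rq *order[] = { &rq0, &rq1 };	/* rq0 is "local" */

	printf("non-interactive picks deadline %llu\n",
	       (unsigned long long)toy_pick(order, 2, false)->deadline);
	printf("interactive picks deadline %llu\n",
	       (unsigned long long)toy_pick(order, 2, true)->deadline);
	return 0;
}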
*/ static inline struct task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) { struct task_struct *edt = idle; - skiplist_node *node = &grq.node; u64 earliest_deadline = ~0ULL; + cpumask_t locked; + int i; - while ((node = node->next[0]) != &grq.node) { - struct task_struct *p = node->value; - int tcpu; + lock_rqs(rq, &locked); - /* Make sure affinity is ok */ - if (needs_other_cpu(p, cpu)) + for (i = 0; i < num_possible_cpus(); i++) { + struct rq *other_rq = rq->rq_order[i]; + struct task_struct *p; + skiplist_node *node; + + if (!cpumask_test_cpu(other_rq->cpu, &locked)) continue; + if ((node = other_rq->node.next[0]) == &other_rq->node) + continue; + p = node->value; if (!smt_schedule(p, rq)) continue; - if (!sched_interactive && (tcpu = task_cpu(p)) != cpu) { - u64 dl = p->deadline << locality_diff(tcpu, rq); + /* Make sure affinity is ok */ + if (rq != other_rq && needs_other_cpu(p, cpu)) + continue; - if (unlikely(!deadline_before(dl, earliest_deadline))) - continue; - earliest_deadline = dl; + if (!sched_interactive) { edt = p; - /* We continue even though we've found the earliest - * deadline task as the locality offset means there - * may be a better candidate after it. */ - continue; + break; } - /* This wouldn't happen if we encountered a better deadline from - * another CPU and have already set edt. */ - if (likely(p->deadline < earliest_deadline)) - edt = p; - break; + + if (!deadline_before(p->deadline, earliest_deadline)) + continue; + earliest_deadline = p->deadline; + edt = p; } + if (likely(edt != idle)) - take_task(cpu, edt); + take_task(rq, cpu, edt); + unlock_rqs(rq, &locked); + return edt; } @@ -3294,8 +3426,7 @@ static inline void schedule_debug(struct /* * The currently running task's information is all stored in rq local data - * which is only modified by the local CPU, thereby allowing the data to be - * changed without grabbing the grq lock. + * which is only modified by the local CPU. */ static inline void set_rq_task(struct rq *rq, struct task_struct *p) { @@ -3451,7 +3582,7 @@ static void __sched notrace __schedule(b * done by the caller to avoid the race with signal_wake_up(). */ smp_mb__before_spinlock(); - grq_lock(); + rq_lock(rq); switch_count = &prev->nivcsw; if (!preempt && prev->state) { @@ -3498,7 +3629,7 @@ static void __sched notrace __schedule(b /* Update all the information stored on struct rq */ prev->time_slice = rq->rq_time_slice; prev->deadline = rq->rq_deadline; - check_deadline(prev); + check_deadline(prev, rq); prev->last_ran = rq->clock_task; return_task(prev, rq, deactivate); } @@ -3530,19 +3661,17 @@ static void __sched notrace __schedule(b check_siblings(rq); else wake_siblings(rq); - grq.nr_switches++; + atomic64_inc(&grq.nr_switches); prev->on_cpu = false; next->on_cpu = true; rq->curr = next; ++*switch_count; trace_sched_switch(preempt, prev, next); - rq = context_switch(rq, prev, next); /* unlocks the grq */ - cpu = cpu_of(rq); - idle = rq->idle; + rq = context_switch(rq, prev, next); /* unlocks the rq */ } else { check_siblings(rq); - grq_unlock_irq(); + rq_unlock_irq(rq); } } @@ -3757,13 +3886,12 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - unsigned long flags; - int queued, oldprio; struct rq *rq; + int oldprio; BUG_ON(prio < 0 || prio > MAX_PRIO); - rq = task_grq_lock(p, &flags); + rq = __task_rq_lock(p); /* * Idle task boosting is a nono in general. 
There is one @@ -3785,19 +3913,18 @@ void rt_mutex_setprio(struct task_struct trace_sched_pi_setprio(p, prio); oldprio = p->prio; - queued = task_queued(p); - if (queued) - dequeue_task(p); p->prio = prio; - if (task_running(p) && prio > oldprio) - resched_task(p); - if (queued) { + if (task_running(p)){ + if (prio > oldprio) + resched_task(p); + } else if (task_queued(p)) { + dequeue_task(p, rq); enqueue_task(p, rq); - try_preempt(p, rq); + if (prio < oldprio) + try_preempt(p, rq); } - out_unlock: - task_grq_unlock(&flags); + __task_rq_unlock(rq); } #endif @@ -3813,7 +3940,7 @@ static inline void adjust_deadline(struc void set_user_nice(struct task_struct *p, long nice) { - int queued, new_static, old_static; + int new_static, old_static; unsigned long flags; struct rq *rq; @@ -3824,7 +3951,7 @@ void set_user_nice(struct task_struct *p * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. */ - rq = time_task_grq_lock(p, &flags); + rq = time_task_rq_lock(p, &flags); /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -3835,16 +3962,14 @@ void set_user_nice(struct task_struct *p p->static_prio = new_static; goto out_unlock; } - queued = task_queued(p); - if (queued) - dequeue_task(p); adjust_deadline(p, new_static); old_static = p->static_prio; p->static_prio = new_static; p->prio = effective_prio(p); - if (queued) { + if (task_queued(p)) { + dequeue_task(p, rq); enqueue_task(p, rq); if (new_static < old_static) try_preempt(p, rq); @@ -3854,7 +3979,7 @@ void set_user_nice(struct task_struct *p resched_task(p); } out_unlock: - task_grq_unlock(&flags); + task_rq_unlock(rq, p, &flags); } EXPORT_SYMBOL(set_user_nice); @@ -3925,7 +4050,7 @@ int task_prio(const struct task_struct * goto out; /* Convert to ms to avoid overflows */ - delta = NS_TO_MS(p->deadline - grq.niffies); + delta = NS_TO_MS(p->deadline - task_rq(p)->niffies); delta = delta * 40 / ms_longest_deadline_diff(); if (delta > 0 && delta <= 80) prio += delta; @@ -3968,7 +4093,7 @@ static inline struct task_struct *find_p return pid ? find_task_by_vpid(pid) : current; } -/* Actually do priority change: must hold grq lock. */ +/* Actually do priority change: must hold rq lock. 
*/ static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, int prio, bool keep_boost) { @@ -3994,11 +4119,17 @@ static void __setscheduler(struct task_s p->prio = rt_mutex_get_effective_prio(p, p->normal_prio); } else p->prio = p->normal_prio; + if (task_running(p)) { reset_rq_task(rq, p); /* Resched only if we might now be preempted */ - if (p->prio > oldprio || p->rt_priority > oldrtprio) + if (p->prio > oldprio || p->rt_priority < oldrtprio) resched_task(p); + } else if (task_queued(p)) { + dequeue_task(p, rq); + enqueue_task(p, rq); + if (p->prio < oldprio || p->rt_priority > oldrtprio) + try_preempt(p, rq); } } @@ -4023,8 +4154,8 @@ __sched_setscheduler(struct task_struct const struct sched_param *param, bool user, bool pi) { struct sched_param zero_param = { .sched_priority = 0 }; - int queued, retval, oldpolicy = -1; unsigned long flags, rlim_rtprio = 0; + int retval, oldpolicy = -1; int reset_on_fork; struct rq *rq; @@ -4134,20 +4265,17 @@ recheck: /* * make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: - */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - /* - * To be able to change p->policy safely, the grunqueue lock must be + * + * To be able to change p->policy safely, the runqueue lock must be * held. */ - rq = __task_grq_lock(p); + rq = task_rq_lock(p, &flags); /* * Changing the policy of the stop threads its a very bad idea */ if (p == rq->stop) { - __task_grq_unlock(); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); return -EINVAL; } @@ -4156,32 +4284,21 @@ recheck: */ if (unlikely(policy == p->policy && (!is_rt_policy(policy) || param->sched_priority == p->rt_priority))) { - - __task_grq_unlock(); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); return 0; } /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; - __task_grq_unlock(); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); goto recheck; } update_clocks(rq); p->sched_reset_on_fork = reset_on_fork; - queued = task_queued(p); - if (queued) - dequeue_task(p); __setscheduler(p, rq, policy, param->sched_priority, pi); - if (queued) { - enqueue_task(p, rq); - try_preempt(p, rq); - } - __task_grq_unlock(); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); if (pi) rt_mutex_adjust_pi(p); @@ -4681,9 +4798,9 @@ long sched_getaffinity(pid_t pid, cpumas if (retval) goto out_unlock; - grq_lock_irqsave(&flags); + raw_spin_lock_irqsave(&p->pi_lock, flags); cpumask_and(mask, tsk_cpus_allowed(p), cpu_active_mask); - grq_unlock_irqrestore(&flags); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: rcu_read_unlock(); @@ -4740,9 +4857,10 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t SYSCALL_DEFINE0(sched_yield) { struct task_struct *p; + struct rq *rq; p = current; - grq_lock_irq(); + rq = this_rq_lock(); schedstat_inc(task_rq(p), yld_count); requeue_task(p); @@ -4750,9 +4868,9 @@ SYSCALL_DEFINE0(sched_yield) * Since we are going to call schedule() anyway, there's * no need to preempt or enable interrupts: */ - __release(grq.lock); - spin_release(&grq.lock.dep_map, 1, _THIS_IP_); - do_raw_spin_unlock(&grq.lock); + __release(rq->lock); + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); + do_raw_spin_unlock(&rq->lock); sched_preempt_enable_no_resched(); schedule(); @@ -4862,14 +4980,26 @@ int __sched yield_to(struct task_struct unsigned long flags; int yielded = 0; + 
local_irq_save(flags); rq = this_rq(); - grq_lock_irqsave(&flags); + +again: + p_rq = task_rq(p); + /* + * If we're the only runnable task on the rq and target rq also + * has only one task, there's absolutely no point in yielding. + */ if (task_running(p) || p->state) { yielded = -ESRCH; - goto out_unlock; + goto out_irq; + } + + double_rq_lock(rq, p_rq); + if (task_rq(p) != p_rq) { + double_rq_unlock(rq, p_rq); + goto again; } - p_rq = task_rq(p); yielded = 1; if (p->deadline > rq->rq_deadline) p->deadline = rq->rq_deadline; @@ -4878,9 +5008,10 @@ int __sched yield_to(struct task_struct if (p->time_slice > timeslice()) p->time_slice = timeslice(); if (preempt && rq != p_rq) - resched_curr(p_rq); -out_unlock: - grq_unlock_irqrestore(&flags); + resched_task(p_rq->curr); + double_rq_unlock(rq, p_rq); +out_irq: + local_irq_restore(flags); if (yielded > 0) schedule(); @@ -4986,8 +5117,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, p struct task_struct *p; unsigned int time_slice; unsigned long flags; - int retval; struct timespec t; + struct rq *rq; + int retval; if (pid < 0) return -EINVAL; @@ -5002,9 +5134,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, p if (retval) goto out_unlock; - grq_lock_irqsave(&flags); + rq = task_rq_lock(p, &flags); time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p)); - grq_unlock_irqrestore(&flags); + task_rq_unlock(rq, p, &flags); rcu_read_unlock(); t = ns_to_timespec(time_slice); @@ -5104,7 +5236,21 @@ void set_cpus_allowed_common(struct task void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { + struct rq *rq = task_rq(p); + + lockdep_assert_held(&p->pi_lock); + cpumask_copy(tsk_cpus_allowed(p), new_mask); + + if (task_queued(p)) { + /* + * Because __kthread_bind() calls this on blocked tasks without + * holding rq->lock. + */ + lockdep_assert_held(&rq->lock); + } + if (needs_other_cpu(p, task_cpu(p))) + set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p))); } #endif @@ -5122,7 +5268,7 @@ void init_idle(struct task_struct *idle, unsigned long flags; raw_spin_lock_irqsave(&idle->pi_lock, flags); - time_lock_grq(rq); + raw_spin_lock(&rq->lock); idle->last_ran = rq->clock_task; idle->state = TASK_RUNNING; /* Setting prio to illegal value shouldn't matter when never queued */ @@ -5151,7 +5297,7 @@ void init_idle(struct task_struct *idle, rq->curr = rq->idle = idle; idle->on_cpu = 1; - grq_unlock(); + raw_spin_unlock(&rq->lock); raw_spin_unlock_irqrestore(&idle->pi_lock, flags); /* Set the preempt count _outside_ the spinlocks! 
*/ @@ -5237,11 +5383,12 @@ void wake_up_q(struct wake_q_head *head) void resched_cpu(int cpu) { + struct rq *rq = cpu_rq(cpu); unsigned long flags; - grq_lock_irqsave(&flags); + rq_lock_irqsave(rq, &flags); resched_task(cpu_curr(cpu)); - grq_unlock_irqrestore(&flags); + rq_unlock_irqrestore(rq, &flags); } #ifdef CONFIG_SMP @@ -5368,12 +5515,13 @@ static int __set_cpus_allowed_ptr(struct { const struct cpumask *cpu_valid_mask = cpu_active_mask; bool running_wrong = false; + struct cpumask old_mask; bool queued = false; unsigned long flags; struct rq *rq; int ret = 0; - rq = task_grq_lock(p, &flags); + rq = task_rq_lock(p, &flags); if (p->flags & PF_KTHREAD) { /* @@ -5391,7 +5539,8 @@ static int __set_cpus_allowed_ptr(struct goto out; } - if (cpumask_equal(tsk_cpus_allowed(p), new_mask)) + cpumask_copy(&old_mask, tsk_cpus_allowed(p)); + if (cpumask_equal(&old_mask, new_mask)) goto out; if (!cpumask_intersects(new_mask, cpu_valid_mask)) { @@ -5424,13 +5573,18 @@ static int __set_cpus_allowed_ptr(struct running_wrong = true; } else resched_task(p); - } else - set_task_cpu(p, cpumask_any_and(cpu_valid_mask, new_mask)); + } else { + int dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); + struct rq *dest_rq = cpu_rq(dest_cpu); + lock_second_rq(rq, dest_rq); + set_task_cpu(p, cpumask_any_and(cpu_valid_mask, new_mask)); + rq_unlock(dest_rq); + } out: - if (queued) + if (queued && !cpumask_subset(new_mask, &old_mask)) try_preempt(p, rq); - task_grq_unlock(&flags); + task_rq_unlock(rq, p, &flags); if (running_wrong) preempt_schedule_common(); @@ -5447,8 +5601,11 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); static bool sched_smp_initialized __read_mostly; #ifdef CONFIG_HOTPLUG_CPU -/* Run through task list and find tasks affined to the dead cpu, then remove - * that cpu from the list, enable cpu0 and set the zerobound flag. */ +/* + * Run through task list and find tasks affined to the dead cpu, then remove + * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold + * cpu 0 and src_cpu's runqueue locks. + */ static void bind_zero(int src_cpu) { struct task_struct *p, *t; @@ -5463,6 +5620,11 @@ static void bind_zero(int src_cpu) cpumask_set_cpu(0, tsk_cpus_allowed(p)); p->zerobound = true; bound++; + if (task_cpu(p) == src_cpu) { + set_task_cpu(p, 0); + if (task_running(p)) + resched_task(p); + } } } while_each_thread(t, p); @@ -5876,7 +6038,7 @@ static void rq_attach_root(struct rq *rq struct root_domain *old_rd = NULL; unsigned long flags; - grq_lock_irqsave(&flags); + rq_lock_irqsave(rq, &flags); if (rq->rd) { old_rd = rq->rd; @@ -5902,7 +6064,7 @@ static void rq_attach_root(struct rq *rq if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); - grq_unlock_irqrestore(&flags); + rq_unlock_irqrestore(rq, &flags); if (old_rd) call_rcu_sched(&old_rd->rcu, free_rootdomain); @@ -6881,14 +7043,13 @@ int sched_cpu_activate(unsigned int cpu) * 2) At runtime, if cpuset_cpu_active() fails to rebuild the * domains. 
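/*
 * Editor's illustrative sketch (not part of the patch): how the rq_order[]
 * array built in sched_init_smp() above ends up sorted.  CPUs are bucketed
 * by their cpu_locality value (0 = same CPU, up to 4 = a different NUMA
 * node), nearest buckets first, so the walk in earliest_deadline_task()
 * naturally prefers cache-hot runqueues.  The CPU count and locality table
 * here are made-up example data.
 */
#include <stdio.h>

#define NR_TOY_CPUS 4

/* locality[this][other]: 0 self, 1 SMT sibling, 2 shared cache, 4 other node */
static const int locality[NR_TOY_CPUS][NR_TOY_CPUS] = {
	{ 0, 1, 2, 4 },
	{ 1, 0, 2, 4 },
	{ 2, 2, 0, 4 },
	{ 4, 4, 4, 0 },
};

int main(void)
{
	int cpu, level, other, order[NR_TOY_CPUS][NR_TOY_CPUS];

	for (cpu = 0; cpu < NR_TOY_CPUS; cpu++) {
		int n = 0;

		for (level = 0; level <= 4; level++)
			for (other = 0; other < NR_TOY_CPUS; other++)
				if (locality[cpu][other] == level)
					order[cpu][n++] = other;

		printf("cpu%d scan order:", cpu);
		for (other = 0; other < NR_TOY_CPUS; other++)
			printf(" %d", order[cpu][other]);
		printf("\n");
	}
	return 0;
}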
*/ - grq_lock_irqsave(&flags); + rq_lock_irqsave(rq, &flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_online(rq); } unbind_zero(cpu); - grq.noc = num_online_cpus(); - grq_unlock_irqrestore(&flags); + rq_unlock_irqrestore(rq, &flags); return 0; } @@ -6936,14 +7097,15 @@ int sched_cpu_dying(unsigned int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - grq_lock_irqsave(&flags); + local_irq_save(flags); + double_rq_lock(rq, cpu_rq(0)); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } bind_zero(cpu); - grq.noc = num_online_cpus(); - grq_unlock_irqrestore(&flags); + double_rq_unlock(rq, cpu_rq(0)); + local_irq_restore(flags); return 0; } @@ -7000,8 +7162,8 @@ void __init sched_init_smp(void) #ifdef CONFIG_SCHED_SMT bool smt_threads = false; #endif - cpumask_var_t non_isolated_cpus; + struct rq *rq; alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); @@ -7026,7 +7188,8 @@ void __init sched_init_smp(void) free_cpumask_var(non_isolated_cpus); mutex_lock(&sched_domains_mutex); - grq_lock_irq(); + local_irq_disable(); + lock_all_rqs(); /* * Set up the relative cache distance of each online cpu from each * other in a simple array for quick lookup. Locality is determined @@ -7037,7 +7200,7 @@ void __init sched_init_smp(void) * nodes) are treated as very distant. */ for_each_online_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); + rq = cpu_rq(cpu); /* First check if this cpu is in the same node */ for_each_domain(cpu, sd) { @@ -7076,6 +7239,17 @@ void __init sched_init_smp(void) } #endif } + for_each_possible_cpu(cpu) { + int total_cpus = 0, locality; + + rq = cpu_rq(cpu); + for (locality = 0; locality <= 4; locality++) { + for_each_possible_cpu(other_cpu) { + if (rq->cpu_locality[other_cpu] == locality) + rq->rq_order[total_cpus++] = cpu_rq(other_cpu); + } + } + } #ifdef CONFIG_SMT_NICE if (smt_threads) { check_siblings = &check_smt_siblings; @@ -7083,11 +7257,13 @@ void __init sched_init_smp(void) smt_schedule = &smt_should_schedule; } #endif - grq_unlock_irq(); + unlock_all_rqs(); + local_irq_enable(); mutex_unlock(&sched_domains_mutex); for_each_online_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); + rq = cpu_rq(cpu); + for_each_online_cpu(other_cpu) { if (other_cpu <= cpu) continue; @@ -7145,21 +7321,18 @@ void __init sched_init(void) for (i = 1 ; i < NICE_WIDTH ; i++) prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; - raw_spin_lock_init(&grq.lock); - grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0; - grq.niffies = 0; - grq.last_jiffy = jiffies; + atomic_set(&grq.nr_running, 0); + atomic_set(&grq.nr_uninterruptible, 0); + atomic64_set(&grq.nr_switches, 0); raw_spin_lock_init(&grq.iso_lock); grq.iso_ticks = 0; grq.iso_refractory = false; - grq.noc = 1; - skiplist_init(&grq.node); - grq.sl = new_skiplist(&grq.node); skiplist_node_init(&init_task.node); #ifdef CONFIG_SMP init_defrootdomain(); - grq.qnr = grq.idle_cpus = 0; + atomic_set(&grq.qnr, 0); + grq.idle_cpus = 0; cpumask_clear(&grq.cpu_idle_map); #else uprq = &per_cpu(runqueues, 0); @@ -7174,12 +7347,15 @@ void __init sched_init(void) #endif /* CONFIG_CGROUP_SCHED */ for_each_possible_cpu(i) { rq = cpu_rq(i); - rq->grq_lock = &grq.lock; + skiplist_init(&rq->node); + rq->sl = new_skiplist(&rq->node); + raw_spin_lock_init(&rq->lock); + rq->niffies = 0; + rq->last_jiffy = jiffies; rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc = rq->iowait_pc = rq->idle_pc = 0; rq->dither = false; #ifdef CONFIG_SMP - rq->last_niffy = 0; 
rq->sd = NULL; rq->rd = NULL; rq->online = false; @@ -7212,6 +7388,10 @@ void __init sched_init(void) else rq->cpu_locality[j] = 4; } + rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); + rq->rq_order[0] = rq; + for (j = 1; j < cpu_ids; j++) + rq->rq_order[j] = cpu_rq(j); } #endif @@ -7315,7 +7495,6 @@ static inline void normalise_rt_tasks(vo struct task_struct *g, *p; unsigned long flags; struct rq *rq; - int queued; read_lock(&tasklist_lock); for_each_process_thread(g, p) { @@ -7328,17 +7507,9 @@ static inline void normalise_rt_tasks(vo if (!rt_task(p) && !iso_task(p)) continue; - rq = task_grq_lock(p, &flags); - queued = task_queued(p); - if (queued) - dequeue_task(p); + rq = task_rq_lock(p, &flags); __setscheduler(p, rq, SCHED_NORMAL, 0, false); - if (queued) { - enqueue_task(p, rq); - try_preempt(p, rq); - } - - task_grq_unlock(&flags); + task_rq_unlock(rq, p, &flags); } read_unlock(&tasklist_lock); } Index: linux-4.7-bfs504/include/linux/skip_lists.h =================================================================== --- linux-4.7-bfs504.orig/include/linux/skip_lists.h 2016-09-23 08:59:12.374546442 +1000 +++ linux-4.7-bfs504/include/linux/skip_lists.h 2016-09-27 14:57:30.913307009 +1000 @@ -9,8 +9,8 @@ struct nodeStructure { int level; /* Levels in this structure */ keyType key; valueType value; - skiplist_node *next[16]; - skiplist_node *prev[16]; + skiplist_node *next[8]; + skiplist_node *prev[8]; }; typedef struct listStructure { Index: linux-4.7-bfs504/kernel/sched/bfs_sched.h =================================================================== --- linux-4.7-bfs504.orig/kernel/sched/bfs_sched.h 2016-09-23 08:59:12.373546408 +1000 +++ linux-4.7-bfs504/kernel/sched/bfs_sched.h 2016-10-01 21:08:24.325453196 +1000 @@ -1,5 +1,6 @@ #include #include +#include #include #ifndef BFS_SCHED_H @@ -13,8 +14,7 @@ struct rq { struct task_struct *curr, *idle, *stop; struct mm_struct *prev_mm; - /* Pointer to grq spinlock */ - raw_spinlock_t *grq_lock; + raw_spinlock_t lock; /* Stored data about rq->curr to work outside grq lock */ u64 rq_deadline; @@ -23,7 +23,7 @@ struct rq { u64 rq_last_ran; int rq_prio; bool rq_running; /* There is a task running */ - int soft_affined; /* Running or queued tasks with this set as their rq */ + u64 load_update; /* When we last updated load */ unsigned long load_avg; /* Rolling load average */ #ifdef CONFIG_SMT_NICE @@ -36,6 +36,8 @@ struct rq { iowait_pc, idle_pc; atomic_t nr_iowait; + skiplist_node node; + skiplist *sl; #ifdef CONFIG_SMP int cpu; /* cpu of this runqueue */ bool online; @@ -43,6 +45,10 @@ struct rq { struct root_domain *rd; struct sched_domain *sd; int *cpu_locality; /* CPU relative cache distance */ + struct rq **rq_order; /* RQs ordered by relative cache distance */ + + unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */ + u64 niffies; /* Last time this RQ updated rq clock */ #ifdef CONFIG_SCHED_SMT cpumask_t thread_mask; bool (*siblings_idle)(struct rq *rq); @@ -53,7 +59,6 @@ struct rq { bool (*cache_idle)(struct rq *rq); /* See if all cache siblings are idle */ #endif /* CONFIG_SCHED_MC */ - u64 last_niffy; /* Last time this RQ updated grq.niffies */ #endif /* CONFIG_SMP */ #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; @@ -118,13 +123,13 @@ static inline u64 __rq_clock_broken(stru static inline u64 rq_clock(struct rq *rq) { - lockdep_assert_held(rq->grq_lock); + lockdep_assert_held(&rq->lock); return rq->clock; } static inline u64 rq_clock_task(struct rq *rq) { - lockdep_assert_held(rq->grq_lock); + 
lockdep_assert_held(&rq->lock); return rq->clock_task; } Index: linux-4.7-bfs504/kernel/skip_lists.c =================================================================== --- linux-4.7-bfs504.orig/kernel/skip_lists.c 2016-09-23 08:59:12.374546442 +1000 +++ linux-4.7-bfs504/kernel/skip_lists.c 2016-09-30 06:35:43.081468631 +1000 @@ -33,7 +33,7 @@ occurs in O(log n) time. delnode(slnode, l, node): deletes any binding of key from the l based on the actual node value. This operation occurs in O(k) time where k is the -number of levels of the node in question (max 16). The original delete +number of levels of the node in question (max 8). The original delete function occurred in O(log n) time and involved a search. BFS Notes: In this implementation of skiplists, there are bidirectional @@ -51,7 +51,7 @@ aid of prev<->next pointer manipulation #include #include -#define MaxNumberOfLevels 16 +#define MaxNumberOfLevels 8 #define MaxLevel (MaxNumberOfLevels - 1) void skiplist_init(skiplist_node *slnode) @@ -111,9 +111,7 @@ static inline unsigned int randomLevel(i { unsigned int mask; - if (entries > 31) - mask = 0xF; - else if (entries > 15) + if (entries > 15) mask = 0x7; else if (entries > 7) mask = 0x3; @@ -139,6 +137,8 @@ void skiplist_insert(skiplist *l, skipli } while (--k >= 0); k = randomLevel(++l->entries, randseed); + if (k > MaxLevel) + k = MaxLevel; if (k > l->level) { k = ++l->level; update[k] = l->header; Index: linux-4.7-bfs504/include/linux/sched.h =================================================================== --- linux-4.7-bfs504.orig/include/linux/sched.h 2016-09-23 08:59:12.367546205 +1000 +++ linux-4.7-bfs504/include/linux/sched.h 2016-10-01 10:20:37.000000000 +1000 @@ -1953,7 +1953,6 @@ extern int arch_task_struct_size __read_ #endif #ifdef CONFIG_SCHED_BFS -bool grunqueue_is_locked(void); void grq_unlock_wait(void); void cpu_scaling(int cpu); void cpu_nonscaling(int cpu); @@ -1964,11 +1963,6 @@ static inline void tsk_cpus_current(stru { } -static inline int runqueue_is_locked(int cpu) -{ - return grunqueue_is_locked(); -} - void print_scheduler_version(void); static inline bool iso_task(struct task_struct *p) @@ -1976,7 +1970,6 @@ static inline bool iso_task(struct task_ return (p->policy == SCHED_ISO); } #else /* CFS */ -extern int runqueue_is_locked(int cpu); static inline void cpu_scaling(int cpu) { }
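/*
 * Editor's illustrative sketch (not part of the patch): the occupancy-bounded
 * level choice the skiplist changes above aim at.  With per-runqueue lists,
 * 8 levels (MaxLevel 7) are enough, so the random level is masked down
 * according to how many entries the list currently holds and finally clamped
 * to MaxLevel.  The mask ladder below follows the visible hunk for larger
 * lists; the entries used for small lists are an assumed continuation, not a
 * verbatim copy of randomLevel().
 */
#include <stdio.h>

#define TOY_MAX_LEVEL 7		/* MaxNumberOfLevels 8, minus one */

static unsigned int toy_random_level(int entries, unsigned int randseed)
{
	unsigned int mask, k;

	if (entries > 15)
		mask = 0x7;
	else if (entries > 7)
		mask = 0x3;
	else if (entries > 3)	/* assumed tail of the ladder */
		mask = 0x1;
	else
		return 0;

	k = randseed & mask;
	if (k > TOY_MAX_LEVEL)	/* mirrors the new clamp in skiplist_insert() */
		k = TOY_MAX_LEVEL;
	return k;
}

int main(void)
{
	printf(" 4 entries, seed 0xff -> level %u\n", toy_random_level(4, 0xff));
	printf("20 entries, seed 0xff -> level %u\n", toy_random_level(20, 0xff));
	return 0;
}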