MuQSS (Multiple Queue Skiplist Scheduler - pronounced mux) v0.103 by Con Kolivas.

This is a multiple runqueue skiplist variant of the Brain Fuck Scheduler, designed to provide excellent latency, throughput and scalability to any number of CPUs, with primary emphasis on latency for interactivity and responsiveness.

A multiple runqueue strict fairness earliest effective virtual deadline first design.

Runqueue insertion is O(log(n)), lookup is O(1), removal is amortised O(1).

As on other BFS variants, scalability is optimal when your workload is equal to the number of CPUs; i.e. you should ONLY do make -j4 on quad core, -j2 on dual core and so on.

Interactive mode is enabled by default but can be disabled for improved throughput at the expense of deterministic low latency and fairness:

echo 0 > /proc/sys/kernel/interactive

Features SCHED_IDLEPRIO and SCHED_ISO scheduling policies as well. You do NOT need to use these policies for good performance; they are purely optional for even better performance in extreme conditions.

To run something idleprio, use schedtool like so:

schedtool -D -e make -j4

To run something isoprio, use schedtool like so:

schedtool -I -e amarok

Includes configurable SMT-nice support for better nice level and scheduling policy support across SMT (aka hyperthread) sibling CPUs.

Includes accurate sub-tick accounting of tasks so userspace reported cpu usage may be very different if you have very short lived tasks.

-ck

Index: linux-4.7-bfs504/kernel/sched/bfs.c =================================================================== --- linux-4.7-bfs504.orig/kernel/sched/bfs.c 2016-09-27 14:28:18.223173282 +1000 +++ linux-4.7-bfs504/kernel/sched/bfs.c 2016-10-02 14:13:06.748962233 +1100 @@ -115,7 +115,7 @@ #define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) -#define ISO_PERIOD ((5 * HZ * grq.noc) + 1) +#define ISO_PERIOD ((5 * HZ * num_online_cpus()) + 1) #define SCHED_PRIO(p) ((p) + MAX_RT_PRIO) #define STOP_PRIO (MAX_RT_PRIO - 1) @@ -137,7 +137,7 @@ void print_scheduler_version(void) { - printk(KERN_INFO "BFS CPU scheduler v0.502 by Con Kolivas.\n"); + printk(KERN_INFO "MuQSS CPU scheduler v0.103 by Con Kolivas.\n"); } /* @@ -174,30 +174,21 @@ static inline int timeslice(void) } /* - * The global runqueue data that all CPUs work off. Data is protected either - * by the global grq lock, or the discrete lock that precedes the data in this - * struct. + * The global runqueue data that all CPUs work off. Contains either atomic + * variables or iso variables protected by iso_lock. */ struct global_rq { - raw_spinlock_t lock; - unsigned long nr_running; - unsigned long nr_uninterruptible; - unsigned long long nr_switches; - unsigned long qnr; /* queued not running */ + atomic_t nr_running; + atomic_t nr_uninterruptible; + atomic64_t nr_switches; + atomic_t qnr; /* queued not running */ #ifdef CONFIG_SMP cpumask_t cpu_idle_map; bool idle_cpus; #endif - int noc; /* num_online_cpus stored and updated when it changes */ - u64 niffies; /* Nanosecond jiffies */ - unsigned long last_jiffy; /* Last jiffy we updated niffies */ - raw_spinlock_t iso_lock; int iso_ticks; bool iso_refractory; - - skiplist_node node; - skiplist *sl; }; #ifdef CONFIG_SMP @@ -296,10 +287,16 @@ static inline int cpu_of(struct rq *rq) { return rq->cpu; } +#else /* CONFIG_SMP */ +static inline int cpu_of(struct rq *rq) +{ + return 0; +} +#endif /* * Niffies are a globally increasing nanosecond counter.
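As an aside on the struct global_rq change above: the bookkeeping that previously required grq.lock becomes plain atomics, so any CPU can update or read the counters without serialising on a global lock, and a momentarily stale read is acceptable. A minimal userspace sketch of that pattern, using C11 atomics and illustrative names rather than the kernel API:

#include <stdatomic.h>
#include <stdio.h>

/* Counters that used to live under a global lock become atomics;
 * readers tolerate slightly stale values, so no lock is needed. */
static atomic_long nr_running;
static atomic_long qnr;    /* queued but not currently running */

static void enqueue_one(void)
{
    atomic_fetch_add(&nr_running, 1);
    atomic_fetch_add(&qnr, 1);
}

static void start_running_one(void)
{
    atomic_fetch_sub(&qnr, 1);
}

int main(void)
{
    enqueue_one();
    start_running_one();
    printf("running=%ld queued=%ld\n",
           atomic_load(&nr_running), atomic_load(&qnr));
    return 0;
}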
Whenever a runqueue - * clock is updated with the grq.lock held, it is an opportunity to update the + * clock is updated with the rq->lock held, it is an opportunity to update the * niffies value. Any CPU can update it by adding how much its clock has * increased since it last updated niffies, minus any added niffies by other * CPUs. @@ -311,36 +308,13 @@ static inline void update_clocks(struct update_rq_clock(rq); ndiff = rq->clock - rq->old_clock; - /* old_clock is only updated when we are updating niffies */ rq->old_clock = rq->clock; - ndiff -= grq.niffies - rq->last_niffy; - jdiff = jiffies - grq.last_jiffy; + jdiff = jiffies - rq->last_jiffy; niffy_diff(&ndiff, jdiff); - grq.last_jiffy += jdiff; - grq.niffies += ndiff; - rq->last_niffy = grq.niffies; -} -#else /* CONFIG_SMP */ -static inline int cpu_of(struct rq *rq) -{ - return 0; + rq->last_jiffy += jdiff; + rq->niffies += ndiff; } -static inline void update_clocks(struct rq *rq) -{ - s64 ndiff; - long jdiff; - - update_rq_clock(rq); - ndiff = rq->clock - rq->old_clock; - rq->old_clock = rq->clock; - jdiff = jiffies - grq.last_jiffy; - niffy_diff(&ndiff, jdiff); - grq.last_jiffy += jdiff; - grq.niffies += ndiff; -} -#endif - #include "stats.h" #ifndef prepare_arch_switch @@ -354,10 +328,10 @@ static inline void update_clocks(struct #endif /* - * All common locking functions performed on grq.lock. rq->clock is local to + * All common locking functions performed on rq->lock. rq->clock is local to * the CPU accessing it so it can be modified just with interrupts disabled * when we're not updating niffies. - * Looking up task_rq must be done under grq.lock to be safe. + * Looking up task_rq must be done under rq->lock to be safe. */ static void update_rq_clock_task(struct rq *rq, s64 delta); @@ -376,129 +350,246 @@ static inline bool task_running(struct t return p->on_cpu; } -static inline void grq_lock(void) - __acquires(grq.lock) +static inline void rq_lock(struct rq *rq) + __acquires(rq->lock) { - raw_spin_lock(&grq.lock); + raw_spin_lock(&rq->lock); } -static inline void grq_unlock(void) - __releases(grq.lock) +static inline int rq_trylock(struct rq *rq) + __acquires(rq->lock) { - raw_spin_unlock(&grq.lock); + return raw_spin_trylock(&rq->lock); } -static inline void grq_lock_irq(void) - __acquires(grq.lock) +static inline void rq_unlock(struct rq *rq) + __releases(rq->lock) { - raw_spin_lock_irq(&grq.lock); + raw_spin_unlock(&rq->lock); } -static inline void time_lock_grq(struct rq *rq) - __acquires(grq.lock) +static inline struct rq *this_rq_lock(void) + __acquires(rq->lock) { - grq_lock(); - update_clocks(rq); + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + raw_spin_lock(&rq->lock); + + return rq; } -static inline void grq_unlock_irq(void) - __releases(grq.lock) +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. 
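Each runqueue now carries its own monotonic nanosecond counter ("niffies") advanced from its local clock, rather than all CPUs contending to update a single global counter under grq.lock. A simplified model of that accounting follows; the real update_clocks() additionally sanity-checks the delta against elapsed jiffies via niffy_diff(), which this sketch omits:

#include <stdint.h>

struct toy_rq {
    uint64_t clock;      /* latest sched clock sample, ns */
    uint64_t old_clock;  /* clock value at the previous update */
    uint64_t niffies;    /* monotonic per-queue nanosecond count */
};

/* Advance niffies by however much this queue's clock moved forward,
 * clamping negative movement so the counter never goes backwards. */
void toy_update_clocks(struct toy_rq *rq, uint64_t now)
{
    int64_t ndiff = (int64_t)(now - rq->old_clock);

    rq->clock = now;
    rq->old_clock = now;
    if (ndiff > 0)
        rq->niffies += (uint64_t)ndiff;
}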
+ */ + +/* For when we know rq1 != rq2 */ +static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) { - raw_spin_unlock_irq(&grq.lock); + if (rq1 < rq2) { + raw_spin_lock(&rq1->lock); + raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); + } else { + raw_spin_lock(&rq2->lock); + raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); + } } -static inline void grq_lock_irqsave(unsigned long *flags) - __acquires(grq.lock) +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) { - raw_spin_lock_irqsave(&grq.lock, *flags); + BUG_ON(!irqs_disabled()); + if (rq1 == rq2) { + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ + } else + __double_rq_lock(rq1, rq2); } -static inline void grq_unlock_irqrestore(unsigned long *flags) - __releases(grq.lock) +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) { - raw_spin_unlock_irqrestore(&grq.lock, *flags); + raw_spin_unlock(&rq1->lock); + if (rq1 != rq2) + raw_spin_unlock(&rq2->lock); + else + __release(rq2->lock); } -static inline struct rq -*task_grq_lock(struct task_struct *p, unsigned long *flags) - __acquires(grq.lock) +/* Must be sure rq1 != rq2 and irqs are disabled */ +static inline void lock_second_rq(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __acquires(rq1->lock) + __acquires(rq2->lock) { - grq_lock_irqsave(flags); - return task_rq(p); + BUG_ON(!irqs_disabled()); + if (unlikely(!raw_spin_trylock(&rq2->lock))) { + raw_spin_unlock(&rq1->lock); + __double_rq_lock(rq1, rq2); + } } -static inline struct rq -*time_task_grq_lock(struct task_struct *p, unsigned long *flags) - __acquires(grq.lock) +static inline void lock_all_rqs(void) { - struct rq *rq = task_grq_lock(p, flags); - update_clocks(rq); - return rq; + int cpu; + + preempt_disable(); + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + do_raw_spin_lock(&rq->lock); + } } -static inline struct rq *task_grq_lock_irq(struct task_struct *p) - __acquires(grq.lock) +static inline void unlock_all_rqs(void) { - grq_lock_irq(); - return task_rq(p); + int cpu; + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + do_raw_spin_unlock(&rq->lock); + } + preempt_enable(); } -static inline void time_task_grq_lock_irq(struct task_struct *p) - __acquires(grq.lock) +/* + * Lock this_rq and as many rqs as we can grab with trylock, returning which + * rqs are locked in a bitmask. 
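__double_rq_lock() above avoids ABBA deadlock the standard way: whenever two runqueue locks must be held together, every path acquires them in one global order (ascending lock address). A small userspace sketch of the same discipline using pthread mutexes; toy_rq2 and the function names are illustrative only:

#include <pthread.h>

struct toy_rq2 {
    pthread_mutex_t lock;
    /* per-queue state ... */
};

/* Always take the lower-addressed lock first, so two threads locking the
 * same pair of queues can never end up blocked on each other. */
void toy_double_lock(struct toy_rq2 *a, struct toy_rq2 *b)
{
    if (a == b) {
        pthread_mutex_lock(&a->lock);
        return;
    }
    if (a < b) {
        pthread_mutex_lock(&a->lock);
        pthread_mutex_lock(&b->lock);
    } else {
        pthread_mutex_lock(&b->lock);
        pthread_mutex_lock(&a->lock);
    }
}

void toy_double_unlock(struct toy_rq2 *a, struct toy_rq2 *b)
{
    pthread_mutex_unlock(&a->lock);
    if (a != b)
        pthread_mutex_unlock(&b->lock);
}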
+ */ +static inline void lock_rqs(struct rq *this_rq, cpumask_t *mask) { - struct rq *rq = task_grq_lock_irq(p); + int cpu; + + cpumask_clear(mask); + + for_each_online_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + if (rq != this_rq) { + if (!do_raw_spin_trylock(&rq->lock)) + continue; + spin_acquire(&rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); + } + cpumask_set_cpu(cpu, mask); + } +} + +/* Unlock all rqs in a CPU bitmask */ +static inline void unlock_rqs(struct rq *this_rq, cpumask_t *mask) +{ + int cpu; + + cpumask_clear_cpu(this_rq->cpu, mask); + + for_each_cpu(cpu, mask) { + struct rq *rq = cpu_rq(cpu); + + spin_release(&rq->lock.dep_map, 1, _RET_IP_); + do_raw_spin_unlock(&rq->lock); + } +} + +static inline void rq_lock_irq(struct rq *rq) + __acquires(rq->lock) +{ + raw_spin_lock_irq(&rq->lock); +} + +static inline void time_lock_rq(struct rq *rq) +{ + rq_lock(rq); update_clocks(rq); } -static inline void task_grq_unlock_irq(void) - __releases(grq.lock) +static inline void rq_unlock_irq(struct rq *rq) + __releases(rq->lock) { - grq_unlock_irq(); + raw_spin_unlock_irq(&rq->lock); } -static inline void task_grq_unlock(unsigned long *flags) - __releases(grq.lock) +static inline void rq_lock_irqsave(struct rq *rq, unsigned long *flags) + __acquires(rq->lock) { - grq_unlock_irqrestore(flags); + raw_spin_lock_irqsave(&rq->lock, *flags); } -/** - * grunqueue_is_locked - * - * Returns true if the global runqueue is locked. - * This interface allows printk to be called with the runqueue lock - * held and know whether or not it is OK to wake up the klogd. - */ -bool grunqueue_is_locked(void) +static inline void rq_unlock_irqrestore(struct rq *rq, unsigned long *flags) + __releases(rq->lock) { - return raw_spin_is_locked(&grq.lock); + raw_spin_unlock_irqrestore(&rq->lock, *flags); } -void grq_unlock_wait(void) - __releases(grq.lock) +static inline struct rq +*task_rq_lock(struct task_struct *p, unsigned long *flags) + __acquires(p->pi_lock) + __acquires(rq->lock) { - smp_mb(); /* spin-unlock-wait is not a full memory barrier */ - raw_spin_unlock_wait(&grq.lock); + struct rq *rq; + + while (42) { + raw_spin_lock_irqsave(&p->pi_lock, *flags); + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + break; + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); + } + return rq; } -static inline void time_grq_lock(struct rq *rq, unsigned long *flags) - __acquires(grq.lock) +static inline struct rq +*time_task_rq_lock(struct task_struct *p, unsigned long *flags) { - local_irq_save(*flags); - time_lock_grq(rq); + struct rq *rq = task_rq_lock(p, flags); + + update_clocks(rq); + return rq; } -static inline struct rq *__task_grq_lock(struct task_struct *p) - __acquires(grq.lock) +static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) + __releases(rq->lock) + __releases(p->pi_lock) { - grq_lock(); - return task_rq(p); + rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); } -static inline void __task_grq_unlock(void) - __releases(grq.lock) +static inline struct rq *__task_rq_lock(struct task_struct *p) + __acquires(rq->lock) { - grq_unlock(); + struct rq *rq; + + lockdep_assert_held(&p->pi_lock); + + while (42) { + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + break; + raw_spin_unlock(&rq->lock); + } + return rq; +} + +static inline void __task_rq_unlock(struct rq *rq) +{ + rq_unlock(rq); } static inline void prepare_lock_switch(struct rq *rq, struct task_struct 
*next) @@ -509,16 +600,16 @@ static inline void finish_lock_switch(st { #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ - grq.lock.owner = current; + rq->lock.owner = current; #endif /* * If we are tracking spinlock dependencies then we have to * fix up the runqueue lock - which gets 'carried over' from * prev into current: */ - spin_acquire(&grq.lock.dep_map, 0, 0, _THIS_IP_); + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - grq_unlock_irq(); + raw_spin_unlock_irq(&rq->lock); } static inline bool deadline_before(u64 deadline, u64 time) @@ -532,6 +623,40 @@ static inline bool deadline_after(u64 de } /* + * Deadline is "now" in niffies + (offset by priority). Setting the deadline + * is the key to everything. It distributes cpu fairly amongst tasks of the + * same nice value, it proportions cpu according to nice level, it means the + * task that last woke up the longest ago has the earliest deadline, thus + * ensuring that interactive tasks get low latency on wake up. The CPU + * proportion works out to the square of the virtual deadline difference, so + * this equation will give nice 19 3% CPU compared to nice 0. + */ +static inline u64 prio_deadline_diff(int user_prio) +{ + return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); +} + +static inline u64 task_deadline_diff(struct task_struct *p) +{ + return prio_deadline_diff(TASK_USER_PRIO(p)); +} + +static inline u64 static_deadline_diff(int static_prio) +{ + return prio_deadline_diff(USER_PRIO(static_prio)); +} + +static inline int longest_deadline_diff(void) +{ + return prio_deadline_diff(39); +} + +static inline int ms_longest_deadline_diff(void) +{ + return NS_TO_MS(longest_deadline_diff()); +} + +/* * A task that is not running or queued will not have a node set. * A task that is queued but not running will have a node set. * A task that is currently running will have ->on_cpu set but no node set. @@ -541,17 +666,53 @@ static inline bool task_queued(struct ta return !skiplist_node_empty(&p->node); } +static unsigned long rq_load_avg(struct rq *rq) +{ + return rq->sl->entries * SCHED_CAPACITY_SCALE; +} + /* - * Removing from the global runqueue. Enter with grq locked. Deleting a task + * Update the load average for feeding into cpu frequency governors. Use a + * rough estimate of a rolling average with ~ time constant of 32ms. + * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144 + */ +static void update_load_avg(struct rq *rq) +{ + /* rq clock can go backwards so skip update if that happens */ + if (likely(rq->clock > rq->load_update)) { + unsigned long us_interval = (rq->clock - rq->load_update) >> 10; + long load; + + load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); + if (unlikely(load < 0)) + load = 0; + load += rq->sl->entries * rq_load_avg(rq) * us_interval * 5 / 262144; + rq->load_avg = load; + } + rq->load_update = rq->clock; +} + +/* + * Removing from the runqueue. Enter with rq locked. Deleting a task * from the skip list is done via the stored node reference in the task struct * and does not require a full look up. Thus it occurs in O(k) time where k - * is the "level" of the list the task was stored at - usually < 4, max 16. + * is the "level" of the list the task was stored at - usually < 4, max 8. 
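To make the deadline arithmetic above concrete: prio_ratios[] starts at 128 and grows roughly 10% per nice level, and the deadline offset is that ratio scaled by rr_interval. The stand-alone calculation below assumes the default rr_interval of 6ms; running it shows the nice 19 offset coming out around 6x the nice 0 offset, which squared gives the ~3% CPU share quoted in the comment:

#include <stdint.h>
#include <stdio.h>

#define NICE_WIDTH  40
#define MS_TO_NS(x) ((x) * 1000000ULL)

static uint64_t prio_ratios[NICE_WIDTH];
static const int rr_interval = 6;   /* ms, assumed default */

/* Offset added to "now" (niffies) to form a task's virtual deadline. */
static uint64_t prio_deadline_diff(int user_prio)
{
    return prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128);
}

int main(void)
{
    int i;

    prio_ratios[0] = 128;
    for (i = 1; i < NICE_WIDTH; i++)
        prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;

    /* user_prio 20 is nice 0, user_prio 39 is nice 19 */
    printf("nice  0 offset: %llu ns\n",
           (unsigned long long)prio_deadline_diff(20));
    printf("nice 19 offset: %llu ns\n",
           (unsigned long long)prio_deadline_diff(39));
    return 0;
}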
*/ -static void dequeue_task(struct task_struct *p) +static void dequeue_task(struct task_struct *p, struct rq *rq) { - skiplist_delete(grq.sl, &p->node); + skiplist_delete(rq->sl, &p->node); sched_info_dequeued(task_rq(p), p); + update_load_avg(rq); +} + +#ifdef CONFIG_PREEMPT_RCU +static bool rcu_read_critical(struct task_struct *p) +{ + return p->rcu_read_unlock_special.b.blocked; } +#else /* CONFIG_PREEMPT_RCU */ +#define rcu_read_critical(p) (false) +#endif /* CONFIG_PREEMPT_RCU */ /* * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as @@ -559,8 +720,8 @@ static void dequeue_task(struct task_str */ static bool idleprio_suitable(struct task_struct *p) { - return (!freezing(p) && !signal_pending(p) && - !(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING))); + return (!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)) && + !signal_pending(p) && !rcu_read_critical(p) && !freezing(p)); } /* @@ -573,7 +734,18 @@ static bool isoprio_suitable(void) } /* - * Adding to the global runqueue. Enter with grq locked. + * Check to see if p can run on cpu, and if not, whether there are any online + * CPUs it can run on instead. + */ +static inline bool needs_other_cpu(struct task_struct *p, int cpu) +{ + if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed))) + return true; + return false; +} + +/* + * Adding to the runqueue. Enter with rq locked. */ static void enqueue_task(struct task_struct *p, struct rq *rq) { @@ -604,17 +776,22 @@ static void enqueue_task(struct task_str sl_id = p->prio; else { sl_id = p->deadline; - /* Set it to cope with 4 left shifts with locality_diff */ - if (p->prio == IDLE_PRIO) - sl_id |= 0x0F00000000000000; + if (idleprio_task(p)) { + /* Set it to cope with 4 left shifts with locality_diff */ + if (p->prio == IDLE_PRIO) + sl_id |= 0x00FF000000000000; + else + sl_id += longest_deadline_diff(); + } } /* * Some architectures don't have better than microsecond resolution * so mask out ~microseconds as the random seed for skiplist insertion. */ - randseed = (grq.niffies >> 10) & 0xFFFFFFFF; - skiplist_insert(grq.sl, &p->node, sl_id, p, randseed); + randseed = (rq->niffies >> 10) & 0xFFFFFFFF; + skiplist_insert(rq->sl, &p->node, sl_id, p, randseed); sched_info_queued(rq, p); + update_load_avg(rq); } static inline void requeue_task(struct task_struct *p) @@ -643,7 +820,7 @@ static inline int task_timeslice(struct static void resched_task(struct task_struct *p); -static inline void resched_curr(struct rq *rq) +static void resched_curr(struct rq *rq) { resched_task(rq->curr); } @@ -655,22 +832,17 @@ static inline void resched_curr(struct r */ static inline void inc_qnr(void) { - grq.qnr++; + atomic_inc(&grq.qnr); } static inline void dec_qnr(void) { - grq.qnr--; + atomic_dec(&grq.qnr); } static inline int queued_notrunning(void) { - return grq.qnr; -} - -static unsigned long rq_load_avg(struct rq *rq) -{ - return rq->soft_affined * SCHED_CAPACITY_SCALE; + return atomic_read(&grq.qnr); } #ifdef CONFIG_SMT_NICE @@ -749,20 +921,33 @@ static bool smt_should_schedule(struct t #define smt_schedule(p, this_rq) (true) #endif /* CONFIG_SMT_NICE */ #ifdef CONFIG_SMP + +static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask) +{ + set_bit(cpu, (volatile unsigned long *)cpumask); +} + /* * The cpu_idle_map stores a bitmap of all the CPUs currently idle to * allow easy lookup of whether any suitable idle CPUs are available. 
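The interesting part of enqueue_task() above is the key used to order the per-runqueue skiplist: realtime and ISO tasks sort purely by priority, normal tasks by virtual deadline, and idleprio tasks are pushed behind everything else by offsetting or saturating the key. A hedged sketch of just that key computation, assuming the BFS priority layout and simplified types:

#include <stdint.h>
#include <stdbool.h>

#define MAX_RT_PRIO 100
#define IDLE_PRIO   (MAX_RT_PRIO + 2)   /* assumed BFS layout */

struct toy_task {
    int prio;           /* effective priority */
    uint64_t deadline;  /* virtual deadline in niffies */
    bool idleprio;      /* policy is SCHED_IDLEPRIO */
};

/* Lower keys sort earlier, so the list head is either the best RT/ISO
 * priority or the earliest virtual deadline. */
uint64_t skiplist_key(const struct toy_task *p, uint64_t longest_deadline_diff)
{
    if (p->prio <= MAX_RT_PRIO)         /* RT and ISO: priority alone */
        return (uint64_t)p->prio;
    if (p->idleprio) {
        if (p->prio == IDLE_PRIO)       /* park far behind everything */
            return p->deadline | 0x00FF000000000000ULL;
        return p->deadline + longest_deadline_diff;
    }
    return p->deadline;                 /* normal: earliest deadline first */
}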
* It's cheaper to maintain a binary yes/no if there are any idle CPUs on the - * idle_cpus variable than to do a full bitmask check when we are busy. + * idle_cpus variable than to do a full bitmask check when we are busy. The + * bits are set atomically but read locklessly as occasional false positive / + * negative is harmless. */ static inline void set_cpuidle_map(int cpu) { if (likely(cpu_online(cpu))) { - cpumask_set_cpu(cpu, &grq.cpu_idle_map); + atomic_set_cpu(cpu, &grq.cpu_idle_map); grq.idle_cpus = true; } } +static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask) +{ + clear_bit(cpu, (volatile unsigned long *)cpumask); +} + static inline void clear_cpuidle_map(int cpu) { cpumask_clear_cpu(cpu, &grq.cpu_idle_map); @@ -932,28 +1117,7 @@ static int effective_prio(struct task_st } /* - * Update the load average for feeding into cpu frequency governors. Use a - * rough estimate of a rolling average with ~ time constant of 32ms. - * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144 - */ -static void update_load_avg(struct rq *rq) -{ - /* rq clock can go backwards so skip update if that happens */ - if (likely(rq->clock > rq->load_update)) { - unsigned long us_interval = (rq->clock - rq->load_update) >> 10; - long load; - - load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); - if (unlikely(load < 0)) - load = 0; - load += rq->soft_affined * rq_load_avg(rq) * us_interval * 5 / 262144; - rq->load_avg = load; - } - rq->load_update = rq->clock; -} - -/* - * activate_task - move a task to the runqueue. Enter with grq locked. + * activate_task - move a task to the runqueue. Enter with rq locked. */ static void activate_task(struct task_struct *p, struct rq *rq) { @@ -972,92 +1136,100 @@ static void activate_task(struct task_st p->prio = effective_prio(p); if (task_contributes_to_load(p)) - grq.nr_uninterruptible--; + atomic_dec(&grq.nr_uninterruptible); enqueue_task(p, rq); - rq->soft_affined++; p->on_rq = 1; - grq.nr_running++; + atomic_inc(&grq.nr_running); inc_qnr(); update_load_avg(rq); - cpufreq_trigger(grq.niffies, rq->load_avg); + cpufreq_trigger(rq->niffies, rq->load_avg); } /* - * deactivate_task - If it's running, it's not on the grq and we can just - * decrement the nr_running. Enter with grq locked. + * deactivate_task - If it's running, it's not on the runqueue and we can just + * decrement the nr_running. Enter with rq locked. */ static inline void deactivate_task(struct task_struct *p, struct rq *rq) { if (task_contributes_to_load(p)) - grq.nr_uninterruptible++; - rq->soft_affined--; + atomic_inc(&grq.nr_uninterruptible); + p->on_rq = 0; - grq.nr_running--; + atomic_dec(&grq.nr_running); update_load_avg(rq); - cpufreq_trigger(grq.niffies, rq->load_avg); + cpufreq_trigger(rq->niffies, rq->load_avg); } #ifdef CONFIG_SMP void set_task_cpu(struct task_struct *p, unsigned int cpu) { - unsigned int tcpu; + struct rq *rq = task_rq(p); + bool queued; #ifdef CONFIG_LOCKDEP /* - * The caller should hold grq lock. + * The caller should hold either p->pi_lock or rq->lock, when changing + * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. + * + * Furthermore, all task_rq users should acquire both locks, see + * task_rq_lock(). 
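The idle map above is maintained with atomic bit operations but read without any lock; an occasionally stale answer only costs a slightly suboptimal wakeup target. A compact userspace model of that pattern for up to 64 CPUs, using a single atomic word and illustrative names:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_ulong cpu_idle_map;   /* one bit per CPU, CPUs 0..63 */

void set_cpuidle(int cpu)
{
    atomic_fetch_or(&cpu_idle_map, 1UL << cpu);
}

void clear_cpuidle(int cpu)
{
    atomic_fetch_and(&cpu_idle_map, ~(1UL << cpu));
}

/* Lockless query: may be momentarily stale, which is acceptable because
 * callers only use it as a hint for where to wake a task. */
bool any_idle_cpus(void)
{
    return atomic_load_explicit(&cpu_idle_map, memory_order_relaxed) != 0;
}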
*/ - WARN_ON_ONCE(debug_locks && !lockdep_is_held(&grq.lock)); + WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || + lockdep_is_held(&task_rq(p)->lock))); #endif - if ((tcpu = task_cpu(p)) == cpu) + if (task_cpu(p) == cpu) return; trace_sched_migrate_task(p, cpu); perf_event_task_migrate(p); /* - * After ->cpu is set up to a new value, task_grq_lock(p, ...) can be + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be * successfully executed on another CPU. We must ensure that updates of * per-task data have been completed by this moment. */ smp_wmb(); - if (p->on_rq) { - struct rq *rq = task_rq(p); - rq->soft_affined--; - update_load_avg(rq); - rq = cpu_rq(cpu); - rq->soft_affined++; - update_load_avg(rq); - } + if ((queued = task_queued(p))) + dequeue_task(p, rq); task_thread_info(p)->cpu = cpu; + if (queued) + enqueue_task(p, cpu_rq(cpu)); } #endif /* CONFIG_SMP */ /* - * Move a task off the global queue and take it to a cpu for it will + * Move a task off the runqueue and take it to a cpu for it will * become the running task. */ -static inline void take_task(int cpu, struct task_struct *p) +static inline void take_task(struct rq *rq, int cpu, struct task_struct *p) { + dequeue_task(p, task_rq(p)); set_task_cpu(p, cpu); - dequeue_task(p); dec_qnr(); } /* - * Returns a descheduling task to the grq runqueue unless it is being + * Returns a descheduling task to the runqueue unless it is being * deactivated. */ -static inline void return_task(struct task_struct *p, struct rq *rq, bool deactivate) +static inline bool return_task(struct task_struct *p, struct rq *rq, + int cpu, bool deactivate) { + bool ret = true; + if (deactivate) deactivate_task(p, rq); else { inc_qnr(); - enqueue_task(p, rq); + if (unlikely(needs_other_cpu(p, cpu))) + ret = false; + else + enqueue_task(p, rq); } + return ret; } -/* Enter with grq lock held. We know p is on the local cpu */ +/* Enter with rq lock held. We know p is on the local cpu */ static inline void __set_tsk_resched(struct task_struct *p) { set_tsk_need_resched(p); @@ -1075,11 +1247,10 @@ void resched_task(struct task_struct *p) { int cpu; - lockdep_assert_held(&grq.lock); - if (test_tsk_need_resched(p)) return; + /* We're doing this without holding the rq lock if it's not task_rq */ set_tsk_need_resched(p); cpu = task_cpu(p); @@ -1151,14 +1322,14 @@ unsigned long wait_task_inactive(struct * lock now, to be *sure*. If we're wrong, we'll * just go back and repeat. */ - rq = task_grq_lock(p, &flags); + rq = task_rq_lock(p, &flags); trace_sched_wait_task(p); running = task_running(p); on_rq = p->on_rq; ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_grq_unlock(&flags); + task_rq_unlock(rq, p, &flags); /* * If it changed from the expected state, bail out now. @@ -1271,17 +1442,6 @@ static inline bool online_cpus(struct ta } #endif -/* - * Check to see if p can run on cpu, and if not, whether there are any online - * CPUs it can run on instead. 
- */ -static inline bool needs_other_cpu(struct task_struct *p, int cpu) -{ - if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed))) - return true; - return false; -} - static void try_preempt(struct task_struct *p, struct rq *this_rq) { int cpu, pcpu, highest_prio, highest_cpu; @@ -1307,6 +1467,8 @@ static void try_preempt(struct task_stru } cpumask_clear_cpu(pcpu, &tmp); } + if (!sched_interactive) + return; highest_prio = latest_deadline = 0; highest_prio_rq = NULL; @@ -1315,19 +1477,15 @@ static void try_preempt(struct task_stru for_each_cpu(cpu, &tmp) { struct rq *rq; int rq_prio; - u64 dl; rq = cpu_rq(cpu); rq_prio = rq->rq_prio; if (rq_prio < highest_prio) continue; - dl = rq->rq_deadline; - if (!sched_interactive && pcpu != cpu) - dl <<= locality_diff(pcpu, rq); if (rq_prio > highest_prio || - deadline_after(dl, latest_deadline)) { - latest_deadline = dl; + deadline_after(rq->rq_deadline, latest_deadline)) { + latest_deadline = rq->rq_deadline; highest_prio = rq_prio; highest_cpu = cpu; highest_prio_rq = rq; @@ -1338,15 +1496,8 @@ static void try_preempt(struct task_stru return; if (!smt_schedule(p, highest_prio_rq)) return; - if (can_preempt(p, highest_prio, latest_deadline)) { - /* - * If we have decided this task should preempt this CPU, - * set the task's CPU to match thereby speeding up matching - * this task in earliest_deadline_task. - */ - set_task_cpu(p, highest_cpu); + if (can_preempt(p, highest_prio, latest_deadline)) resched_curr(highest_prio_rq); - } } static int __set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask, bool check); @@ -1411,11 +1562,11 @@ void wake_up_if_idle(int cpu) if (!is_idle_task(rcu_dereference(rq->curr))) goto out; - grq_lock_irqsave(&flags); + rq_lock_irqsave(rq, &flags); if (likely(is_idle_task(rq->curr))) smp_send_reschedule(cpu); /* Else cpu is not in idle, do nothing here */ - grq_unlock_irqrestore(&flags); + rq_unlock_irqrestore(rq, &flags); out: rcu_read_unlock(); @@ -1493,8 +1644,6 @@ static bool try_to_wake_up(struct task_s struct rq *rq; int cpu; - get_cpu(); - /* * If we are going to wake up a thread waiting for CONDITION we * need to ensure that CONDITION=1 done by the caller can not be @@ -1507,7 +1656,7 @@ static bool try_to_wake_up(struct task_s * No need to do time_lock_grq as we only need to update the rq clock * if we activate the task */ - rq = task_grq_lock(p, &flags); + rq = task_rq_lock(p, &flags); cpu = task_cpu(p); /* state is a volatile long, どうして、分からない */ @@ -1525,13 +1674,11 @@ static bool try_to_wake_up(struct task_s out_running: ttwu_post_activation(p, rq, success); out_unlock: - task_grq_unlock(&flags); + task_rq_unlock(rq, p, &flags); if (schedstat_enabled()) ttwu_stat(p, cpu, wake_flags); - put_cpu(); - return success; } @@ -1548,10 +1695,26 @@ static void try_to_wake_up_local(struct struct rq *rq = task_rq(p); bool success = false; - lockdep_assert_held(&grq.lock); + if (WARN_ON_ONCE(rq != this_rq()) || + WARN_ON_ONCE(p == current)) + return; + + lockdep_assert_held(&rq->lock); + + if (!raw_spin_trylock(&p->pi_lock)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet picked a replacement task. 
+ */ + raw_spin_unlock(&rq->lock); + raw_spin_lock(&p->pi_lock); + raw_spin_lock(&rq->lock); + } if (!(p->state & TASK_NORMAL)) - return; + goto out; trace_sched_waking(p); @@ -1566,6 +1729,8 @@ static void try_to_wake_up_local(struct success = true; } ttwu_post_activation(p, rq, success); +out: + raw_spin_unlock(&p->pi_lock); } /** @@ -1591,7 +1756,7 @@ int wake_up_state(struct task_struct *p, return try_to_wake_up(p, state, 0); } -static void time_slice_expired(struct task_struct *p); +static void time_slice_expired(struct task_struct *p, struct rq *rq); /* * Perform scheduler related setup for a newly forked process p. @@ -1599,6 +1764,9 @@ static void time_slice_expired(struct ta */ int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) { + unsigned long flags; + int cpu = get_cpu(); + #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif @@ -1641,12 +1809,21 @@ int sched_fork(unsigned long __maybe_unu p->sched_reset_on_fork = 0; } + /* + * Silence PROVE_RCU. + */ + raw_spin_lock_irqsave(&p->pi_lock, flags); + set_task_cpu(p, cpu); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + #ifdef CONFIG_SCHED_INFO if (unlikely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif p->on_cpu = false; init_task_preempt_count(p); + + put_cpu(); return 0; } @@ -1736,12 +1913,15 @@ static inline void init_schedstats(void) */ void wake_up_new_task(struct task_struct *p) { - struct task_struct *parent; + struct task_struct *parent, *rq_curr; unsigned long flags; struct rq *rq; parent = p->parent; - rq = task_grq_lock(p, &flags); + + raw_spin_lock_irqsave(&p->pi_lock, flags); + rq = __task_rq_lock(p); + rq_curr = rq->curr; /* * Reinit new task deadline as its creator deadline could have changed @@ -1750,21 +1930,12 @@ void wake_up_new_task(struct task_struct p->deadline = rq->rq_deadline; /* - * If the task is a new process, current and parent are the same. If - * the task is a new thread in the thread group, it will have much more - * in common with current than with the parent. - */ - set_task_cpu(p, task_cpu(rq->curr)); - - /* * Make sure we do not leak PI boosting priority to the child. */ - p->prio = rq->curr->normal_prio; + p->prio = rq_curr->normal_prio; activate_task(p, rq); trace_sched_wakeup_new(p); - if (unlikely(p->policy == SCHED_FIFO)) - goto after_ts_init; /* * Share the timeslice between parent and child, thus the @@ -1776,33 +1947,37 @@ void wake_up_new_task(struct task_struct * is always equal to current->deadline. */ p->last_ran = rq->rq_last_ran; - if (likely(rq->rq_time_slice >= RESCHED_US * 2)) { + if (likely(rq_curr->policy != SCHED_FIFO)) { rq->rq_time_slice /= 2; - p->time_slice = rq->rq_time_slice; -after_ts_init: - if (rq->curr == parent && !suitable_idle_cpus(p)) { + if (unlikely(rq->rq_time_slice < RESCHED_US)) { /* - * The VM isn't cloned, so we're in a good position to - * do child-runs-first in anticipation of an exec. This - * usually avoids a lot of COW overhead. + * Forking task has run out of timeslice. Reschedule it and + * start its child with a new time slice and deadline. The + * child will end up running first because its deadline will + * be slightly earlier. */ - __set_tsk_resched(parent); - } else - try_preempt(p, rq); - } else { - if (rq->curr == parent) { - /* - * Forking task has run out of timeslice. Reschedule it and - * start its child with a new time slice and deadline. The - * child will end up running first because its deadline will - * be slightly earlier. 
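The fork path here splits the parent's remaining timeslice with the child so that forking cannot manufacture extra CPU time, and only hands the child a fresh slice (with a new deadline) when the parent's remainder is too small to be worth keeping. A simplified arithmetic sketch of that decision; RESCHED_US and the refill value below are assumed constants, not the kernel's exact numbers:

#include <stdbool.h>
#include <stdio.h>

#define RESCHED_US   128    /* assumed reschedule granularity, us */
#define TIMESLICE_US 6000   /* assumed full slice refill, us */

struct slice_result {
    int parent_us, child_us;
    bool resched_parent;    /* parent exhausted its remainder */
};

/* Halve the parent's remaining slice; if what is left is too small to be
 * useful, expire it and give the child a full fresh slice instead. */
struct slice_result fork_split(int parent_remaining_us)
{
    struct slice_result r = { .resched_parent = false };

    r.parent_us = parent_remaining_us / 2;
    if (r.parent_us < RESCHED_US) {
        r.parent_us = 0;
        r.child_us = TIMESLICE_US;
        r.resched_parent = true;
    } else {
        r.child_us = r.parent_us;
    }
    return r;
}

int main(void)
{
    struct slice_result r = fork_split(5000);

    printf("parent=%dus child=%dus resched=%d\n",
           r.parent_us, r.child_us, r.resched_parent);
    return 0;
}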
- */ rq->rq_time_slice = 0; - __set_tsk_resched(parent); + __set_tsk_resched(rq_curr); + time_slice_expired(p, rq); + if (suitable_idle_cpus(p)) + resched_best_idle(p); + } else { + p->time_slice = rq->rq_time_slice; + if (rq_curr == parent && !suitable_idle_cpus(p)) { + /* + * The VM isn't cloned, so we're in a good position to + * do child-runs-first in anticipation of an exec. This + * usually avoids a lot of COW overhead. + */ + __set_tsk_resched(rq_curr); + } else + try_preempt(p, rq); } - time_slice_expired(p); + } else { + time_slice_expired(p, rq); + try_preempt(p, rq); } - task_grq_unlock(&flags); + task_rq_unlock(rq, p, &flags); } #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -1936,7 +2111,7 @@ prepare_task_switch(struct rq *rq, struc * because prev may have moved to another CPU. */ static struct rq *finish_task_switch(struct task_struct *prev) - __releases(grq.lock) + __releases(rq->lock) { struct rq *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; @@ -1996,7 +2171,7 @@ static struct rq *finish_task_switch(str * @prev: the thread we just switched away from. */ asmlinkage __visible void schedule_tail(struct task_struct *prev) - __releases(grq.lock) + __releases(rq->lock) { struct rq *rq; @@ -2053,7 +2228,7 @@ context_switch(struct rq *rq, struct tas * of the scheduler it's an obvious special-case), so we * do an early lockdep release here: */ - spin_release(&grq.lock.dep_map, 1, _THIS_IP_); + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); @@ -2066,26 +2241,16 @@ context_switch(struct rq *rq, struct tas * nr_running, nr_uninterruptible and nr_context_switches: * * externally visible scheduler statistics: current number of runnable - * threads, total number of context switches performed since bootup. All are - * measured without grabbing the grq lock but the occasional inaccurate result - * doesn't matter so long as it's positive. + * threads, total number of context switches performed since bootup. */ unsigned long nr_running(void) { - long nr = grq.nr_running; - - if (unlikely(nr < 0)) - nr = 0; - return (unsigned long)nr; + return atomic_read(&grq.nr_running); } static unsigned long nr_uninterruptible(void) { - long nu = grq.nr_uninterruptible; - - if (unlikely(nu < 0)) - nu = 0; - return nu; + return atomic_read(&grq.nr_uninterruptible); } /* @@ -2103,7 +2268,7 @@ static unsigned long nr_uninterruptible( */ bool single_task_running(void) { - if (cpu_rq(smp_processor_id())->soft_affined == 1) + if (cpu_rq(smp_processor_id())->sl->entries == 1) return true; else return false; @@ -2112,12 +2277,7 @@ EXPORT_SYMBOL(single_task_running); unsigned long long nr_context_switches(void) { - long long ns = grq.nr_switches; - - /* This is of course impossible */ - if (unlikely(ns < 0)) - ns = 1; - return (unsigned long long)ns; + return (unsigned long long)atomic64_read(&grq.nr_switches); } unsigned long nr_iowait(void) @@ -2149,7 +2309,7 @@ void get_iowait_load(unsigned long *nr_w struct rq *rq = this_rq(); *nr_waiters = atomic_read(&rq->nr_iowait); - *load = rq->soft_affined; + *load = rq->sl->entries; } /* Variables and functions for calc_load */ @@ -2665,7 +2825,7 @@ ts_account: * Return any ns on the sched_clock that have not yet been accounted in * @p in case that task is currently running. * - * Called with task_grq_lock() held. + * Called with task_rq_lock(p) held. 
*/ static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) { @@ -2714,9 +2874,9 @@ unsigned long long task_sched_runtime(st return tsk_seruntime(p); #endif - rq = task_grq_lock(p, &flags); + rq = task_rq_lock(p, &flags); ns = p->sched_time + do_task_delta_exec(p, rq); - task_grq_unlock(&flags); + task_rq_unlock(rq, p, &flags); return ns; } @@ -2965,19 +3125,17 @@ static void task_running_tick(struct rq } else if (rq->rq_time_slice >= RESCHED_US) return; - /* p->time_slice < RESCHED_US. We only modify task_struct under grq lock */ p = rq->curr; - grq_lock(); + rq_lock(rq); requeue_task(p); __set_tsk_resched(p); - grq_unlock(); + rq_unlock(rq); } /* * This function gets called by the timer code, with HZ frequency. - * We call it with interrupts disabled. The data modified is all - * local to struct rq so we don't need to grab grq lock. + * We call it with interrupts disabled. */ void scheduler_tick(void) { @@ -2985,11 +3143,10 @@ void scheduler_tick(void) struct rq *rq = cpu_rq(cpu); sched_clock_tick(); - /* grq lock not grabbed, so only update rq clock */ update_rq_clock(rq); update_cpu_clock_tick(rq, rq->curr); update_load_avg(rq); - cpufreq_trigger(grq.niffies, rq->load_avg); + cpufreq_trigger(rq->niffies, rq->load_avg); if (!rq_idle(rq)) task_running_tick(rq); else @@ -3075,47 +3232,13 @@ static inline void preempt_latency_stop( #endif /* - * Deadline is "now" in niffies + (offset by priority). Setting the deadline - * is the key to everything. It distributes cpu fairly amongst tasks of the - * same nice value, it proportions cpu according to nice level, it means the - * task that last woke up the longest ago has the earliest deadline, thus - * ensuring that interactive tasks get low latency on wake up. The CPU - * proportion works out to the square of the virtual deadline difference, so - * this equation will give nice 19 3% CPU compared to nice 0. - */ -static inline u64 prio_deadline_diff(int user_prio) -{ - return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); -} - -static inline u64 task_deadline_diff(struct task_struct *p) -{ - return prio_deadline_diff(TASK_USER_PRIO(p)); -} - -static inline u64 static_deadline_diff(int static_prio) -{ - return prio_deadline_diff(USER_PRIO(static_prio)); -} - -static inline int longest_deadline_diff(void) -{ - return prio_deadline_diff(39); -} - -static inline int ms_longest_deadline_diff(void) -{ - return NS_TO_MS(longest_deadline_diff()); -} - -/* * The time_slice is only refilled when it is empty and that is when we set a * new deadline. */ -static void time_slice_expired(struct task_struct *p) +static void time_slice_expired(struct task_struct *p, struct rq *rq) { p->time_slice = timeslice(); - p->deadline = grq.niffies + task_deadline_diff(p); + p->deadline = rq->niffies + task_deadline_diff(p); #ifdef CONFIG_SMT_NICE if (!p->mm) p->smt_bias = 0; @@ -3142,10 +3265,10 @@ static void time_slice_expired(struct ta * SCHED_NORMAL tasks. */ -static inline void check_deadline(struct task_struct *p) +static inline void check_deadline(struct task_struct *p, struct rq *rq) { if (p->time_slice < RESCHED_US || batch_task(p)) - time_slice_expired(p); + time_slice_expired(p, rq); } #define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) @@ -3202,46 +3325,67 @@ found_middle: * task in the sorted list, an O(1) operation. The only time it takes longer * is if tasks do not have suitable affinity and then we iterate over entries * till we find the first that does. Worst case here is no tasks with suitable - * affinity and taking O(n). 
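For reference, the load figure fed to cpufreq in scheduler_tick() above is the simple exponential decay described earlier: roughly a 32ms time constant, applied as a multiply by 5/262144 per elapsed microsecond. A userspace sketch of that update rule, with the instantaneous load approximated as runnable tasks times SCHED_CAPACITY_SCALE:

#include <stdint.h>

#define SCHED_CAPACITY_SCALE 1024

/* Exponential decay with roughly a 32ms time constant: each elapsed
 * microsecond removes ~5/262144 of the old average and blends in the
 * same fraction of the instantaneous load. */
long toy_update_load_avg(long load_avg, unsigned long us_interval,
                         unsigned long runnable_tasks)
{
    long instant = (long)(runnable_tasks * SCHED_CAPACITY_SCALE);

    load_avg -= load_avg * (long)us_interval * 5 / 262144;
    if (load_avg < 0)
        load_avg = 0;
    load_avg += instant * (long)us_interval * 5 / 262144;
    return load_avg;
}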
+ * affinity and taking O(k) where k is number of processors. + * + * As many runqueues as can be locked without contention are grabbed via + * lock_rqs and only those runqueues are examined. All balancing between CPUs + * is thus done here in an extremely simple first come best fit manner. + * + * This iterates over runqueues in cache locality order. In interactive mode + * it iterates over all CPUs and finds the task with the earliest deadline. + * In non-interactive mode it grabs any task on the local runqueue or the + * busiest nearest cache CPU. */ static inline struct task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) { struct task_struct *edt = idle; - skiplist_node *node = &grq.node; u64 earliest_deadline = ~0ULL; + int busiest = 0, i; + cpumask_t locked; - while ((node = node->next[0]) != &grq.node) { - struct task_struct *p = node->value; - int tcpu; + lock_rqs(rq, &locked); - /* Make sure affinity is ok */ - if (needs_other_cpu(p, cpu)) + for (i = 0; i < num_possible_cpus(); i++) { + struct rq *other_rq = rq->rq_order[i]; + struct task_struct *p; + skiplist_node *node; + + if (!cpumask_test_cpu(other_rq->cpu, &locked)) continue; + if ((node = other_rq->node.next[0]) == &other_rq->node) + continue; + p = node->value; if (!smt_schedule(p, rq)) continue; - if (!sched_interactive && (tcpu = task_cpu(p)) != cpu) { - u64 dl = p->deadline << locality_diff(tcpu, rq); + /* Make sure affinity is ok */ + if (rq != other_rq && needs_other_cpu(p, cpu)) + continue; - if (unlikely(!deadline_before(dl, earliest_deadline))) - continue; - earliest_deadline = dl; - edt = p; - /* We continue even though we've found the earliest - * deadline task as the locality offset means there - * may be a better candidate after it. */ + if (!sched_interactive) { + if (rq == other_rq) { + edt = p; + break; + } + if (other_rq->sl->entries > busiest) { + edt = p; + busiest = other_rq->sl->entries; + } continue; } - /* This wouldn't happen if we encountered a better deadline from - * another CPU and have already set edt. */ - if (likely(p->deadline < earliest_deadline)) - edt = p; - break; + + if (!deadline_before(p->deadline, earliest_deadline)) + continue; + earliest_deadline = p->deadline; + edt = p; } + if (likely(edt != idle)) - take_task(cpu, edt); + take_task(rq, cpu, edt); + unlock_rqs(rq, &locked); + return edt; } @@ -3294,8 +3438,7 @@ static inline void schedule_debug(struct /* * The currently running task's information is all stored in rq local data - * which is only modified by the local CPU, thereby allowing the data to be - * changed without grabbing the grq lock. + * which is only modified by the local CPU. */ static inline void set_rq_task(struct rq *rq, struct task_struct *p) { @@ -3308,10 +3451,6 @@ static inline void set_rq_task(struct rq rq->rq_mm = p->mm; rq->rq_smt_bias = p->smt_bias; #endif - if (p != rq->idle) - rq->rq_running = true; - else - rq->rq_running = false; } static void reset_rq_task(struct rq *rq, struct task_struct *p) @@ -3379,6 +3518,21 @@ static void wake_siblings(struct rq __ma #endif /* + * For when a running task has its affinity changed and can no longer run on + * the current runqueue and needs to be put on another out of __schedule(). + */ +static void queue_other_rq(struct task_struct *p) +{ + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(p, &flags); + if (likely(!task_queued(p))) + enqueue_task(p, rq); + task_rq_unlock(rq, p, &flags); +} + +/* * schedule() is the main scheduler function. 
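The selection loop above boils down to: walk the runqueues in cache-locality order, skip any that could not be trylocked, and take either the first local (or busiest nearby) task in non-interactive mode, or the globally earliest deadline in interactive mode. A stripped-down sketch of the interactive-mode scan, with plain arrays standing in for the locked runqueues and their skiplist heads:

#include <stdint.h>
#include <stddef.h>

struct toy_queue {
    int locked;             /* did trylock succeed? */
    int has_task;
    uint64_t head_deadline; /* deadline of its earliest queued task */
};

/* Return the index of the queue whose head task has the earliest
 * deadline among the queues we managed to lock, or -1 if none. */
int pick_earliest(const struct toy_queue *order, size_t nr_queues)
{
    uint64_t earliest = UINT64_MAX;
    int best = -1;
    size_t i;

    for (i = 0; i < nr_queues; i++) {
        if (!order[i].locked || !order[i].has_task)
            continue;
        if (order[i].head_deadline < earliest) {
            earliest = order[i].head_deadline;
            best = (int)i;
        }
    }
    return best;
}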
* * The main means of driving the scheduler and thus entering this function are: @@ -3419,7 +3573,7 @@ static void wake_siblings(struct rq __ma */ static void __sched notrace __schedule(bool preempt) { - struct task_struct *prev, *next, *idle; + struct task_struct *prev, *next, *idle, *queue = NULL; unsigned long *switch_count; bool deactivate = false; struct rq *rq; @@ -3451,7 +3605,7 @@ static void __sched notrace __schedule(b * done by the caller to avoid the race with signal_wake_up(). */ smp_mb__before_spinlock(); - grq_lock(); + rq_lock(rq); switch_count = &prev->nivcsw; if (!preempt && prev->state) { @@ -3498,16 +3652,13 @@ static void __sched notrace __schedule(b /* Update all the information stored on struct rq */ prev->time_slice = rq->rq_time_slice; prev->deadline = rq->rq_deadline; - check_deadline(prev); + check_deadline(prev, rq); prev->last_ran = rq->clock_task; - return_task(prev, rq, deactivate); + if (!return_task(prev, rq, cpu, deactivate)) + queue = prev; } if (unlikely(!queued_notrunning())) { - /* - * This CPU is now truly idle as opposed to when idle is - * scheduled as a high priority task in its own right. - */ next = idle; schedstat_inc(rq, sched_goidle); set_cpuidle_map(cpu); @@ -3530,19 +3681,19 @@ static void __sched notrace __schedule(b check_siblings(rq); else wake_siblings(rq); - grq.nr_switches++; + atomic64_inc(&grq.nr_switches); prev->on_cpu = false; next->on_cpu = true; rq->curr = next; ++*switch_count; trace_sched_switch(preempt, prev, next); - rq = context_switch(rq, prev, next); /* unlocks the grq */ - cpu = cpu_of(rq); - idle = rq->idle; + rq = context_switch(rq, prev, next); /* unlocks the rq */ + if (unlikely(queue)) + queue_other_rq(queue); } else { check_siblings(rq); - grq_unlock_irq(); + rq_unlock_irq(rq); } } @@ -3757,13 +3908,12 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - unsigned long flags; - int queued, oldprio; struct rq *rq; + int oldprio; BUG_ON(prio < 0 || prio > MAX_PRIO); - rq = task_grq_lock(p, &flags); + rq = __task_rq_lock(p); /* * Idle task boosting is a nono in general. There is one @@ -3785,19 +3935,18 @@ void rt_mutex_setprio(struct task_struct trace_sched_pi_setprio(p, prio); oldprio = p->prio; - queued = task_queued(p); - if (queued) - dequeue_task(p); p->prio = prio; - if (task_running(p) && prio > oldprio) - resched_task(p); - if (queued) { + if (task_running(p)){ + if (prio > oldprio) + resched_task(p); + } else if (task_queued(p)) { + dequeue_task(p, rq); enqueue_task(p, rq); - try_preempt(p, rq); + if (prio < oldprio) + try_preempt(p, rq); } - out_unlock: - task_grq_unlock(&flags); + __task_rq_unlock(rq); } #endif @@ -3813,7 +3962,7 @@ static inline void adjust_deadline(struc void set_user_nice(struct task_struct *p, long nice) { - int queued, new_static, old_static; + int new_static, old_static; unsigned long flags; struct rq *rq; @@ -3824,7 +3973,7 @@ void set_user_nice(struct task_struct *p * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. 
*/ - rq = time_task_grq_lock(p, &flags); + rq = time_task_rq_lock(p, &flags); /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -3835,16 +3984,14 @@ void set_user_nice(struct task_struct *p p->static_prio = new_static; goto out_unlock; } - queued = task_queued(p); - if (queued) - dequeue_task(p); adjust_deadline(p, new_static); old_static = p->static_prio; p->static_prio = new_static; p->prio = effective_prio(p); - if (queued) { + if (task_queued(p)) { + dequeue_task(p, rq); enqueue_task(p, rq); if (new_static < old_static) try_preempt(p, rq); @@ -3854,7 +4001,7 @@ void set_user_nice(struct task_struct *p resched_task(p); } out_unlock: - task_grq_unlock(&flags); + task_rq_unlock(rq, p, &flags); } EXPORT_SYMBOL(set_user_nice); @@ -3925,7 +4072,7 @@ int task_prio(const struct task_struct * goto out; /* Convert to ms to avoid overflows */ - delta = NS_TO_MS(p->deadline - grq.niffies); + delta = NS_TO_MS(p->deadline - task_rq(p)->niffies); delta = delta * 40 / ms_longest_deadline_diff(); if (delta > 0 && delta <= 80) prio += delta; @@ -3968,7 +4115,7 @@ static inline struct task_struct *find_p return pid ? find_task_by_vpid(pid) : current; } -/* Actually do priority change: must hold grq lock. */ +/* Actually do priority change: must hold rq lock. */ static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, int prio, bool keep_boost) { @@ -3994,11 +4141,17 @@ static void __setscheduler(struct task_s p->prio = rt_mutex_get_effective_prio(p, p->normal_prio); } else p->prio = p->normal_prio; + if (task_running(p)) { reset_rq_task(rq, p); /* Resched only if we might now be preempted */ - if (p->prio > oldprio || p->rt_priority > oldrtprio) + if (p->prio > oldprio || p->rt_priority < oldrtprio) resched_task(p); + } else if (task_queued(p)) { + dequeue_task(p, rq); + enqueue_task(p, rq); + if (p->prio < oldprio || p->rt_priority > oldrtprio) + try_preempt(p, rq); } } @@ -4023,8 +4176,8 @@ __sched_setscheduler(struct task_struct const struct sched_param *param, bool user, bool pi) { struct sched_param zero_param = { .sched_priority = 0 }; - int queued, retval, oldpolicy = -1; unsigned long flags, rlim_rtprio = 0; + int retval, oldpolicy = -1; int reset_on_fork; struct rq *rq; @@ -4134,20 +4287,17 @@ recheck: /* * make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: - */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - /* - * To be able to change p->policy safely, the grunqueue lock must be + * + * To be able to change p->policy safely, the runqueue lock must be * held. 
*/ - rq = __task_grq_lock(p); + rq = task_rq_lock(p, &flags); /* * Changing the policy of the stop threads its a very bad idea */ if (p == rq->stop) { - __task_grq_unlock(); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); return -EINVAL; } @@ -4156,32 +4306,21 @@ recheck: */ if (unlikely(policy == p->policy && (!is_rt_policy(policy) || param->sched_priority == p->rt_priority))) { - - __task_grq_unlock(); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); return 0; } /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; - __task_grq_unlock(); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); goto recheck; } update_clocks(rq); p->sched_reset_on_fork = reset_on_fork; - queued = task_queued(p); - if (queued) - dequeue_task(p); __setscheduler(p, rq, policy, param->sched_priority, pi); - if (queued) { - enqueue_task(p, rq); - try_preempt(p, rq); - } - __task_grq_unlock(); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); if (pi) rt_mutex_adjust_pi(p); @@ -4681,9 +4820,9 @@ long sched_getaffinity(pid_t pid, cpumas if (retval) goto out_unlock; - grq_lock_irqsave(&flags); + raw_spin_lock_irqsave(&p->pi_lock, flags); cpumask_and(mask, tsk_cpus_allowed(p), cpu_active_mask); - grq_unlock_irqrestore(&flags); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: rcu_read_unlock(); @@ -4740,9 +4879,10 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t SYSCALL_DEFINE0(sched_yield) { struct task_struct *p; + struct rq *rq; p = current; - grq_lock_irq(); + rq = this_rq_lock(); schedstat_inc(task_rq(p), yld_count); requeue_task(p); @@ -4750,9 +4890,9 @@ SYSCALL_DEFINE0(sched_yield) * Since we are going to call schedule() anyway, there's * no need to preempt or enable interrupts: */ - __release(grq.lock); - spin_release(&grq.lock.dep_map, 1, _THIS_IP_); - do_raw_spin_unlock(&grq.lock); + __release(rq->lock); + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); + do_raw_spin_unlock(&rq->lock); sched_preempt_enable_no_resched(); schedule(); @@ -4862,14 +5002,26 @@ int __sched yield_to(struct task_struct unsigned long flags; int yielded = 0; + local_irq_save(flags); rq = this_rq(); - grq_lock_irqsave(&flags); + +again: + p_rq = task_rq(p); + /* + * If we're the only runnable task on the rq and target rq also + * has only one task, there's absolutely no point in yielding. + */ if (task_running(p) || p->state) { yielded = -ESRCH; - goto out_unlock; + goto out_irq; + } + + double_rq_lock(rq, p_rq); + if (task_rq(p) != p_rq) { + double_rq_unlock(rq, p_rq); + goto again; } - p_rq = task_rq(p); yielded = 1; if (p->deadline > rq->rq_deadline) p->deadline = rq->rq_deadline; @@ -4878,9 +5030,10 @@ int __sched yield_to(struct task_struct if (p->time_slice > timeslice()) p->time_slice = timeslice(); if (preempt && rq != p_rq) - resched_curr(p_rq); -out_unlock: - grq_unlock_irqrestore(&flags); + resched_task(p_rq->curr); + double_rq_unlock(rq, p_rq); +out_irq: + local_irq_restore(flags); if (yielded > 0) schedule(); @@ -4986,8 +5139,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, p struct task_struct *p; unsigned int time_slice; unsigned long flags; - int retval; struct timespec t; + struct rq *rq; + int retval; if (pid < 0) return -EINVAL; @@ -5002,9 +5156,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, p if (retval) goto out_unlock; - grq_lock_irqsave(&flags); + rq = task_rq_lock(p, &flags); time_slice = p->policy == SCHED_FIFO ? 
0 : MS_TO_NS(task_timeslice(p)); - grq_unlock_irqrestore(&flags); + task_rq_unlock(rq, p, &flags); rcu_read_unlock(); t = ns_to_timespec(time_slice); @@ -5104,7 +5258,21 @@ void set_cpus_allowed_common(struct task void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { + struct rq *rq = task_rq(p); + + lockdep_assert_held(&p->pi_lock); + cpumask_copy(tsk_cpus_allowed(p), new_mask); + + if (task_queued(p)) { + /* + * Because __kthread_bind() calls this on blocked tasks without + * holding rq->lock. + */ + lockdep_assert_held(&rq->lock); + } + if (needs_other_cpu(p, task_cpu(p))) + set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p))); } #endif @@ -5122,7 +5290,7 @@ void init_idle(struct task_struct *idle, unsigned long flags; raw_spin_lock_irqsave(&idle->pi_lock, flags); - time_lock_grq(rq); + raw_spin_lock(&rq->lock); idle->last_ran = rq->clock_task; idle->state = TASK_RUNNING; /* Setting prio to illegal value shouldn't matter when never queued */ @@ -5151,7 +5319,7 @@ void init_idle(struct task_struct *idle, rq->curr = rq->idle = idle; idle->on_cpu = 1; - grq_unlock(); + raw_spin_unlock(&rq->lock); raw_spin_unlock_irqrestore(&idle->pi_lock, flags); /* Set the preempt count _outside_ the spinlocks! */ @@ -5237,11 +5405,12 @@ void wake_up_q(struct wake_q_head *head) void resched_cpu(int cpu) { + struct rq *rq = cpu_rq(cpu); unsigned long flags; - grq_lock_irqsave(&flags); + rq_lock_irqsave(rq, &flags); resched_task(cpu_curr(cpu)); - grq_unlock_irqrestore(&flags); + rq_unlock_irqrestore(rq, &flags); } #ifdef CONFIG_SMP @@ -5368,12 +5537,13 @@ static int __set_cpus_allowed_ptr(struct { const struct cpumask *cpu_valid_mask = cpu_active_mask; bool running_wrong = false; + struct cpumask old_mask; bool queued = false; unsigned long flags; struct rq *rq; int ret = 0; - rq = task_grq_lock(p, &flags); + rq = task_rq_lock(p, &flags); if (p->flags & PF_KTHREAD) { /* @@ -5391,7 +5561,8 @@ static int __set_cpus_allowed_ptr(struct goto out; } - if (cpumask_equal(tsk_cpus_allowed(p), new_mask)) + cpumask_copy(&old_mask, tsk_cpus_allowed(p)); + if (cpumask_equal(&old_mask, new_mask)) goto out; if (!cpumask_intersects(new_mask, cpu_valid_mask)) { @@ -5424,13 +5595,18 @@ static int __set_cpus_allowed_ptr(struct running_wrong = true; } else resched_task(p); - } else - set_task_cpu(p, cpumask_any_and(cpu_valid_mask, new_mask)); + } else { + int dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); + struct rq *dest_rq = cpu_rq(dest_cpu); + lock_second_rq(rq, dest_rq); + set_task_cpu(p, cpumask_any_and(cpu_valid_mask, new_mask)); + rq_unlock(dest_rq); + } out: - if (queued) + if (queued && !cpumask_subset(new_mask, &old_mask)) try_preempt(p, rq); - task_grq_unlock(&flags); + task_rq_unlock(rq, p, &flags); if (running_wrong) preempt_schedule_common(); @@ -5447,8 +5623,11 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); static bool sched_smp_initialized __read_mostly; #ifdef CONFIG_HOTPLUG_CPU -/* Run through task list and find tasks affined to the dead cpu, then remove - * that cpu from the list, enable cpu0 and set the zerobound flag. */ +/* + * Run through task list and find tasks affined to the dead cpu, then remove + * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold + * cpu 0 and src_cpu's runqueue locks. 
+ */ static void bind_zero(int src_cpu) { struct task_struct *p, *t; @@ -5463,6 +5642,11 @@ static void bind_zero(int src_cpu) cpumask_set_cpu(0, tsk_cpus_allowed(p)); p->zerobound = true; bound++; + if (task_cpu(p) == src_cpu) { + set_task_cpu(p, 0); + if (task_running(p)) + resched_task(p); + } } } while_each_thread(t, p); @@ -5876,7 +6060,7 @@ static void rq_attach_root(struct rq *rq struct root_domain *old_rd = NULL; unsigned long flags; - grq_lock_irqsave(&flags); + rq_lock_irqsave(rq, &flags); if (rq->rd) { old_rd = rq->rd; @@ -5902,7 +6086,7 @@ static void rq_attach_root(struct rq *rq if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); - grq_unlock_irqrestore(&flags); + rq_unlock_irqrestore(rq, &flags); if (old_rd) call_rcu_sched(&old_rd->rcu, free_rootdomain); @@ -6881,14 +7065,13 @@ int sched_cpu_activate(unsigned int cpu) * 2) At runtime, if cpuset_cpu_active() fails to rebuild the * domains. */ - grq_lock_irqsave(&flags); + rq_lock_irqsave(rq, &flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_online(rq); } unbind_zero(cpu); - grq.noc = num_online_cpus(); - grq_unlock_irqrestore(&flags); + rq_unlock_irqrestore(rq, &flags); return 0; } @@ -6936,14 +7119,15 @@ int sched_cpu_dying(unsigned int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - grq_lock_irqsave(&flags); + local_irq_save(flags); + double_rq_lock(rq, cpu_rq(0)); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } bind_zero(cpu); - grq.noc = num_online_cpus(); - grq_unlock_irqrestore(&flags); + double_rq_unlock(rq, cpu_rq(0)); + local_irq_restore(flags); return 0; } @@ -7000,8 +7184,8 @@ void __init sched_init_smp(void) #ifdef CONFIG_SCHED_SMT bool smt_threads = false; #endif - cpumask_var_t non_isolated_cpus; + struct rq *rq; alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); @@ -7026,7 +7210,8 @@ void __init sched_init_smp(void) free_cpumask_var(non_isolated_cpus); mutex_lock(&sched_domains_mutex); - grq_lock_irq(); + local_irq_disable(); + lock_all_rqs(); /* * Set up the relative cache distance of each online cpu from each * other in a simple array for quick lookup. Locality is determined @@ -7037,7 +7222,7 @@ void __init sched_init_smp(void) * nodes) are treated as very distant. 
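The locality ranks being set up here later drive rq->rq_order: each CPU's list of runqueues sorted from most to least cache-local, which is the order the pick-next loop walks. A small sketch of building such an order from a per-CPU locality rank; the rank meanings in the comment (0 = this CPU through 4 = other node) follow the usual MuQSS scheme but should be read as illustrative:

#define MAX_LOCALITY 4

/* Build an iteration order for one CPU: its own queue first, then all
 * others grouped by increasing locality rank (SMT sibling, shared cache,
 * same node, other node). locality[] is relative to that CPU. */
void build_rq_order(const int *locality, int nr_cpus, int *order)
{
    int rank, cpu, n = 0;

    for (rank = 0; rank <= MAX_LOCALITY; rank++)
        for (cpu = 0; cpu < nr_cpus; cpu++)
            if (locality[cpu] == rank)
                order[n++] = cpu;
}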
 	 */
 	for_each_online_cpu(cpu) {
-		struct rq *rq = cpu_rq(cpu);
+		rq = cpu_rq(cpu);
 		/* First check if this cpu is in the same node */
 		for_each_domain(cpu, sd) {
@@ -7076,6 +7261,17 @@ void __init sched_init_smp(void)
 		}
 #endif
 	}
+	for_each_possible_cpu(cpu) {
+		int total_cpus = 0, locality;
+
+		rq = cpu_rq(cpu);
+		for (locality = 0; locality <= 4; locality++) {
+			for_each_possible_cpu(other_cpu) {
+				if (rq->cpu_locality[other_cpu] == locality)
+					rq->rq_order[total_cpus++] = cpu_rq(other_cpu);
+			}
+		}
+	}
 #ifdef CONFIG_SMT_NICE
 	if (smt_threads) {
 		check_siblings = &check_smt_siblings;
@@ -7083,11 +7279,13 @@ void __init sched_init_smp(void)
 		smt_schedule = &smt_should_schedule;
 	}
 #endif
-	grq_unlock_irq();
+	unlock_all_rqs();
+	local_irq_enable();
 	mutex_unlock(&sched_domains_mutex);
 	for_each_online_cpu(cpu) {
-		struct rq *rq = cpu_rq(cpu);
+		rq = cpu_rq(cpu);
+
 		for_each_online_cpu(other_cpu) {
 			if (other_cpu <= cpu)
 				continue;
@@ -7145,21 +7343,18 @@ void __init sched_init(void)
 	for (i = 1 ; i < NICE_WIDTH ; i++)
 		prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;
-	raw_spin_lock_init(&grq.lock);
-	grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0;
-	grq.niffies = 0;
-	grq.last_jiffy = jiffies;
+	atomic_set(&grq.nr_running, 0);
+	atomic_set(&grq.nr_uninterruptible, 0);
+	atomic64_set(&grq.nr_switches, 0);
 	raw_spin_lock_init(&grq.iso_lock);
 	grq.iso_ticks = 0;
 	grq.iso_refractory = false;
-	grq.noc = 1;
-	skiplist_init(&grq.node);
-	grq.sl = new_skiplist(&grq.node);
 	skiplist_node_init(&init_task.node);
 #ifdef CONFIG_SMP
 	init_defrootdomain();
-	grq.qnr = grq.idle_cpus = 0;
+	atomic_set(&grq.qnr, 0);
+	grq.idle_cpus = 0;
 	cpumask_clear(&grq.cpu_idle_map);
 #else
 	uprq = &per_cpu(runqueues, 0);
@@ -7174,12 +7369,16 @@ void __init sched_init(void)
 #endif /* CONFIG_CGROUP_SCHED */
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
-		rq->grq_lock = &grq.lock;
+		skiplist_init(&rq->node);
+		rq->sl = new_skiplist(&rq->node);
+		raw_spin_lock_init(&rq->lock);
+		rq->niffies = 0;
+		rq->last_jiffy = jiffies;
 		rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc =
 			      rq->iowait_pc = rq->idle_pc = 0;
 		rq->dither = false;
+		set_rq_task(rq, &init_task);
 #ifdef CONFIG_SMP
-		rq->last_niffy = 0;
 		rq->sd = NULL;
 		rq->rd = NULL;
 		rq->online = false;
@@ -7212,6 +7411,10 @@ void __init sched_init(void)
 			else
 				rq->cpu_locality[j] = 4;
 		}
+		rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC);
+		rq->rq_order[0] = rq;
+		for (j = 1; j < cpu_ids; j++)
+			rq->rq_order[j] = cpu_rq(j);
 	}
 #endif
@@ -7315,7 +7518,6 @@ static inline void normalise_rt_tasks(vo
 	struct task_struct *g, *p;
 	unsigned long flags;
 	struct rq *rq;
-	int queued;
 	read_lock(&tasklist_lock);
 	for_each_process_thread(g, p) {
@@ -7328,17 +7530,9 @@ static inline void normalise_rt_tasks(vo
 		if (!rt_task(p) && !iso_task(p))
 			continue;
-		rq = task_grq_lock(p, &flags);
-		queued = task_queued(p);
-		if (queued)
-			dequeue_task(p);
+		rq = task_rq_lock(p, &flags);
 		__setscheduler(p, rq, SCHED_NORMAL, 0, false);
-		if (queued) {
-			enqueue_task(p, rq);
-			try_preempt(p, rq);
-		}
-
-		task_grq_unlock(&flags);
+		task_rq_unlock(rq, p, &flags);
 	}
 	read_unlock(&tasklist_lock);
 }
Index: linux-4.7-bfs504/include/linux/skip_lists.h
===================================================================
--- linux-4.7-bfs504.orig/include/linux/skip_lists.h	2016-09-23 08:59:12.374546442 +1000
+++ linux-4.7-bfs504/include/linux/skip_lists.h	2016-09-27 14:57:30.913307009 +1000
@@ -9,8 +9,8 @@ struct nodeStructure {
 	int level;	/* Levels in this structure */
 	keyType key;
 	valueType value;
-	skiplist_node *next[16];
-	skiplist_node *prev[16];
+	skiplist_node *next[8];
+	skiplist_node *prev[8];
 };
 typedef struct listStructure {
Index: linux-4.7-bfs504/kernel/sched/bfs_sched.h
===================================================================
--- linux-4.7-bfs504.orig/kernel/sched/bfs_sched.h	2016-09-23 08:59:12.373546408 +1000
+++ linux-4.7-bfs504/kernel/sched/bfs_sched.h	2016-10-02 14:13:06.749962225 +1100
@@ -1,5 +1,6 @@
 #include
 #include
+#include
 #include
 #ifndef BFS_SCHED_H
@@ -13,8 +14,7 @@ struct rq {
 	struct task_struct *curr, *idle, *stop;
 	struct mm_struct *prev_mm;
-	/* Pointer to grq spinlock */
-	raw_spinlock_t *grq_lock;
+	raw_spinlock_t lock;
 	/* Stored data about rq->curr to work outside grq lock */
 	u64 rq_deadline;
@@ -22,8 +22,7 @@ struct rq {
 	int rq_time_slice;
 	u64 rq_last_ran;
 	int rq_prio;
-	bool rq_running; /* There is a task running */
-	int soft_affined; /* Running or queued tasks with this set as their rq */
+	u64 load_update; /* When we last updated load */
 	unsigned long load_avg; /* Rolling load average */
 #ifdef CONFIG_SMT_NICE
@@ -36,6 +35,8 @@ struct rq {
 			iowait_pc, idle_pc;
 	atomic_t nr_iowait;
+	skiplist_node node;
+	skiplist *sl;
 #ifdef CONFIG_SMP
 	int cpu;		/* cpu of this runqueue */
 	bool online;
@@ -43,6 +44,10 @@ struct rq {
 	struct root_domain *rd;
 	struct sched_domain *sd;
 	int *cpu_locality; /* CPU relative cache distance */
+	struct rq **rq_order; /* RQs ordered by relative cache distance */
+
+	unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */
+	u64 niffies; /* Last time this RQ updated rq clock */
 #ifdef CONFIG_SCHED_SMT
 	cpumask_t thread_mask;
 	bool (*siblings_idle)(struct rq *rq);
@@ -53,7 +58,6 @@ struct rq {
 	bool (*cache_idle)(struct rq *rq); /* See if all cache siblings are idle */
 #endif /* CONFIG_SCHED_MC */
-	u64 last_niffy; /* Last time this RQ updated grq.niffies */
 #endif /* CONFIG_SMP */
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 	u64 prev_irq_time;
@@ -118,13 +122,13 @@ static inline u64 __rq_clock_broken(stru
 static inline u64 rq_clock(struct rq *rq)
 {
-	lockdep_assert_held(rq->grq_lock);
+	lockdep_assert_held(&rq->lock);
 	return rq->clock;
 }
 static inline u64 rq_clock_task(struct rq *rq)
 {
-	lockdep_assert_held(rq->grq_lock);
+	lockdep_assert_held(&rq->lock);
 	return rq->clock_task;
 }
Index: linux-4.7-bfs504/kernel/skip_lists.c
===================================================================
--- linux-4.7-bfs504.orig/kernel/skip_lists.c	2016-09-23 08:59:12.374546442 +1000
+++ linux-4.7-bfs504/kernel/skip_lists.c	2016-09-30 06:35:43.081468631 +1000
@@ -33,7 +33,7 @@ occurs in O(log n) time.
 delnode(slnode, l, node): deletes any binding of key from the l based on
 the actual node value. This operation occurs in O(k) time where k is the
-number of levels of the node in question (max 16). The original delete
+number of levels of the node in question (max 8). The original delete
 function occurred in O(log n) time and involved a search.
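With one skiplist per runqueue, task placement needs a cheap way to scan other CPUs from nearest cache to farthest, which is what the rq_order array built by the sched_init_smp() hunks above provides: one pass per locality value, 0 through 4, appending the runqueues that sit at that distance. The self-contained sketch below only mirrors that ordering pass; the toy_ names and the 4-CPU locality table are invented for illustration and are not taken from the patch.

#include <stdio.h>

#define NR_TOY_CPUS 4

/* toy_locality[a][b]: relative cache distance from CPU a to CPU b */
static const int toy_locality[NR_TOY_CPUS][NR_TOY_CPUS] = {
	{ 0, 1, 3, 3 },
	{ 1, 0, 3, 3 },
	{ 3, 3, 0, 1 },
	{ 3, 3, 1, 0 },
};

static void toy_build_rq_order(int cpu, int order[NR_TOY_CPUS])
{
	int total = 0;

	/* Nearer buckets are appended first, so order[0] is always cpu itself. */
	for (int locality = 0; locality <= 4; locality++)
		for (int other = 0; other < NR_TOY_CPUS; other++)
			if (toy_locality[cpu][other] == locality)
				order[total++] = other;
}

int main(void)
{
	int order[NR_TOY_CPUS];

	toy_build_rq_order(1, order);
	printf("CPU 1 scans runqueues in order:");
	for (int i = 0; i < NR_TOY_CPUS; i++)
		printf(" %d", order[i]);
	printf("\n");	/* expected: 1 0 2 3 */
	return 0;
}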
 BFS Notes: In this implementation of skiplists, there are bidirectional
@@ -51,7 +51,7 @@ aid of prev<->next pointer manipulation
 #include
 #include
-#define MaxNumberOfLevels 16
+#define MaxNumberOfLevels 8
 #define MaxLevel (MaxNumberOfLevels - 1)
 void skiplist_init(skiplist_node *slnode)
@@ -111,9 +111,7 @@ static inline unsigned int randomLevel(i
 {
 	unsigned int mask;
-	if (entries > 31)
-		mask = 0xF;
-	else if (entries > 15)
+	if (entries > 15)
 		mask = 0x7;
 	else if (entries > 7)
 		mask = 0x3;
@@ -139,6 +137,8 @@ void skiplist_insert(skiplist *l, skipli
 	} while (--k >= 0);
 	k = randomLevel(++l->entries, randseed);
+	if (k > MaxLevel)
+		k = MaxLevel;
 	if (k > l->level) {
 		k = ++l->level;
 		update[k] = l->header;
Index: linux-4.7-bfs504/include/linux/sched.h
===================================================================
--- linux-4.7-bfs504.orig/include/linux/sched.h	2016-09-23 08:59:12.367546205 +1000
+++ linux-4.7-bfs504/include/linux/sched.h	2016-10-01 10:20:37.000000000 +1000
@@ -1953,7 +1953,6 @@ extern int arch_task_struct_size __read_
 #endif
 #ifdef CONFIG_SCHED_BFS
-bool grunqueue_is_locked(void);
 void grq_unlock_wait(void);
 void cpu_scaling(int cpu);
 void cpu_nonscaling(int cpu);
@@ -1964,11 +1963,6 @@ static inline void tsk_cpus_current(stru
 {
 }
-static inline int runqueue_is_locked(int cpu)
-{
-	return grunqueue_is_locked();
-}
-
 void print_scheduler_version(void);
 static inline bool iso_task(struct task_struct *p)
@@ -1976,7 +1970,6 @@ static inline bool iso_task(struct task_
 	return (p->policy == SCHED_ISO);
 }
 #else /* CFS */
-extern int runqueue_is_locked(int cpu);
 static inline void cpu_scaling(int cpu)
 {
 }
Index: linux-4.7-bfs504/include/linux/init_task.h
===================================================================
--- linux-4.7-bfs504.orig/include/linux/init_task.h	2016-10-02 14:13:06.751962209 +1100
+++ linux-4.7-bfs504/include/linux/init_task.h	2016-10-02 14:13:06.749962225 +1100
@@ -204,7 +204,7 @@ extern struct task_group root_task_group
 	.restart_block = {	\
 		.fn = do_no_restart_syscall,	\
 	},	\
-	.time_slice = HZ,	\
+	.time_slice = 1000000,	\
 	.tasks = LIST_HEAD_INIT(tsk.tasks),	\
 	INIT_PUSHABLE_TASKS(tsk)	\
 	.ptraced = LIST_HEAD_INIT(tsk.ptraced),	\
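Because every CPU now carries its own, much smaller skiplist instead of one global queue, the node fan-out is halved in the hunks above (next[16]/prev[16] become next[8]/prev[8], MaxNumberOfLevels drops to 8) and skiplist_insert() clamps the randomly chosen level to MaxLevel. The sketch below shows only the shape of that level choice under the new bounds; the two mask branches marked "from the patch" come from the hunk, while the smaller-table branches and the final randseed & mask step are assumptions made for illustration, not the kernel's code.

#include <stdio.h>

#define TOY_MAX_LEVELS	8
#define TOY_MAX_LEVEL	(TOY_MAX_LEVELS - 1)

static unsigned int toy_random_level(int entries, unsigned int randseed)
{
	unsigned int mask;

	if (entries > 15)		/* from the patch */
		mask = 0x7;
	else if (entries > 7)		/* from the patch */
		mask = 0x3;
	else if (entries > 3)		/* assumed */
		mask = 0x1;
	else				/* assumed */
		return 0;

	return randseed & mask;		/* assumed combine step */
}

/* skiplist_insert() additionally clamps the result, as the hunk above adds. */
static unsigned int toy_insert_level(int entries, unsigned int randseed)
{
	unsigned int k = toy_random_level(entries, randseed);

	if (k > TOY_MAX_LEVEL)
		k = TOY_MAX_LEVEL;
	return k;
}

int main(void)
{
	/* With only a few queued tasks per runqueue the level stays small. */
	printf("entries=4  -> level %u\n", toy_insert_level(4, 0x5bd1e995u));
	printf("entries=64 -> level %u\n", toy_insert_level(64, 0x5bd1e995u));
	return 0;
}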