Use upgradeable read/write locks in place of the grq spinlock. Separate out the paths that need only read access, need write access, or are indeterminate, and use the appropriate variant in each to decrease lock contention and improve scalability by allowing more read-side code to run in parallel. Where code paths are not performance critical, just use the wlock variants. -ck --- kernel/sched/bfs.c | 386 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 250 insertions(+), 136 deletions(-) Index: linux-3.10-ck1/kernel/sched/bfs.c =================================================================== --- linux-3.10-ck1.orig/kernel/sched/bfs.c 2013-07-11 22:15:25.000000000 +1000 +++ linux-3.10-ck1/kernel/sched/bfs.c 2013-07-11 22:30:59.077006521 +1000 @@ -175,7 +175,7 @@ * struct. */ struct global_rq { - raw_spinlock_t lock; + urwlock_t urw; unsigned long nr_running; unsigned long nr_uninterruptible; unsigned long long nr_switches; @@ -380,7 +380,7 @@ /* * Niffies are a globally increasing nanosecond counter. Whenever a runqueue - * clock is updated with the grq.lock held, it is an opportunity to update the + * clock is updated with the grq lock held, it is an opportunity to update the * niffies value. Any CPU can update it by adding how much its clock has * increased since it last updated niffies, minus any added niffies by other * CPUs. @@ -441,10 +441,11 @@ #endif /* - * All common locking functions performed on grq.lock. rq->clock is local to + * All common locking functions performed on grq lock. rq->clock is local to * the CPU accessing it so it can be modified just with interrupts disabled - * when we're not updating niffies. - * Looking up task_rq must be done under grq.lock to be safe. + * when we're not updating niffies. Some variables are redundant to the + * behaviour and purely there as a prompt to know why the lock was taken. + * Looking up task_rq must be done under rlock to be safe.
*/ static void update_rq_clock_task(struct rq *rq, s64 delta); @@ -461,90 +462,177 @@ return p->on_cpu; } -static inline void grq_lock(void) - __acquires(grq.lock) +static inline void grq_wlock(void) { - raw_spin_lock(&grq.lock); + urw_wlock(&grq.urw); } -static inline void grq_unlock(void) - __releases(grq.lock) +static inline void grq_wunlock(void) { - raw_spin_unlock(&grq.lock); + urw_wunlock(&grq.urw); } -static inline void grq_lock_irq(void) - __acquires(grq.lock) +static inline void grq_rlock(void) { - raw_spin_lock_irq(&grq.lock); + urw_rlock(&grq.urw); } -static inline void time_lock_grq(struct rq *rq) - __acquires(grq.lock) +static inline void grq_runlock(void) { - grq_lock(); + urw_runlock(&grq.urw); +} + +static inline void grq_ulock(void) +{ + urw_ulock(&grq.urw); +} + +static inline void grq_uunlock(void) +{ + urw_uunlock(&grq.urw); +} + +static inline void grq_upgrade(void) +{ + urw_upgrade(&grq.urw); +} + +static inline void grq_udowngrade(void) +{ + urw_udowngrade(&grq.urw); +} + +static inline void grq_wdowngrade(void) +{ + urw_wdowngrade(&grq.urw); +} + +static inline void grq_wlock_irq(void) +{ + urw_wlock_irq(&grq.urw); +} + +static inline void grq_ulock_irq(void) +{ + urw_ulock_irq(&grq.urw); +} + +static inline void time_wlock_grq(struct rq *rq) +{ + grq_wlock(); update_clocks(rq); } -static inline void grq_unlock_irq(void) - __releases(grq.lock) +static inline void grq_wunlock_irq(void) +{ + urw_wunlock_irq(&grq.urw); +} + +static inline void grq_runlock_irq(void) { - raw_spin_unlock_irq(&grq.lock); + urw_runlock_irq(&grq.urw); } -static inline void grq_lock_irqsave(unsigned long *flags) - __acquires(grq.lock) +static inline void grq_wlock_irqsave(unsigned long *flags) { - raw_spin_lock_irqsave(&grq.lock, *flags); + urw_wlock_irqsave(&grq.urw, flags); } -static inline void grq_unlock_irqrestore(unsigned long *flags) - __releases(grq.lock) +static inline void grq_ulock_irqsave(unsigned long *flags) { - raw_spin_unlock_irqrestore(&grq.lock, *flags); + urw_ulock_irqsave(&grq.urw, flags); +} + +static inline void grq_rlock_irqsave(unsigned long *flags) +{ + urw_rlock_irqsave(&grq.urw, flags); +} + +static inline void grq_wunlock_irqrestore(unsigned long *flags) +{ + urw_wunlock_irqrestore(&grq.urw, flags); +} + +static inline void grq_uunlock_irqrestore(unsigned long *flags) +{ + urw_uunlock_irqrestore(&grq.urw, flags); +} + +static inline void grq_runlock_irqrestore(unsigned long *flags) +{ + urw_runlock_irqrestore(&grq.urw, flags); } static inline struct rq -*task_grq_lock(struct task_struct *p, unsigned long *flags) - __acquires(grq.lock) +*task_grq_wlock(struct task_struct *p, unsigned long *flags) { - grq_lock_irqsave(flags); + grq_wlock_irqsave(flags); return task_rq(p); } static inline struct rq -*time_task_grq_lock(struct task_struct *p, unsigned long *flags) - __acquires(grq.lock) +*task_grq_ulock(struct task_struct *p, unsigned long *flags) { - struct rq *rq = task_grq_lock(p, flags); + grq_ulock_irqsave(flags); + return task_rq(p); +} + +static inline struct rq +*task_grq_rlock(struct task_struct *p, unsigned long *flags) +{ + grq_rlock_irqsave(flags); + return task_rq(p); +} + +static inline struct rq +*time_task_grq_wlock(struct task_struct *p, unsigned long *flags) +{ + struct rq *rq = task_grq_wlock(p, flags); update_clocks(rq); return rq; } -static inline struct rq *task_grq_lock_irq(struct task_struct *p) - __acquires(grq.lock) +static inline struct rq *task_grq_wlock_irq(struct task_struct *p) +{ + grq_wlock_irq(); + return task_rq(p); +} + +static 
inline struct rq *task_grq_ulock_irq(struct task_struct *p) { - grq_lock_irq(); + grq_ulock_irq(); return task_rq(p); } -static inline void time_task_grq_lock_irq(struct task_struct *p) - __acquires(grq.lock) +static inline void time_task_grq_wlock_irq(struct task_struct *p) { - struct rq *rq = task_grq_lock_irq(p); + struct rq *rq = task_grq_wlock_irq(p); update_clocks(rq); } -static inline void task_grq_unlock_irq(void) - __releases(grq.lock) +static inline void task_grq_wunlock_irq(void) { - grq_unlock_irq(); + grq_wunlock_irq(); } -static inline void task_grq_unlock(unsigned long *flags) - __releases(grq.lock) +static inline void task_grq_runlock_irq(void) { - grq_unlock_irqrestore(flags); + grq_runlock_irq(); +} + +static inline void task_grq_wunlock(unsigned long *flags) +{ + grq_wunlock_irqrestore(flags); +} + +static inline void task_grq_uunlock(unsigned long *flags) +{ + grq_uunlock_irqrestore(flags); +} + +static inline void task_grq_runlock(unsigned long *flags) +{ + grq_runlock_irqrestore(flags); } /** @@ -556,34 +644,41 @@ */ bool grunqueue_is_locked(void) { - return raw_spin_is_locked(&grq.lock); + return raw_spin_is_locked(&grq.urw.lock); } void grq_unlock_wait(void) - __releases(grq.lock) { smp_mb(); /* spin-unlock-wait is not a full memory barrier */ - raw_spin_unlock_wait(&grq.lock); + raw_spin_unlock_wait(&grq.urw.lock); } -static inline void time_grq_lock(struct rq *rq, unsigned long *flags) - __acquires(grq.lock) +static inline void time_grq_wlock(struct rq *rq, unsigned long *flags) { local_irq_save(*flags); - time_lock_grq(rq); + time_wlock_grq(rq); } -static inline struct rq *__task_grq_lock(struct task_struct *p) - __acquires(grq.lock) +static inline struct rq *__task_grq_wlock(struct task_struct *p) { - grq_lock(); + grq_wlock(); return task_rq(p); } -static inline void __task_grq_unlock(void) - __releases(grq.lock) +static inline struct rq *__task_grq_ulock(struct task_struct *p) { - grq_unlock(); + grq_ulock(); + return task_rq(p); +} + +static inline void __task_grq_wunlock(void) +{ + grq_wunlock(); +} + +static inline void __task_grq_uunlock(void) +{ + grq_uunlock(); } /* @@ -616,16 +711,18 @@ { #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ - grq.lock.owner = current; + grq.urw.lock.owner = current; + grq.urw.rwlock.owner = current; #endif /* * If we are tracking spinlock dependencies then we have to * fix up the runqueue lock - which gets 'carried over' from * prev into current: */ - spin_acquire(&grq.lock.dep_map, 0, 0, _THIS_IP_); + spin_acquire(&grq.urw.lock.dep_map, 0, 0, _THIS_IP_); + rwlock_acquire(&grq.urw.rwlock.dep_map, 0, 0, _THIS_IP_); - grq_unlock_irq(); + grq_wunlock_irq(); } #else /* __ARCH_WANT_UNLOCKED_CTXSW */ @@ -633,9 +730,9 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - grq_unlock_irq(); + grq_wunlock_irq(); #else - grq_unlock(); + grq_wunlock(); #endif } @@ -1061,14 +1158,14 @@ /* * The caller should hold grq lock. */ - WARN_ON_ONCE(debug_locks && !lockdep_is_held(&grq.lock)); + WARN_ON_ONCE(debug_locks && !lockdep_is_held(&grq.urw.lock)); #endif trace_sched_migrate_task(p, cpu); if (task_cpu(p) != cpu) perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); /* - * After ->cpu is set up to a new value, task_grq_lock(p, ...) can be + * After ->cpu is set up to a new value, task_grq_wlock(p, ...) can be * successfully executed on another CPU. 
We must ensure that updates of * per-task data have been completed by this moment. */ @@ -1196,7 +1293,7 @@ { int cpu; - assert_raw_spin_locked(&grq.lock); + assert_raw_spin_locked(&grq.urw.lock); if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) return; @@ -1216,7 +1313,7 @@ #else static inline void resched_task(struct task_struct *p) { - assert_raw_spin_locked(&grq.lock); + assert_raw_spin_locked(&grq.urw.lock); set_tsk_need_resched(p); } #endif @@ -1297,14 +1394,14 @@ * lock now, to be *sure*. If we're wrong, we'll * just go back and repeat. */ - rq = task_grq_lock(p, &flags); + rq = task_grq_rlock(p, &flags); trace_sched_wait_task(p); running = task_running(p); on_rq = task_queued(p); ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_grq_unlock(&flags); + task_grq_runlock(&flags); /* * If it changed from the expected state, bail out now. @@ -1605,7 +1702,7 @@ * No need to do time_lock_grq as we only need to update the rq clock * if we activate the task */ - rq = task_grq_lock(p, &flags); + rq = task_grq_ulock(p, &flags); cpu = task_cpu(p); /* state is a volatile long, どうして、分からない */ @@ -1615,13 +1712,17 @@ if (task_queued(p) || task_running(p)) goto out_running; + grq_upgrade(); ttwu_activate(p, rq, wake_flags & WF_SYNC); success = true; out_running: ttwu_post_activation(p, rq, success); out_unlock: - task_grq_unlock(&flags); + if (success) + task_grq_wunlock(&flags); + else + task_grq_uunlock(&flags); ttwu_stat(p, cpu, wake_flags); @@ -1643,7 +1744,7 @@ struct rq *rq = task_rq(p); bool success = false; - lockdep_assert_held(&grq.lock); + lockdep_assert_held(&grq.urw.lock); if (!(p->state & TASK_NORMAL)) return; @@ -1759,7 +1860,7 @@ struct rq *rq; parent = p->parent; - rq = task_grq_lock(p, &flags); + rq = task_grq_wlock(p, &flags); /* * Reinit new task deadline as its creator deadline could have changed @@ -1820,7 +1921,7 @@ } time_slice_expired(p); } - task_grq_unlock(&flags); + task_grq_wunlock(&flags); } #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -1919,7 +2020,6 @@ * details.) */ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) - __releases(grq.lock) { struct mm_struct *mm = rq->prev_mm; long prev_state; @@ -1962,7 +2062,6 @@ * @prev: the thread we just switched away from. */ asmlinkage void schedule_tail(struct task_struct *prev) - __releases(grq.lock) { struct rq *rq = this_rq(); @@ -2014,7 +2113,7 @@ * do an early lockdep release here: */ #ifndef __ARCH_WANT_UNLOCKED_CTXSW - spin_release(&grq.lock.dep_map, 1, _THIS_IP_); + spin_release(&grq.urw.lock.dep_map, 1, _THIS_IP_); #endif /* Here we just switch the register state and the stack. */ @@ -2613,31 +2712,35 @@ * Return any ns on the sched_clock that have not yet been accounted in * @p in case that task is currently running. * - * Called with task_grq_lock() held. + * Called with task_grq_ulock() held. Returns with a downgraded rlock. */ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) { u64 ns = 0; if (p == rq->curr) { + grq_upgrade(); update_clocks(rq); + grq_wdowngrade(); ns = rq->clock_task - rq->rq_last_ran; if (unlikely((s64)ns < 0)) ns = 0; - } + } else + grq_udowngrade(); return ns; } +/* Note the intentional unbalanced locking, lock ulock and unlock rlock. 
*/ unsigned long long task_delta_exec(struct task_struct *p) { unsigned long flags; struct rq *rq; u64 ns; - rq = task_grq_lock(p, &flags); + rq = task_grq_ulock(p, &flags); ns = do_task_delta_exec(p, rq); - task_grq_unlock(&flags); + task_grq_runlock(&flags); return ns; } @@ -2655,9 +2758,9 @@ struct rq *rq; u64 ns; - rq = task_grq_lock(p, &flags); + rq = task_grq_ulock(p, &flags); ns = p->sched_time + do_task_delta_exec(p, rq); - task_grq_unlock(&flags); + task_grq_runlock(&flags); return ns; } @@ -2665,7 +2768,7 @@ /* * Return accounted runtime for the task. * Return separately the current's pending runtime that have not been - * accounted yet. + * accounted yet. Note unbalanced locks as do_task_delta_exec downgrades lock. */ unsigned long long task_sched_runtime_nodelta(struct task_struct *p, unsigned long long *delta) { @@ -2673,10 +2776,10 @@ struct rq *rq; u64 ns; - rq = task_grq_lock(p, &flags); + rq = task_grq_ulock(p, &flags); ns = p->sched_time; *delta = do_task_delta_exec(p, rq); - task_grq_unlock(&flags); + task_grq_runlock(&flags); return ns; } @@ -2941,10 +3044,10 @@ /* p->time_slice < RESCHED_US. We only modify task_struct under grq lock */ p = rq->curr; - grq_lock(); + grq_wlock(); requeue_task(p); set_tsk_need_resched(p); - grq_unlock(); + grq_wunlock(); } /* @@ -3340,7 +3443,7 @@ deactivate = false; schedule_debug(prev); - grq_lock_irq(); + grq_wlock_irq(); switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { @@ -3375,7 +3478,7 @@ * sure to submit it to avoid deadlocks. */ if (unlikely(deactivate && blk_needs_flush_plug(prev))) { - grq_unlock_irq(); + grq_wunlock_irq(); preempt_enable_no_resched(); blk_schedule_flush_plug(prev); goto need_resched; @@ -3411,7 +3514,7 @@ * again. */ set_rq_task(rq, prev); - grq_unlock_irq(); + grq_wunlock_irq(); goto rerun_prev_unlocked; } else swap_sticky(rq, cpu, prev); @@ -3461,7 +3564,7 @@ rq = cpu_rq(cpu); idle = rq->idle; } else - grq_unlock_irq(); + grq_wunlock_irq(); rerun_prev_unlocked: sched_preempt_enable_no_resched(); @@ -4043,7 +4146,7 @@ BUG_ON(prio < 0 || prio > MAX_PRIO); - rq = task_grq_lock(p, &flags); + rq = task_grq_ulock(p, &flags); /* * Idle task boosting is a nono in general. There is one @@ -4060,12 +4163,14 @@ if (unlikely(p == rq->idle)) { WARN_ON(p != rq->curr); WARN_ON(p->pi_blocked_on); - goto out_unlock; + task_grq_uunlock(&flags); + return; } trace_sched_pi_setprio(p, prio); oldprio = p->prio; queued = task_queued(p); + grq_upgrade(); if (queued) dequeue_task(p); p->prio = prio; @@ -4076,8 +4181,7 @@ try_preempt(p, rq); } -out_unlock: - task_grq_unlock(&flags); + task_grq_wunlock(&flags); } #endif @@ -4104,7 +4208,7 @@ * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. */ - rq = time_task_grq_lock(p, &flags); + rq = time_task_grq_wlock(p, &flags); /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -4134,7 +4238,7 @@ resched_task(p); } out_unlock: - task_grq_unlock(&flags); + task_grq_wunlock(&flags); } EXPORT_SYMBOL(set_user_nice); @@ -4416,13 +4520,13 @@ * To be able to change p->policy safely, the grunqueue lock must be * held. 
*/ - rq = __task_grq_lock(p); + rq = __task_grq_ulock(p); /* * Changing the policy of the stop threads its a very bad idea */ if (p == rq->stop) { - __task_grq_unlock(); + __task_grq_uunlock(); raw_spin_unlock_irqrestore(&p->pi_lock, flags); return -EINVAL; } @@ -4433,7 +4537,7 @@ if (unlikely(policy == p->policy && (!is_rt_policy(policy) || param->sched_priority == p->rt_priority))) { - __task_grq_unlock(); + __task_grq_uunlock(); raw_spin_unlock_irqrestore(&p->pi_lock, flags); return 0; } @@ -4441,10 +4545,11 @@ /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; - __task_grq_unlock(); + __task_grq_uunlock(); raw_spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } + grq_upgrade(); update_clocks(rq); p->sched_reset_on_fork = reset_on_fork; @@ -4456,7 +4561,7 @@ enqueue_task(p); try_preempt(p, rq); } - __task_grq_unlock(); + __task_grq_wunlock(); raw_spin_unlock_irqrestore(&p->pi_lock, flags); rt_mutex_adjust_pi(p); @@ -4736,9 +4841,9 @@ if (retval) goto out_unlock; - grq_lock_irqsave(&flags); + grq_rlock_irqsave(&flags); cpumask_and(mask, tsk_cpus_allowed(p), cpu_online_mask); - grq_unlock_irqrestore(&flags); + grq_runlock_irqrestore(&flags); out_unlock: rcu_read_unlock(); @@ -4793,7 +4898,7 @@ struct task_struct *p; p = current; - grq_lock_irq(); + grq_wlock_irq(); schedstat_inc(task_rq(p), yld_count); requeue_task(p); @@ -4801,9 +4906,10 @@ * Since we are going to call schedule() anyway, there's * no need to preempt or enable interrupts: */ - __release(grq.lock); - spin_release(&grq.lock.dep_map, 1, _THIS_IP_); - do_raw_spin_unlock(&grq.lock); + __urw_write_unlock(&grq.urw.rwlock); + __release(grq.urw.lock); + spin_release(&grq.urw.lock.dep_map, 1, _THIS_IP_); + do_raw_spin_unlock(&grq.urw.lock); sched_preempt_enable_no_resched(); schedule(); @@ -4921,17 +5027,18 @@ */ bool __sched yield_to(struct task_struct *p, bool preempt) { + bool yielded = false; unsigned long flags; - int yielded = 0; struct rq *rq; rq = this_rq(); - grq_lock_irqsave(&flags); + grq_ulock_irqsave(&flags); if (task_running(p) || p->state) { yielded = -ESRCH; goto out_unlock; } - yielded = 1; + yielded = true; + grq_upgrade(); if (p->deadline > rq->rq_deadline) p->deadline = rq->rq_deadline; p->time_slice += rq->rq_time_slice; @@ -4940,10 +5047,11 @@ p->time_slice = timeslice(); set_tsk_need_resched(rq->curr); out_unlock: - grq_unlock_irqrestore(&flags); - - if (yielded > 0) + if (yielded) { + grq_wunlock_irqrestore(&flags); schedule(); + } else + grq_uunlock_irqrestore(&flags); return yielded; } EXPORT_SYMBOL_GPL(yield_to); @@ -5068,9 +5176,9 @@ if (retval) goto out_unlock; - grq_lock_irqsave(&flags); + grq_rlock_irqsave(&flags); time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p)); - grq_unlock_irqrestore(&flags); + grq_runlock_irqrestore(&flags); rcu_read_unlock(); t = ns_to_timespec(time_slice); @@ -5176,7 +5284,7 @@ struct rq *rq = cpu_rq(cpu); unsigned long flags; - time_grq_lock(rq, &flags); + time_grq_wlock(rq, &flags); idle->last_ran = rq->clock_task; idle->state = TASK_RUNNING; /* Setting prio to illegal value shouldn't matter when never queued */ @@ -5189,7 +5297,7 @@ rcu_read_unlock(); rq->curr = rq->idle = idle; idle->on_cpu = 1; - grq_unlock_irqrestore(&flags); + grq_wunlock_irqrestore(&flags); /* Set the preempt count _outside_ the spinlocks! 
*/ task_thread_info(idle)->preempt_count = 0; @@ -5252,9 +5360,9 @@ { unsigned long flags; - grq_lock_irqsave(&flags); + grq_wlock_irqsave(&flags); resched_task(cpu_curr(cpu)); - grq_unlock_irqrestore(&flags); + grq_wunlock_irqrestore(&flags); } /* @@ -5346,12 +5454,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { bool running_wrong = false; + bool upgrade = false; bool queued = false; unsigned long flags; struct rq *rq; int ret = 0; - rq = task_grq_lock(p, &flags); + rq = task_grq_ulock(p, &flags); if (cpumask_equal(tsk_cpus_allowed(p), new_mask)) goto out; @@ -5363,6 +5472,8 @@ queued = task_queued(p); + upgrade = true; + grq_upgrade(); do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ @@ -5380,9 +5491,12 @@ set_task_cpu(p, cpumask_any_and(cpu_active_mask, new_mask)); out: - if (queued) - try_preempt(p, rq); - task_grq_unlock(&flags); + if (upgrade) { + if (queued) + try_preempt(p, rq); + task_grq_wunlock(&flags); + } else + task_grq_uunlock(&flags); if (running_wrong) _cond_resched(); @@ -5665,39 +5779,39 @@ case CPU_ONLINE: /* Update our root-domain */ - grq_lock_irqsave(&flags); + grq_wlock_irqsave(&flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_online(rq); } grq.noc = num_online_cpus(); - grq_unlock_irqrestore(&flags); + grq_wunlock_irqrestore(&flags); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: /* Idle task back to normal (off runqueue, low prio) */ - grq_lock_irq(); + grq_wlock_irq(); return_task(idle, true); idle->static_prio = MAX_PRIO; __setscheduler(idle, rq, SCHED_NORMAL, 0); idle->prio = PRIO_LIMIT; set_rq_task(rq, idle); update_clocks(rq); - grq_unlock_irq(); + grq_wunlock_irq(); break; case CPU_DYING: /* Update our root-domain */ - grq_lock_irqsave(&flags); + grq_wlock_irqsave(&flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } break_sole_affinity(cpu, idle); grq.noc = num_online_cpus(); - grq_unlock_irqrestore(&flags); + grq_wunlock_irqrestore(&flags); break; #endif } @@ -5893,7 +6007,7 @@ struct root_domain *old_rd = NULL; unsigned long flags; - grq_lock_irqsave(&flags); + grq_wlock_irqsave(&flags); if (rq->rd) { old_rd = rq->rd; @@ -5919,7 +6033,7 @@ if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); - grq_unlock_irqrestore(&flags); + grq_wunlock_irqrestore(&flags); if (old_rd) call_rcu_sched(&old_rd->rcu, free_rootdomain); @@ -6925,7 +7039,7 @@ BUG(); free_cpumask_var(non_isolated_cpus); - grq_lock_irq(); + grq_wlock_irq(); /* * Set up the relative cache distance of each online cpu from each * other in a simple array for quick lookup. Locality is determined @@ -6983,7 +7097,7 @@ rq->cache_idle = cache_cpu_idle; #endif } - grq_unlock_irq(); + grq_wunlock_irq(); } #else void __init sched_init_smp(void) @@ -7009,7 +7123,7 @@ for (i = 1 ; i < PRIO_RANGE ; i++) prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; - raw_spin_lock_init(&grq.lock); + urwlock_init(&grq.urw); grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0; grq.niffies = 0; grq.last_jiffy = jiffies; @@ -7160,7 +7274,7 @@ continue; raw_spin_lock(&p->pi_lock); - rq = __task_grq_lock(p); + rq = __task_grq_wlock(p); queued = task_queued(p); if (queued) @@ -7171,7 +7285,7 @@ try_preempt(p, rq); } - __task_grq_unlock(); + __task_grq_wunlock(); raw_spin_unlock(&p->pi_lock); } while_each_thread(g, p);
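
For reference, the urwlock primitive the wrappers above rely on lives outside bfs.c, so it does not appear in this diff. The sketch below is not the actual include/linux/urwlock.h from the -ck patch set; it is a minimal reading of what the hunks appear to assume, inferred from the grq.urw.lock / grq.urw.rwlock accesses, the lockdep annotations and the urwlock_init(&grq.urw) call: a raw spinlock serialises writers and upgradeable holders, pure readers only take the read side of an rwlock, and upgrading therefore only has to grab the write side.

/*
 * Hypothetical sketch only -- not the real urwlock implementation used by
 * the -ck tree.  Writers and upgradeable holders serialise on urw->lock;
 * pure readers only take the read side of urw->rwlock, so they run
 * concurrently with an upgradeable holder right up until it calls
 * urw_upgrade().
 */
#include <linux/spinlock.h>

typedef struct urwlock {
	raw_spinlock_t lock;	/* excludes writers and upgradeable holders */
	rwlock_t rwlock;	/* read side for rlock, write side for wlock */
} urwlock_t;

static inline void urwlock_init(urwlock_t *urw)
{
	raw_spin_lock_init(&urw->lock);
	rwlock_init(&urw->rwlock);
}

/* Read lock: shared with other readers and with an upgradeable holder. */
static inline void urw_rlock(urwlock_t *urw)
{
	read_lock(&urw->rwlock);
}

static inline void urw_runlock(urwlock_t *urw)
{
	read_unlock(&urw->rwlock);
}

/* Upgradeable lock: exclusive against writers, readers may still enter. */
static inline void urw_ulock(urwlock_t *urw)
{
	raw_spin_lock(&urw->lock);
}

static inline void urw_uunlock(urwlock_t *urw)
{
	raw_spin_unlock(&urw->lock);
}

/* Upgrade: the spinlock is already held, so only readers need shutting out. */
static inline void urw_upgrade(urwlock_t *urw)
{
	write_lock(&urw->rwlock);
}

/* Write lock: exclusive against everyone. */
static inline void urw_wlock(urwlock_t *urw)
{
	raw_spin_lock(&urw->lock);
	write_lock(&urw->rwlock);
}

static inline void urw_wunlock(urwlock_t *urw)
{
	write_unlock(&urw->rwlock);
	raw_spin_unlock(&urw->lock);
}

The _irq/_irqsave flavours and the udowngrade/wdowngrade helpers used by do_task_delta_exec() are omitted here. With this layout, pure read paths such as sched_getaffinity() and sched_rr_get_interval() no longer serialise against each other, which is where the scalability claim in the changelog comes from.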
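
The indeterminate paths (try_to_wake_up(), rt_mutex_setprio(), __sched_setscheduler(), set_cpus_allowed_ptr(), yield_to()) all follow the same shape, distilled below. This is an illustrative sketch rather than code from the patch: task_needs_requeue() is a made-up stand-in for the real "do we actually have to touch global state" checks, while task_grq_ulock(), grq_upgrade() and the u/w unlock helpers are the wrappers added above.

/*
 * Illustrative only.  Take the upgradeable lock so concurrent readers are
 * not blocked while we decide; upgrade to the write lock only once it is
 * certain the global runqueue state must change, and unlock with the
 * variant matching whatever we ended up holding.
 */
static void example_indeterminate_path(struct task_struct *p)
{
	bool modified = false;
	unsigned long flags;
	struct rq *rq;

	rq = task_grq_ulock(p, &flags);		/* readers keep running */

	if (task_needs_requeue(p)) {		/* hypothetical predicate */
		grq_upgrade();			/* now exclusive vs. readers too */
		requeue_task(p);
		try_preempt(p, rq);
		modified = true;
	}

	if (modified)
		task_grq_wunlock(&flags);	/* drops write lock + spinlock */
	else
		task_grq_uunlock(&flags);	/* drops only the spinlock */
}

The same asymmetry explains the intentionally unbalanced pairs in the diff: do_task_delta_exec() is entered under ulock and returns under rlock via the downgrade helpers, so its callers (task_delta_exec(), task_sched_runtime(), task_sched_runtime_nodelta()) deliberately take the u variant on entry and release the r variant on exit.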