Use upgradeable read/write locks in place of the grq spinlock. Separate out
the paths that require read-only access, write access, or are indeterminate,
and use the appropriate variant in each case to decrease lock contention and
improve scalability by allowing read-side sections to run in parallel. Where
code paths are not performance critical, just use the wlock variants.

-ck

---
 kernel/sched/bfs.c | 394 ++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 258 insertions(+), 136 deletions(-)

Index: linux-3.5-ck1/kernel/sched/bfs.c
===================================================================
--- linux-3.5-ck1.orig/kernel/sched/bfs.c	2012-08-16 13:27:10.367258721 +1000
+++ linux-3.5-ck1/kernel/sched/bfs.c	2012-08-16 13:36:18.235866665 +1000
@@ -69,6 +69,7 @@
 #include
 #include
 #include
+#include <linux/urwlock.h>
 #include
 #include
@@ -173,7 +174,7 @@ static inline int timeslice(void)
  * struct.
  */
 struct global_rq {
-	raw_spinlock_t lock;
+	urwlock_t urw;
 	unsigned long nr_running;
 	unsigned long nr_uninterruptible;
 	unsigned long long nr_switches;
@@ -386,7 +387,7 @@ static inline int cpu_of(struct rq *rq)
 
 /*
  * Niffies are a globally increasing nanosecond counter. Whenever a runqueue
- * clock is updated with the grq.lock held, it is an opportunity to update the
+ * clock is updated with the grq lock held, it is an opportunity to update the
  * niffies value. Any CPU can update it by adding how much its clock has
  * increased since it last updated niffies, minus any added niffies by other
  * CPUs.
@@ -447,10 +448,11 @@ static inline void update_clocks(struct
 #endif
 
 /*
- * All common locking functions performed on grq.lock. rq->clock is local to
+ * All common locking functions performed on grq lock. rq->clock is local to
  * the CPU accessing it so it can be modified just with interrupts disabled
- * when we're not updating niffies.
- * Looking up task_rq must be done under grq.lock to be safe.
+ * when we're not updating niffies. Some lock variants are redundant in
+ * behaviour and exist purely as a prompt to show why the lock was taken.
+ * Looking up task_rq must be done under rlock to be safe.
*/ static void update_rq_clock_task(struct rq *rq, s64 delta); @@ -467,90 +469,177 @@ static inline bool task_running(struct t return p->on_cpu; } -static inline void grq_lock(void) - __acquires(grq.lock) +static inline void grq_wlock(void) { - raw_spin_lock(&grq.lock); + urw_wlock(&grq.urw); } -static inline void grq_unlock(void) - __releases(grq.lock) +static inline void grq_wunlock(void) { - raw_spin_unlock(&grq.lock); + urw_wunlock(&grq.urw); } -static inline void grq_lock_irq(void) - __acquires(grq.lock) +static inline void grq_rlock(void) { - raw_spin_lock_irq(&grq.lock); + urw_rlock(&grq.urw); } -static inline void time_lock_grq(struct rq *rq) - __acquires(grq.lock) +static inline void grq_runlock(void) { - grq_lock(); + urw_runlock(&grq.urw); +} + +static inline void grq_ulock(void) +{ + urw_ulock(&grq.urw); +} + +static inline void grq_uunlock(void) +{ + urw_uunlock(&grq.urw); +} + +static inline void grq_upgrade(void) +{ + urw_upgrade(&grq.urw); +} + +static inline void grq_udowngrade(void) +{ + urw_udowngrade(&grq.urw); +} + +static inline void grq_wdowngrade(void) +{ + urw_wdowngrade(&grq.urw); +} + +static inline void grq_wlock_irq(void) +{ + urw_wlock_irq(&grq.urw); +} + +static inline void grq_ulock_irq(void) +{ + urw_ulock_irq(&grq.urw); +} + +static inline void time_wlock_grq(struct rq *rq) +{ + grq_wlock(); update_clocks(rq); } -static inline void grq_unlock_irq(void) - __releases(grq.lock) +static inline void grq_wunlock_irq(void) +{ + urw_wunlock_irq(&grq.urw); +} + +static inline void grq_runlock_irq(void) +{ + urw_runlock_irq(&grq.urw); +} + +static inline void grq_wlock_irqsave(unsigned long *flags) +{ + urw_wlock_irqsave(&grq.urw, flags); +} + +static inline void grq_ulock_irqsave(unsigned long *flags) +{ + urw_ulock_irqsave(&grq.urw, flags); +} + +static inline void grq_rlock_irqsave(unsigned long *flags) +{ + urw_rlock_irqsave(&grq.urw, flags); +} + +static inline void grq_wunlock_irqrestore(unsigned long *flags) +{ + urw_wunlock_irqrestore(&grq.urw, flags); +} + +static inline void grq_uunlock_irqrestore(unsigned long *flags) { - raw_spin_unlock_irq(&grq.lock); + urw_uunlock_irqrestore(&grq.urw, flags); } -static inline void grq_lock_irqsave(unsigned long *flags) - __acquires(grq.lock) +static inline void grq_runlock_irqrestore(unsigned long *flags) { - raw_spin_lock_irqsave(&grq.lock, *flags); + urw_runlock_irqrestore(&grq.urw, flags); } -static inline void grq_unlock_irqrestore(unsigned long *flags) - __releases(grq.lock) +static inline struct rq +*task_grq_wlock(struct task_struct *p, unsigned long *flags) { - raw_spin_unlock_irqrestore(&grq.lock, *flags); + grq_wlock_irqsave(flags); + return task_rq(p); } static inline struct rq -*task_grq_lock(struct task_struct *p, unsigned long *flags) - __acquires(grq.lock) +*task_grq_ulock(struct task_struct *p, unsigned long *flags) { - grq_lock_irqsave(flags); + grq_ulock_irqsave(flags); return task_rq(p); } static inline struct rq -*time_task_grq_lock(struct task_struct *p, unsigned long *flags) - __acquires(grq.lock) +*task_grq_rlock(struct task_struct *p, unsigned long *flags) { - struct rq *rq = task_grq_lock(p, flags); + grq_rlock_irqsave(flags); + return task_rq(p); +} + +static inline struct rq +*time_task_grq_wlock(struct task_struct *p, unsigned long *flags) +{ + struct rq *rq = task_grq_wlock(p, flags); update_clocks(rq); return rq; } -static inline struct rq *task_grq_lock_irq(struct task_struct *p) - __acquires(grq.lock) +static inline struct rq *task_grq_wlock_irq(struct task_struct *p) +{ + 
grq_wlock_irq(); + return task_rq(p); +} + +static inline struct rq *task_grq_ulock_irq(struct task_struct *p) { - grq_lock_irq(); + grq_ulock_irq(); return task_rq(p); } -static inline void time_task_grq_lock_irq(struct task_struct *p) - __acquires(grq.lock) +static inline void time_task_grq_wlock_irq(struct task_struct *p) { - struct rq *rq = task_grq_lock_irq(p); + struct rq *rq = task_grq_wlock_irq(p); update_clocks(rq); } -static inline void task_grq_unlock_irq(void) - __releases(grq.lock) +static inline void task_grq_wunlock_irq(void) { - grq_unlock_irq(); + grq_wunlock_irq(); } -static inline void task_grq_unlock(unsigned long *flags) - __releases(grq.lock) +static inline void task_grq_runlock_irq(void) { - grq_unlock_irqrestore(flags); + grq_runlock_irq(); +} + +static inline void task_grq_wunlock(unsigned long *flags) +{ + grq_wunlock_irqrestore(flags); +} + +static inline void task_grq_uunlock(unsigned long *flags) +{ + grq_uunlock_irqrestore(flags); +} + +static inline void task_grq_runlock(unsigned long *flags) +{ + grq_runlock_irqrestore(flags); } /** @@ -562,34 +651,41 @@ static inline void task_grq_unlock(unsig */ bool grunqueue_is_locked(void) { - return raw_spin_is_locked(&grq.lock); + return raw_spin_is_locked(&grq.urw.lock); } void grq_unlock_wait(void) - __releases(grq.lock) { smp_mb(); /* spin-unlock-wait is not a full memory barrier */ - raw_spin_unlock_wait(&grq.lock); + raw_spin_unlock_wait(&grq.urw.lock); } -static inline void time_grq_lock(struct rq *rq, unsigned long *flags) - __acquires(grq.lock) +static inline void time_grq_wlock(struct rq *rq, unsigned long *flags) { local_irq_save(*flags); - time_lock_grq(rq); + time_wlock_grq(rq); +} + +static inline struct rq *__task_grq_wlock(struct task_struct *p) +{ + grq_wlock(); + return task_rq(p); } -static inline struct rq *__task_grq_lock(struct task_struct *p) - __acquires(grq.lock) +static inline struct rq *__task_grq_ulock(struct task_struct *p) { - grq_lock(); + grq_ulock(); return task_rq(p); } -static inline void __task_grq_unlock(void) - __releases(grq.lock) +static inline void __task_grq_wunlock(void) +{ + grq_wunlock(); +} + +static inline void __task_grq_uunlock(void) { - grq_unlock(); + grq_uunlock(); } /* @@ -622,16 +718,18 @@ static inline void finish_lock_switch(st { #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ - grq.lock.owner = current; + grq.urw.lock.owner = current; + grq.urw.rwlock.owner = current; #endif /* * If we are tracking spinlock dependencies then we have to * fix up the runqueue lock - which gets 'carried over' from * prev into current: */ - spin_acquire(&grq.lock.dep_map, 0, 0, _THIS_IP_); + spin_acquire(&grq.urw.lock.dep_map, 0, 0, _THIS_IP_); + rwlock_acquire(&grq.urw.rwlock.dep_map, 0, 0, _THIS_IP_); - grq_unlock_irq(); + grq_wunlock_irq(); } #else /* __ARCH_WANT_UNLOCKED_CTXSW */ @@ -639,9 +737,9 @@ static inline void finish_lock_switch(st static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - grq_unlock_irq(); + grq_wunlock_irq(); #else - grq_unlock(); + grq_wunlock(); #endif } @@ -1060,14 +1158,14 @@ void set_task_cpu(struct task_struct *p, /* * The caller should hold grq lock. 
*/ - WARN_ON_ONCE(debug_locks && !lockdep_is_held(&grq.lock)); + WARN_ON_ONCE(debug_locks && !lockdep_is_held(&grq.urw.lock)); #endif trace_sched_migrate_task(p, cpu); if (task_cpu(p) != cpu) perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); /* - * After ->cpu is set up to a new value, task_grq_lock(p, ...) can be + * After ->cpu is set up to a new value, task_grq_wlock(p, ...) can be * successfully executed on another CPU. We must ensure that updates of * per-task data have been completed by this moment. */ @@ -1195,7 +1293,7 @@ static void resched_task(struct task_str { int cpu; - assert_raw_spin_locked(&grq.lock); + assert_raw_spin_locked(&grq.urw.lock); if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) return; @@ -1215,7 +1313,7 @@ static void resched_task(struct task_str #else static inline void resched_task(struct task_struct *p) { - assert_raw_spin_locked(&grq.lock); + assert_raw_spin_locked(&grq.urw.lock); set_tsk_need_resched(p); } #endif @@ -1296,14 +1394,14 @@ retry_rq: * lock now, to be *sure*. If we're wrong, we'll * just go back and repeat. */ - rq = task_grq_lock(p, &flags); + rq = task_grq_rlock(p, &flags); trace_sched_wait_task(p); running = task_running(p); on_rq = task_queued(p); ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_grq_unlock(&flags); + task_grq_runlock(&flags); /* * If it changed from the expected state, bail out now. @@ -1597,7 +1695,7 @@ static bool try_to_wake_up(struct task_s * No need to do time_lock_grq as we only need to update the rq clock * if we activate the task */ - rq = task_grq_lock(p, &flags); + rq = task_grq_ulock(p, &flags); cpu = task_cpu(p); /* state is a volatile long, どうして、分からない */ @@ -1607,13 +1705,17 @@ static bool try_to_wake_up(struct task_s if (task_queued(p) || task_running(p)) goto out_running; + grq_upgrade(); ttwu_activate(p, rq, wake_flags & WF_SYNC); success = true; out_running: ttwu_post_activation(p, rq, success); out_unlock: - task_grq_unlock(&flags); + if (success) + task_grq_wunlock(&flags); + else + task_grq_uunlock(&flags); ttwu_stat(p, cpu, wake_flags); @@ -1635,7 +1737,7 @@ static void try_to_wake_up_local(struct struct rq *rq = task_rq(p); bool success = false; - lockdep_assert_held(&grq.lock); + lockdep_assert_held(&grq.urw.lock); if (!(p->state & TASK_NORMAL)) return; @@ -1757,10 +1859,13 @@ void sched_fork(struct task_struct *p) * value. rq->rq_deadline is only modified within schedule() so it * is always equal to current->deadline. */ - rq = task_grq_lock_irq(curr); + rq = task_grq_ulock_irq(curr); if (likely(rq->rq_time_slice >= RESCHED_US * 2)) { + grq_udowngrade(); rq->rq_time_slice /= 2; p->time_slice = rq->rq_time_slice; + p->last_ran = rq->rq_last_ran; + task_grq_runlock_irq(); } else { /* * Forking task has run out of timeslice. Reschedule it and @@ -1769,11 +1874,12 @@ void sched_fork(struct task_struct *p) * be slightly earlier. 
*/ rq->rq_time_slice = 0; + grq_upgrade(); set_tsk_need_resched(curr); time_slice_expired(p); + p->last_ran = rq->rq_last_ran; + task_grq_wunlock_irq(); } - p->last_ran = rq->rq_last_ran; - task_grq_unlock_irq(); out: put_cpu(); } @@ -1791,11 +1897,11 @@ void wake_up_new_task(struct task_struct unsigned long flags; struct rq *rq; - rq = task_grq_lock(p, &flags); p->state = TASK_RUNNING; parent = p->parent; /* Unnecessary but small chance that the parent changed CPU */ set_task_cpu(p, task_cpu(parent)); + rq = task_grq_wlock(p, &flags); activate_task(p, rq); trace_sched_wakeup_new(p, 1); if (rq->curr == parent && !suitable_idle_cpus(p)) { @@ -1807,7 +1913,7 @@ void wake_up_new_task(struct task_struct resched_task(parent); } else try_preempt(p, rq); - task_grq_unlock(&flags); + task_grq_wunlock(&flags); } #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -1908,7 +2014,6 @@ prepare_task_switch(struct rq *rq, struc * details.) */ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) - __releases(grq.lock) { struct mm_struct *mm = rq->prev_mm; long prev_state; @@ -1956,7 +2061,6 @@ static inline void finish_task_switch(st * @prev: the thread we just switched away from. */ asmlinkage void schedule_tail(struct task_struct *prev) - __releases(grq.lock) { struct rq *rq = this_rq(); @@ -2008,7 +2112,7 @@ context_switch(struct rq *rq, struct tas * do an early lockdep release here: */ #ifndef __ARCH_WANT_UNLOCKED_CTXSW - spin_release(&grq.lock.dep_map, 1, _THIS_IP_); + spin_release(&grq.urw.lock.dep_map, 1, _THIS_IP_); #endif /* Here we just switch the register state and the stack. */ @@ -2535,31 +2639,35 @@ ts_account: * Return any ns on the sched_clock that have not yet been accounted in * @p in case that task is currently running. * - * Called with task_grq_lock() held. + * Called with task_grq_ulock() held. Returns with a downgraded rlock. */ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) { u64 ns = 0; if (p == rq->curr) { + grq_upgrade(); update_clocks(rq); + grq_wdowngrade(); ns = rq->clock_task - rq->rq_last_ran; if (unlikely((s64)ns < 0)) ns = 0; - } + } else + grq_udowngrade(); return ns; } +/* Note the intentional unbalanced locking, lock ulock and unlock rlock. */ unsigned long long task_delta_exec(struct task_struct *p) { unsigned long flags; struct rq *rq; u64 ns; - rq = task_grq_lock(p, &flags); + rq = task_grq_ulock(p, &flags); ns = do_task_delta_exec(p, rq); - task_grq_unlock(&flags); + task_grq_runlock(&flags); return ns; } @@ -2568,6 +2676,8 @@ unsigned long long task_delta_exec(struc * Return accounted runtime for the task. * In case the task is currently running, return the runtime plus current's * pending runtime that have not been accounted yet. + * + * Note the intentional unbalanced locking, lock ulock and unlock rlock. */ unsigned long long task_sched_runtime(struct task_struct *p) { @@ -2575,9 +2685,9 @@ unsigned long long task_sched_runtime(st struct rq *rq; u64 ns; - rq = task_grq_lock(p, &flags); + rq = task_grq_ulock(p, &flags); ns = p->sched_time + do_task_delta_exec(p, rq); - task_grq_unlock(&flags); + task_grq_runlock(&flags); return ns; } @@ -2842,10 +2952,10 @@ static void task_running_tick(struct rq /* p->time_slice < RESCHED_US. 
We only modify task_struct under grq lock */ p = rq->curr; - grq_lock(); + grq_wlock(); requeue_task(p); set_tsk_need_resched(p); - grq_unlock(); + grq_wunlock(); } void wake_up_idle_cpu(int cpu); @@ -3208,7 +3318,7 @@ need_resched: deactivate = false; schedule_debug(prev); - grq_lock_irq(); + grq_wlock_irq(); switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { @@ -3243,7 +3353,7 @@ need_resched: * sure to submit it to avoid deadlocks. */ if (unlikely(deactivate && blk_needs_flush_plug(prev))) { - grq_unlock_irq(); + grq_wunlock_irq(); preempt_enable_no_resched(); blk_schedule_flush_plug(prev); goto need_resched; @@ -3278,7 +3388,7 @@ need_resched: * again. */ set_rq_task(rq, prev); - grq_unlock_irq(); + grq_wunlock_irq(); goto rerun_prev_unlocked; } else swap_sticky(rq, cpu, prev); @@ -3327,7 +3437,7 @@ need_resched: rq = cpu_rq(cpu); idle = rq->idle; } else - grq_unlock_irq(); + grq_wunlock_irq(); rerun_prev_unlocked: sched_preempt_enable_no_resched(); @@ -3884,7 +3994,7 @@ void rt_mutex_setprio(struct task_struct BUG_ON(prio < 0 || prio > MAX_PRIO); - rq = task_grq_lock(p, &flags); + rq = task_grq_ulock(p, &flags); /* * Idle task boosting is a nono in general. There is one @@ -3901,12 +4011,14 @@ void rt_mutex_setprio(struct task_struct if (unlikely(p == rq->idle)) { WARN_ON(p != rq->curr); WARN_ON(p->pi_blocked_on); - goto out_unlock; + task_grq_uunlock(&flags); + return; } trace_sched_pi_setprio(p, prio); oldprio = p->prio; queued = task_queued(p); + grq_upgrade(); if (queued) dequeue_task(p); p->prio = prio; @@ -3917,8 +4029,7 @@ void rt_mutex_setprio(struct task_struct try_preempt(p, rq); } -out_unlock: - task_grq_unlock(&flags); + task_grq_wunlock(&flags); } #endif @@ -3945,7 +4056,7 @@ void set_user_nice(struct task_struct *p * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. */ - rq = time_task_grq_lock(p, &flags); + rq = time_task_grq_wlock(p, &flags); /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -3975,7 +4086,7 @@ void set_user_nice(struct task_struct *p resched_task(p); } out_unlock: - task_grq_unlock(&flags); + task_grq_wunlock(&flags); } EXPORT_SYMBOL(set_user_nice); @@ -4257,13 +4368,13 @@ recheck: * To be able to change p->policy safely, the grunqueue lock must be * held. 
*/ - rq = __task_grq_lock(p); + rq = __task_grq_ulock(p); /* * Changing the policy of the stop threads its a very bad idea */ if (p == rq->stop) { - __task_grq_unlock(); + __task_grq_uunlock(); raw_spin_unlock_irqrestore(&p->pi_lock, flags); return -EINVAL; } @@ -4274,7 +4385,7 @@ recheck: if (unlikely(policy == p->policy && (!is_rt_policy(policy) || param->sched_priority == p->rt_priority))) { - __task_grq_unlock(); + __task_grq_uunlock(); raw_spin_unlock_irqrestore(&p->pi_lock, flags); return 0; } @@ -4282,10 +4393,11 @@ recheck: /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; - __task_grq_unlock(); + __task_grq_uunlock(); raw_spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } + grq_upgrade(); update_clocks(rq); p->sched_reset_on_fork = reset_on_fork; @@ -4297,7 +4409,7 @@ recheck: enqueue_task(p); try_preempt(p, rq); } - __task_grq_unlock(); + __task_grq_wunlock(); raw_spin_unlock_irqrestore(&p->pi_lock, flags); rt_mutex_adjust_pi(p); @@ -4567,9 +4679,9 @@ long sched_getaffinity(pid_t pid, cpumas if (retval) goto out_unlock; - grq_lock_irqsave(&flags); + grq_rlock_irqsave(&flags); cpumask_and(mask, tsk_cpus_allowed(p), cpu_online_mask); - grq_unlock_irqrestore(&flags); + grq_runlock_irqrestore(&flags); out_unlock: rcu_read_unlock(); @@ -4624,7 +4736,7 @@ SYSCALL_DEFINE0(sched_yield) struct task_struct *p; p = current; - grq_lock_irq(); + grq_wlock_irq(); schedstat_inc(task_rq(p), yld_count); requeue_task(p); @@ -4632,9 +4744,10 @@ SYSCALL_DEFINE0(sched_yield) * Since we are going to call schedule() anyway, there's * no need to preempt or enable interrupts: */ - __release(grq.lock); - spin_release(&grq.lock.dep_map, 1, _THIS_IP_); - do_raw_spin_unlock(&grq.lock); + __urw_write_unlock(&grq.urw.rwlock); + __release(grq.urw.lock); + spin_release(&grq.urw.lock.dep_map, 1, _THIS_IP_); + do_raw_spin_unlock(&grq.urw.lock); sched_preempt_enable_no_resched(); schedule(); @@ -4753,15 +4866,16 @@ EXPORT_SYMBOL(yield); */ bool __sched yield_to(struct task_struct *p, bool preempt) { + bool yielded = false; unsigned long flags; - bool yielded = 0; struct rq *rq; rq = this_rq(); - grq_lock_irqsave(&flags); + grq_ulock_irqsave(&flags); if (task_running(p) || p->state) goto out_unlock; - yielded = 1; + yielded = true; + grq_upgrade(); if (p->deadline > rq->rq_deadline) p->deadline = rq->rq_deadline; p->time_slice += rq->rq_time_slice; @@ -4770,10 +4884,12 @@ bool __sched yield_to(struct task_struct p->time_slice = timeslice(); set_tsk_need_resched(rq->curr); out_unlock: - grq_unlock_irqrestore(&flags); - - if (yielded) + if (yielded) { + grq_wunlock_irqrestore(&flags); schedule(); + } else + grq_uunlock_irqrestore(&flags); + return yielded; } EXPORT_SYMBOL_GPL(yield_to); @@ -4898,9 +5014,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, p if (retval) goto out_unlock; - grq_lock_irqsave(&flags); + grq_rlock_irqsave(&flags); time_slice = p->policy == SCHED_FIFO ? 
0 : MS_TO_NS(task_timeslice(p)); - grq_unlock_irqrestore(&flags); + grq_runlock_irqrestore(&flags); rcu_read_unlock(); t = ns_to_timespec(time_slice); @@ -4995,7 +5111,7 @@ void init_idle(struct task_struct *idle, struct rq *rq = cpu_rq(cpu); unsigned long flags; - time_grq_lock(rq, &flags); + time_grq_wlock(rq, &flags); idle->last_ran = rq->clock; idle->state = TASK_RUNNING; /* Setting prio to illegal value shouldn't matter when never queued */ @@ -5008,7 +5124,7 @@ void init_idle(struct task_struct *idle, rcu_read_unlock(); rq->curr = rq->idle = idle; idle->on_cpu = 1; - grq_unlock_irqrestore(&flags); + grq_wunlock_irqrestore(&flags); /* Set the preempt count _outside_ the spinlocks! */ task_thread_info(idle)->preempt_count = 0; @@ -5067,9 +5183,9 @@ static inline void resched_cpu(int cpu) { unsigned long flags; - grq_lock_irqsave(&flags); + grq_wlock_irqsave(&flags); resched_task(cpu_curr(cpu)); - grq_unlock_irqrestore(&flags); + grq_wunlock_irqrestore(&flags); } /* @@ -5157,12 +5273,13 @@ void wake_up_idle_cpu(int cpu) int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { bool running_wrong = false; + bool upgrade = false; bool queued = false; unsigned long flags; struct rq *rq; int ret = 0; - rq = task_grq_lock(p, &flags); + rq = task_grq_ulock(p, &flags); if (cpumask_equal(tsk_cpus_allowed(p), new_mask)) goto out; @@ -5179,6 +5296,8 @@ int set_cpus_allowed_ptr(struct task_str queued = task_queued(p); + upgrade = true; + grq_upgrade(); do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ @@ -5196,9 +5315,12 @@ int set_cpus_allowed_ptr(struct task_str set_task_cpu(p, cpumask_any_and(cpu_active_mask, new_mask)); out: - if (queued) - try_preempt(p, rq); - task_grq_unlock(&flags); + if (upgrade) { + if (queued) + try_preempt(p, rq); + task_grq_wunlock(&flags); + } else + task_grq_uunlock(&flags); if (running_wrong) _cond_resched(); @@ -5495,32 +5617,32 @@ migration_call(struct notifier_block *nf case CPU_ONLINE: /* Update our root-domain */ - grq_lock_irqsave(&flags); + grq_wlock_irqsave(&flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_online(rq); } grq.noc = num_online_cpus(); - grq_unlock_irqrestore(&flags); + grq_wunlock_irqrestore(&flags); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: /* Idle task back to normal (off runqueue, low prio) */ - grq_lock_irq(); + grq_wlock_irq(); return_task(idle, true); idle->static_prio = MAX_PRIO; __setscheduler(idle, rq, SCHED_NORMAL, 0); idle->prio = PRIO_LIMIT; set_rq_task(rq, idle); update_clocks(rq); - grq_unlock_irq(); + grq_wunlock_irq(); break; case CPU_DYING: /* Update our root-domain */ - grq_lock_irqsave(&flags); + grq_wlock_irqsave(&flags); sched_idle_next(rq, cpu, idle); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); @@ -5528,7 +5650,7 @@ migration_call(struct notifier_block *nf } break_sole_affinity(cpu, idle); grq.noc = num_online_cpus(); - grq_unlock_irqrestore(&flags); + grq_wunlock_irqrestore(&flags); break; #endif } @@ -5796,7 +5918,7 @@ static void rq_attach_root(struct rq *rq struct root_domain *old_rd = NULL; unsigned long flags; - grq_lock_irqsave(&flags); + grq_wlock_irqsave(&flags); if (rq->rd) { old_rd = rq->rd; @@ -5822,7 +5944,7 @@ static void rq_attach_root(struct rq *rq if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); - grq_unlock_irqrestore(&flags); + grq_wunlock_irqrestore(&flags); if (old_rd) call_rcu_sched(&old_rd->rcu, free_rootdomain); @@ -7039,7 +7161,7 @@ void __init 
sched_init_smp(void) BUG(); free_cpumask_var(non_isolated_cpus); - grq_lock_irq(); + grq_wlock_irq(); /* * Set up the relative cache distance of each online cpu from each * other in a simple array for quick lookup. Locality is determined @@ -7094,7 +7216,7 @@ void __init sched_init_smp(void) rq->cache_idle = cache_cpu_idle; #endif } - grq_unlock_irq(); + grq_wunlock_irq(); } #else void __init sched_init_smp(void) @@ -7120,7 +7242,7 @@ void __init sched_init(void) for (i = 1 ; i < PRIO_RANGE ; i++) prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; - raw_spin_lock_init(&grq.lock); + urwlock_init(&grq.urw); grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0; grq.niffies = 0; grq.last_jiffy = jiffies; @@ -7271,7 +7393,7 @@ void normalize_rt_tasks(void) continue; raw_spin_lock_irqsave(&p->pi_lock, flags); - rq = __task_grq_lock(p); + rq = __task_grq_wlock(p); queued = task_queued(p); if (queued) @@ -7282,7 +7404,7 @@ void normalize_rt_tasks(void) try_preempt(p, rq); } - __task_grq_unlock(); + __task_grq_wunlock(); raw_spin_unlock_irqrestore(&p->pi_lock, flags); } while_each_thread(g, p);
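
For reference, a minimal userspace sketch of the locking discipline the patch
uses. The wrapper names (urw_ulock, urw_upgrade, urw_udowngrade, urw_wunlock,
urw_runlock, ...) and the lock/rwlock member pair mirror what appears in the
diff above; the pthread-based body, the urw_model.c file name, and the
wake_up_sketch(), read_deadline_sketch(), task_is_queued and task_deadline
names are illustrative assumptions only, not the actual include/linux/urwlock.h
from the -ck tree.

/* urw_model.c - illustrative model only; build with: gcc -pthread urw_model.c */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Models the spinlock + rwlock pair visible as grq.urw.lock / grq.urw.rwlock. */
struct urwlock {
	pthread_mutex_t lock;		/* stands in for the raw spinlock */
	pthread_rwlock_t rwlock;	/* read/write side */
};

static void urw_init(struct urwlock *u)
{
	pthread_mutex_init(&u->lock, NULL);
	pthread_rwlock_init(&u->rwlock, NULL);
}

/* Upgradeable lock: one holder at a time, but readers keep running. */
static void urw_ulock(struct urwlock *u)
{
	pthread_mutex_lock(&u->lock);
}

static void urw_uunlock(struct urwlock *u)
{
	pthread_mutex_unlock(&u->lock);
}

/* ulock -> wlock: exclude the readers only once we know we must modify. */
static void urw_upgrade(struct urwlock *u)
{
	pthread_rwlock_wrlock(&u->rwlock);
}

/* ulock -> rlock: become a plain reader; release with urw_runlock(). */
static void urw_udowngrade(struct urwlock *u)
{
	pthread_rwlock_rdlock(&u->rwlock);
	pthread_mutex_unlock(&u->lock);
}

/* Release full write ownership (after urw_upgrade or a plain write lock). */
static void urw_wunlock(struct urwlock *u)
{
	pthread_rwlock_unlock(&u->rwlock);
	pthread_mutex_unlock(&u->lock);
}

/* Release read ownership (after urw_udowngrade or a plain read lock). */
static void urw_runlock(struct urwlock *u)
{
	pthread_rwlock_unlock(&u->rwlock);
}

/* Hypothetical shared state standing in for grq / task fields. */
static struct urwlock grq_urw;
static bool task_is_queued;
static long long task_deadline = 100;

/* Mirrors the try_to_wake_up() flow: upgrade only on the activation path. */
static bool wake_up_sketch(void)
{
	bool success = false;

	urw_ulock(&grq_urw);		/* indeterminate section: readers unaffected */
	if (!task_is_queued) {
		urw_upgrade(&grq_urw);	/* now exclusive: safe to activate/requeue */
		task_is_queued = true;
		success = true;
	}
	if (success)
		urw_wunlock(&grq_urw);	/* intentionally unbalanced, as in the patch */
	else
		urw_uunlock(&grq_urw);
	return success;
}

/* Mirrors the do_task_delta_exec() pairing: lock ulock, unlock rlock. */
static long long read_deadline_sketch(void)
{
	long long d;

	urw_ulock(&grq_urw);
	urw_udowngrade(&grq_urw);	/* nothing to modify: finish as a reader */
	d = task_deadline;
	urw_runlock(&grq_urw);
	return d;
}

int main(void)
{
	urw_init(&grq_urw);
	printf("woken %d, deadline %lld\n", wake_up_sketch(), read_deadline_sketch());
	return 0;
}

The pattern this models is what the hot paths above rely on: try_to_wake_up(),
sched_setscheduler() and set_cpus_allowed_ptr() take the upgradeable lock for
their indeterminate early sections, so readers on other CPUs are only excluded
once the queue actually has to be modified, while the deliberately unbalanced
ulock-then-runlock pairing in do_task_delta_exec() and task_sched_runtime()
lets the accounting paths complete as plain readers.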