Convert BFS to use skip lists.

-ck

---
 include/linux/init_task.h |    2 
 include/linux/sched.h     |    3 
 kernel/sched/bfs.c        |  174 ++++++++++++++++++++--------------------
 3 files changed, 79 insertions(+), 100 deletions(-)

Index: linux-4.7-ck5/include/linux/init_task.h
===================================================================
--- linux-4.7-ck5.orig/include/linux/init_task.h	2016-09-23 08:32:56.588747273 +1000
+++ linux-4.7-ck5/include/linux/init_task.h	2016-09-23 08:32:56.585747292 +1000
@@ -204,7 +204,7 @@ extern struct task_group root_task_group
 	.restart_block = {						\
 		.fn = do_no_restart_syscall,				\
 	},								\
-	.run_list	= LIST_HEAD_INIT(tsk.run_list),			\
+	.node		= NULL,						\
 	.time_slice	= HZ,						\
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
 	INIT_PUSHABLE_TASKS(tsk)					\
Index: linux-4.7-ck5/include/linux/sched.h
===================================================================
--- linux-4.7-ck5.orig/include/linux/sched.h	2016-09-23 08:32:56.588747273 +1000
+++ linux-4.7-ck5/include/linux/sched.h	2016-09-23 08:32:56.585747292 +1000
@@ -59,6 +59,7 @@ struct sched_param {
 #include
 #include
 #include
+#include
 #include
@@ -1477,7 +1478,7 @@ struct task_struct {
 #ifdef CONFIG_SCHED_BFS
 	int time_slice;
 	u64 deadline;
-	struct list_head run_list;
+	skiplist_node *node; /* Skip list node id */
 	u64 last_ran;
 	u64 sched_time; /* sched_clock time spent running */
 #ifdef CONFIG_SMT_NICE
Index: linux-4.7-ck5/kernel/sched/bfs.c
===================================================================
--- linux-4.7-ck5.orig/kernel/sched/bfs.c	2016-09-23 08:32:56.588747273 +1000
+++ linux-4.7-ck5/kernel/sched/bfs.c	2016-09-23 08:32:56.586747285 +1000
@@ -74,6 +74,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -182,8 +183,6 @@ struct global_rq {
 	unsigned long nr_running;
 	unsigned long nr_uninterruptible;
 	unsigned long long nr_switches;
-	struct list_head queue[PRIO_LIMIT];
-	DECLARE_BITMAP(prio_bitmap, PRIO_LIMIT + 1);
 	unsigned long qnr; /* queued not running */
 #ifdef CONFIG_SMP
 	cpumask_t cpu_idle_map;
@@ -196,6 +195,9 @@ struct global_rq {
 	raw_spinlock_t iso_lock;
 	int iso_ticks;
 	bool iso_refractory;
+
+	skiplist_node *node;
+	skiplist *sl;
 };
 
 #ifdef CONFIG_SMP
@@ -530,24 +532,25 @@ static inline bool deadline_after(u64 de
 }
 
 /*
- * A task that is queued but not running will be on the grq run list.
- * A task that is not running or queued will not be on the grq run list.
- * A task that is currently running will have ->on_cpu set but not on the
- * grq run list.
+ * A task that is not running or queued will not have a node set.
+ * A task that is queued but not running will have a node set.
+ * A task that is currently running will have ->on_cpu set but no node set.
  */
 static inline bool task_queued(struct task_struct *p)
 {
-	return (!list_empty(&p->run_list));
+	return p->node;
 }
 
 /*
- * Removing from the global runqueue. Enter with grq locked.
+ * Removing from the global runqueue. Enter with grq locked. Deleting a task
+ * from the skip list is done via the stored node reference in the task struct
+ * and does not require a full look up. Thus it occurs in O(k) time where k
+ * is the "level" of the list the task was stored at - usually < 4, max 16.
  */
 static void dequeue_task(struct task_struct *p)
 {
-	list_del_init(&p->run_list);
-	if (list_empty(grq.queue + p->prio))
-		__clear_bit(p->prio, grq.prio_bitmap);
+	skiplist_delnode(grq.node, grq.sl, p->node);
+	p->node = NULL;
 
 	sched_info_dequeued(task_rq(p), p);
 }
@@ -575,6 +578,8 @@ static bool isoprio_suitable(void)
  */
 static void enqueue_task(struct task_struct *p, struct rq *rq)
 {
+	u64 sl_id;
+
 	if (!rt_task(p)) {
 		/* Check it hasn't gotten rt from PI */
 		if ((idleprio_task(p) && idleprio_suitable(p)) ||
@@ -583,8 +588,26 @@ static void enqueue_task(struct task_str
 		else
 			p->prio = NORMAL_PRIO;
 	}
-	__set_bit(p->prio, grq.prio_bitmap);
-	list_add_tail(&p->run_list, grq.queue + p->prio);
+	/*
+	 * The sl_id key passed to the skiplist generates a sorted list.
+	 * Realtime and sched iso tasks run FIFO so they only need be sorted
+	 * according to priority. The skiplist will put tasks of the same
+	 * key inserted later in FIFO order. Tasks of sched normal, batch
+	 * and idleprio are sorted according to their deadlines. Idleprio
+	 * tasks are offset by an impossibly large deadline value ensuring
+	 * they get sorted into last positions, but still according to their
+	 * own deadlines. This creates a "landscape" of skiplists running
+	 * from priority 0 realtime in first place to the lowest priority
+	 * idleprio tasks last. Skiplist insertion is an O(log n) process.
+	 */
+	if (p->prio <= ISO_PRIO)
+		sl_id = p->prio;
+	else {
+		sl_id = p->deadline;
+		if (p->prio == IDLE_PRIO)
+			sl_id |= 0xF000000000000000;
+	}
+	p->node = skiplist_insert(grq.node, grq.sl, sl_id, p, grq.niffies);
 	sched_info_queued(rq, p);
 }
@@ -1715,7 +1738,7 @@ int sched_fork(unsigned long __maybe_unu
 		p->sched_reset_on_fork = 0;
 	}
 
-	INIT_LIST_HEAD(&p->run_list);
+	p->node = NULL;
 #ifdef CONFIG_SCHED_INFO
 	if (unlikely(sched_info_on()))
 		memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -3272,101 +3295,58 @@ found_middle:
 }
 
 /*
- * O(n) lookup of all tasks in the global runqueue. The real brainfuck
- * of lock contention and O(n). It's not really O(n) as only the queued,
- * but not running tasks are scanned, and is O(n) queued in the worst case
- * scenario only because the right task can be found before scanning all of
- * them.
- * Tasks are selected in this order:
- * Real time tasks are selected purely by their static priority and in the
- * order they were queued, so the lowest value idx, and the first queued task
- * of that priority value is chosen.
- * If no real time tasks are found, the SCHED_ISO priority is checked, and
- * all SCHED_ISO tasks have the same priority value, so they're selected by
- * the earliest deadline value.
- * If no SCHED_ISO tasks are found, SCHED_NORMAL tasks are selected by the
- * earliest deadline.
- * Finally if no SCHED_NORMAL tasks are found, SCHED_IDLEPRIO tasks are
- * selected by the earliest deadline.
+ * Task selection with skiplists is a simple matter of picking off the first
+ * task in the sorted list, an O(1) operation. The only time it takes longer
+ * is if tasks do not have suitable affinity and then we iterate over entries
+ * till we find the first that does. Worst case here is no tasks with suitable
+ * affinity and taking O(n).
  */
 static inline struct
 task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
 {
-	struct task_struct *edt = NULL;
-	unsigned long idx = -1;
+	struct task_struct *edt = idle;
+	skiplist_node *node = grq.node;
+	u64 earliest_deadline = ~0ULL;
+
+	while ((node = node->next[0]) != grq.node) {
+		struct task_struct *p = node->value;
+		int tcpu;
 
-	do {
-		struct list_head *queue;
-		struct task_struct *p;
-		u64 earliest_deadline;
+		/* Make sure affinity is ok */
+		if (needs_other_cpu(p, cpu))
+			continue;
 
-		idx = next_sched_bit(grq.prio_bitmap, ++idx);
-		if (idx >= PRIO_LIMIT)
-			return idle;
-		queue = grq.queue + idx;
-
-		if (idx < MAX_RT_PRIO) {
-			/* We found an rt task */
-			list_for_each_entry(p, queue, run_list) {
-				/* Make sure cpu affinity is ok */
-				if (needs_other_cpu(p, cpu))
-					continue;
-				edt = p;
-				goto out_take;
-			}
-			/*
-			 * None of the RT tasks at this priority can run on
-			 * this cpu
-			 */
+#ifdef CONFIG_SMT_NICE
+		if (!smt_should_schedule(p, cpu))
 			continue;
-		}
+#endif
 
-		/*
-		 * No rt tasks. Find the earliest deadline task. Now we're in
-		 * O(n) territory.
-		 */
-		earliest_deadline = ~0ULL;
-		list_for_each_entry(p, queue, run_list) {
+		if (!sched_interactive && (tcpu = task_cpu(p)) != cpu) {
 			u64 dl;
 
-			/* Make sure cpu affinity is ok */
-			if (needs_other_cpu(p, cpu))
+			if (task_sticky(p) && scaling_rq(rq))
 				continue;
-
-#ifdef CONFIG_SMT_NICE
-			if (!smt_should_schedule(p, cpu))
+			dl = p->deadline << locality_diff(tcpu, rq);
+			if (unlikely(!deadline_before(dl, earliest_deadline)))
 				continue;
-#endif
-			/*
-			 * Soft affinity happens here by not scheduling a task
-			 * with its sticky flag set that ran on a different CPU
-			 * last when the CPU is scaling, or by greatly biasing
-			 * against its deadline when not, based on cpu cache
-			 * locality.
-			 */
-			if (sched_interactive)
-				dl = p->deadline;
-			else {
-				int tcpu = task_cpu(p);
-
-				if (tcpu != cpu && task_sticky(p) && scaling_rq(rq))
-					continue;
-				dl = p->deadline << locality_diff(tcpu, rq);
-			}
-
-			if (deadline_before(dl, earliest_deadline)) {
-				earliest_deadline = dl;
-				edt = p;
-			}
+			earliest_deadline = dl;
+			edt = p;
+			/* We continue even though we've found the earliest
+			 * deadline task as the locality offset means there
+			 * may be a better candidate after it. */
+			continue;
 		}
-	} while (!edt);
-
-out_take:
-	take_task(cpu, edt);
+		/* This wouldn't happen if we encountered a better deadline from
+		 * another CPU and have already set edt. */
+		if (likely(p->deadline < earliest_deadline))
+			edt = p;
+		break;
+	}
+	if (likely(edt != idle))
+		take_task(cpu, edt);
 	return edt;
 }
-
 
 /*
  * Print scheduling while atomic bug:
 */
@@ -7257,6 +7237,9 @@ void __init sched_init(void)
 	grq.iso_ticks = 0;
 	grq.iso_refractory = false;
 	grq.noc = 1;
+	grq.node = skiplist_init();
+	grq.sl = new_skiplist(grq.node);
+
 #ifdef CONFIG_SMP
 	init_defrootdomain();
 	grq.qnr = grq.idle_cpus = 0;
@@ -7308,11 +7291,6 @@ void __init sched_init(void)
 	}
 #endif
 
-	for (i = 0; i < PRIO_LIMIT; i++)
-		INIT_LIST_HEAD(grq.queue + i);
-	/* delimiter for bitsearch */
-	__set_bit(PRIO_LIMIT, grq.prio_bitmap);
-
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	INIT_HLIST_HEAD(&init_task.preempt_notifiers);
 #endif
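For readers following the enqueue_task() change, here is a standalone user-space
sketch of the sl_id key mapping its comment describes. The priority constants
below are assumed to follow BFS's usual layout (RT priorities below ISO_PRIO,
NORMAL_PRIO and IDLE_PRIO above it) and the printf scaffolding is illustration
only, not code from this patch:

/* sl_id_demo.c - sketch of the skiplist sort key used by enqueue_task() */
#include <stdio.h>
#include <stdint.h>

#define ISO_PRIO	100	/* assumed values, not taken from this patch */
#define NORMAL_PRIO	101
#define IDLE_PRIO	102

/* Map (prio, deadline) to the skiplist sort key, as enqueue_task() does. */
static uint64_t sl_id(int prio, uint64_t deadline)
{
	uint64_t key;

	if (prio <= ISO_PRIO) {
		/* RT and ISO run FIFO within a priority: key is the priority. */
		key = prio;
	} else {
		/* Normal and batch sort by deadline... */
		key = deadline;
		/* ...idleprio is pushed past any realistic deadline. */
		if (prio == IDLE_PRIO)
			key |= 0xF000000000000000ULL;
	}
	return key;
}

int main(void)
{
	printf("rt50     -> %#llx\n", (unsigned long long)sl_id(50, 12345));
	printf("iso      -> %#llx\n", (unsigned long long)sl_id(ISO_PRIO, 12345));
	printf("normal   -> %#llx\n", (unsigned long long)sl_id(NORMAL_PRIO, 98765));
	printf("idleprio -> %#llx\n", (unsigned long long)sl_id(IDLE_PRIO, 98765));
	return 0;
}

Because realtime and ISO keys are small priority numbers while normal tasks use
large deadline values, RT/ISO entries always sort ahead of everything else, and
the high-bit offset pushes idleprio tasks to the tail, giving the "landscape"
the comment describes.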
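Likewise, a minimal sketch of why dequeue_task() can delete in O(k) through the
node reference stored in the task struct: if each node keeps next and prev
pointers for every level it occupies, unlinking is one splice per level with no
lookup. The structure layout and MAXLEVEL constant here are assumptions for
illustration, not the kernel's skip list implementation:

/* delnode_demo.c - O(k) unlink via a stored node reference */
#include <stdio.h>
#include <stdint.h>

#define MAXLEVEL 16	/* matches the "max 16" levels mentioned above */

struct sl_node {
	uint64_t key;
	int level;			/* highest level this node occupies */
	struct sl_node *next[MAXLEVEL];
	struct sl_node *prev[MAXLEVEL];
};

/* Unlink @node from every level it is on: O(node->level), no search. */
static void sl_delnode(struct sl_node *node)
{
	int i;

	for (i = 0; i <= node->level; i++) {
		node->prev[i]->next[i] = node->next[i];
		node->next[i]->prev[i] = node->prev[i];
	}
}

int main(void)
{
	struct sl_node head, a, b;
	int i;

	/* Tiny two-entry list at level 0 only: head <-> a <-> b <-> head. */
	for (i = 0; i < MAXLEVEL; i++)
		head.next[i] = head.prev[i] = &head;
	head.level = 0;
	a = (struct sl_node){ .key = 1, .level = 0 };
	b = (struct sl_node){ .key = 2, .level = 0 };
	head.next[0] = &a; a.prev[0] = &head;
	a.next[0] = &b;    b.prev[0] = &a;
	b.next[0] = &head; head.prev[0] = &b;

	sl_delnode(&a);	/* removes "a" without walking the list */
	printf("first key after delete: %llu\n",
	       (unsigned long long)head.next[0]->key);	/* prints 2 */
	return 0;
}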
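Finally, a minimal sketch of the selection walk in earliest_deadline_task():
because the level-0 ring is already sorted by sl_id, the first entry off the
header whose CPU affinity fits is normally the task to run. The fake_node type
and pick_next() helper are illustrative stand-ins, not the kernel's skip list
API, and the locality/sticky deadline biasing is omitted for brevity:

/* pick_demo.c - first-suitable-task walk over a sorted level-0 ring */
#include <stdio.h>
#include <stdint.h>

struct fake_node {
	uint64_t key;			/* sl_id: priority or deadline based */
	int task_id;			/* stand-in for a task_struct pointer */
	unsigned int cpus_allowed;	/* bitmask of CPUs this task may use */
	struct fake_node *next;		/* level-0 forward pointer only */
};

/* Return the first runnable entry for @cpu, or NULL if none is suitable. */
static struct fake_node *pick_next(struct fake_node *head, int cpu)
{
	struct fake_node *node;

	/* The list is sorted by key, so the first entry whose affinity
	 * fits is the best choice - usually the very first one. */
	for (node = head->next; node != head; node = node->next) {
		if (node->cpus_allowed & (1U << cpu))
			return node;
	}
	return NULL;
}

int main(void)
{
	struct fake_node head = { .next = &head };
	struct fake_node t1 = { .key = 100, .task_id = 1, .cpus_allowed = 0x2 }; /* cpu1 only */
	struct fake_node t2 = { .key = 500, .task_id = 2, .cpus_allowed = 0x3 }; /* cpu0 or cpu1 */

	/* Build the sorted ring by hand: head -> t1 -> t2 -> head. */
	head.next = &t1;
	t1.next = &t2;
	t2.next = &head;

	struct fake_node *pick = pick_next(&head, 0);
	printf("cpu0 picks task %d\n", pick ? pick->task_id : -1);	/* task 2 */
	pick = pick_next(&head, 1);
	printf("cpu1 picks task %d\n", pick ? pick->task_id : -1);	/* task 1 */
	return 0;
}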