Convert BFS to use skip lists.

-ck

---
 include/linux/init_task.h |    2 
 include/linux/sched.h     |    3 
 kernel/sched/bfs.c        |  174 ++++++++++++++++++++--------------------
 3 files changed, 79 insertions(+), 100 deletions(-)

Index: linux-4.7-ck5/include/linux/init_task.h
===================================================================
--- linux-4.7-ck5.orig/include/linux/init_task.h	2016-09-23 08:32:56.588747273 +1000
+++ linux-4.7-ck5/include/linux/init_task.h	2016-09-23 08:32:56.585747292 +1000
@@ -204,7 +204,7 @@ extern struct task_group root_task_group
 	.restart_block = {						\
 		.fn = do_no_restart_syscall,				\
 	},								\
-	.run_list	= LIST_HEAD_INIT(tsk.run_list),			\
+	.node		= NULL,						\
 	.time_slice	= HZ,						\
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
 	INIT_PUSHABLE_TASKS(tsk)					\
Index: linux-4.7-ck5/include/linux/sched.h
===================================================================
--- linux-4.7-ck5.orig/include/linux/sched.h	2016-09-23 08:32:56.588747273 +1000
+++ linux-4.7-ck5/include/linux/sched.h	2016-09-23 08:32:56.585747292 +1000
@@ -59,6 +59,7 @@ struct sched_param {
 #include
 #include
 #include
+#include
 #include
@@ -1477,7 +1478,7 @@ struct task_struct {
 #ifdef CONFIG_SCHED_BFS
 	int time_slice;
 	u64 deadline;
-	struct list_head run_list;
+	skiplist_node *node; /* Skip list node id */
 	u64 last_ran;
 	u64 sched_time; /* sched_clock time spent running */
 #ifdef CONFIG_SMT_NICE
Index: linux-4.7-ck5/kernel/sched/bfs.c
===================================================================
--- linux-4.7-ck5.orig/kernel/sched/bfs.c	2016-09-23 08:32:56.588747273 +1000
+++ linux-4.7-ck5/kernel/sched/bfs.c	2016-09-23 08:32:56.586747285 +1000
@@ -74,6 +74,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -182,8 +183,6 @@ struct global_rq {
 	unsigned long nr_running;
 	unsigned long nr_uninterruptible;
 	unsigned long long nr_switches;
-	struct list_head queue[PRIO_LIMIT];
-	DECLARE_BITMAP(prio_bitmap, PRIO_LIMIT + 1);
 	unsigned long qnr; /* queued not running */
 #ifdef CONFIG_SMP
 	cpumask_t cpu_idle_map;
@@ -196,6 +195,9 @@ struct global_rq {
 	raw_spinlock_t iso_lock;
 	int iso_ticks;
 	bool iso_refractory;
+
+	skiplist_node *node;
+	skiplist *sl;
 };
 
 #ifdef CONFIG_SMP
@@ -530,24 +532,25 @@ static inline bool deadline_after(u64 de
 }
 
 /*
- * A task that is queued but not running will be on the grq run list.
- * A task that is not running or queued will not be on the grq run list.
- * A task that is currently running will have ->on_cpu set but not on the
- * grq run list.
+ * A task that is not running or queued will not have a node set.
+ * A task that is queued but not running will have a node set.
+ * A task that is currently running will have ->on_cpu set but no node set.
  */
 static inline bool task_queued(struct task_struct *p)
 {
-	return (!list_empty(&p->run_list));
+	return p->node;
 }
 
 /*
- * Removing from the global runqueue. Enter with grq locked.
+ * Removing from the global runqueue. Enter with grq locked. Deleting a task
+ * from the skip list is done via the stored node reference in the task struct
+ * and does not require a full look up. Thus it occurs in O(k) time where k
+ * is the "level" of the list the task was stored at - usually < 4, max 16.
  */
 static void dequeue_task(struct task_struct *p)
 {
-	list_del_init(&p->run_list);
-	if (list_empty(grq.queue + p->prio))
-		__clear_bit(p->prio, grq.prio_bitmap);
+	skiplist_delnode(grq.node, grq.sl, p->node);
+	p->node = NULL;
 
 	sched_info_dequeued(task_rq(p), p);
 }
@@ -575,6 +578,8 @@ static bool isoprio_suitable(void)
  */
 static void enqueue_task(struct task_struct *p, struct rq *rq)
 {
+	u64 sl_id;
+
 	if (!rt_task(p)) {
 		/* Check it hasn't gotten rt from PI */
 		if ((idleprio_task(p) && idleprio_suitable(p)) ||
@@ -583,8 +588,26 @@ static void enqueue_task(struct task_str
 		else
 			p->prio = NORMAL_PRIO;
 	}
-	__set_bit(p->prio, grq.prio_bitmap);
-	list_add_tail(&p->run_list, grq.queue + p->prio);
+	/*
+	 * The sl_id key passed to the skiplist generates a sorted list.
+	 * Realtime and sched iso tasks run FIFO so they only need be sorted
+	 * according to priority. The skiplist will put tasks of the same
+	 * key inserted later in FIFO order. Tasks of sched normal, batch
+	 * and idleprio are sorted according to their deadlines. Idleprio
+	 * tasks are offset by an impossibly large deadline value ensuring
+	 * they get sorted into last positions, but still according to their
+	 * own deadlines. This creates a "landscape" of skiplists running
+	 * from priority 0 realtime in first place to the lowest priority
+	 * idleprio tasks last. Skiplist insertion is an O(log n) process.
+	 */
+	if (p->prio <= ISO_PRIO)
+		sl_id = p->prio;
+	else {
+		sl_id = p->deadline;
+		if (p->prio == IDLE_PRIO)
+			sl_id |= 0xF000000000000000;
+	}
+	p->node = skiplist_insert(grq.node, grq.sl, sl_id, p, grq.niffies);
 	sched_info_queued(rq, p);
 }
@@ -1715,7 +1738,7 @@ int sched_fork(unsigned long __maybe_unu
 		p->sched_reset_on_fork = 0;
 	}
 
-	INIT_LIST_HEAD(&p->run_list);
+	p->node = NULL;
 #ifdef CONFIG_SCHED_INFO
 	if (unlikely(sched_info_on()))
 		memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -3272,101 +3295,58 @@ found_middle:
 }
 
 /*
- * O(n) lookup of all tasks in the global runqueue. The real brainfuck
- * of lock contention and O(n). It's not really O(n) as only the queued,
- * but not running tasks are scanned, and is O(n) queued in the worst case
- * scenario only because the right task can be found before scanning all of
- * them.
- * Tasks are selected in this order:
- * Real time tasks are selected purely by their static priority and in the
- * order they were queued, so the lowest value idx, and the first queued task
- * of that priority value is chosen.
- * If no real time tasks are found, the SCHED_ISO priority is checked, and
- * all SCHED_ISO tasks have the same priority value, so they're selected by
- * the earliest deadline value.
- * If no SCHED_ISO tasks are found, SCHED_NORMAL tasks are selected by the
- * earliest deadline.
- * Finally if no SCHED_NORMAL tasks are found, SCHED_IDLEPRIO tasks are
- * selected by the earliest deadline.
+ * Task selection with skiplists is a simple matter of picking off the first
+ * task in the sorted list, an O(1) operation. The only time it takes longer
+ * is if tasks do not have suitable affinity and then we iterate over entries
+ * till we find the first that does. Worst case here is no tasks with suitable
+ * affinity and taking O(n).
  */
 static inline struct
 task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
 {
-	struct task_struct *edt = NULL;
-	unsigned long idx = -1;
+	struct task_struct *edt = idle;
+	skiplist_node *node = grq.node;
+	u64 earliest_deadline = ~0ULL;
+
+	while ((node = node->next[0]) != grq.node) {
+		struct task_struct *p = node->value;
+		int tcpu;
 
-	do {
-		struct list_head *queue;
-		struct task_struct *p;
-		u64 earliest_deadline;
+		/* Make sure affinity is ok */
+		if (needs_other_cpu(p, cpu))
+			continue;
 
-		idx = next_sched_bit(grq.prio_bitmap, ++idx);
-		if (idx >= PRIO_LIMIT)
-			return idle;
-		queue = grq.queue + idx;
-
-		if (idx < MAX_RT_PRIO) {
-			/* We found an rt task */
-			list_for_each_entry(p, queue, run_list) {
-				/* Make sure cpu affinity is ok */
-				if (needs_other_cpu(p, cpu))
-					continue;
-				edt = p;
-				goto out_take;
-			}
-			/*
-			 * None of the RT tasks at this priority can run on
-			 * this cpu
-			 */
+#ifdef CONFIG_SMT_NICE
+		if (!smt_should_schedule(p, cpu))
 			continue;
-		}
+#endif
 
-		/*
-		 * No rt tasks. Find the earliest deadline task. Now we're in
-		 * O(n) territory.
-		 */
-		earliest_deadline = ~0ULL;
-		list_for_each_entry(p, queue, run_list) {
+		if (!sched_interactive && (tcpu = task_cpu(p)) != cpu) {
 			u64 dl;
 
-			/* Make sure cpu affinity is ok */
-			if (needs_other_cpu(p, cpu))
+			if (task_sticky(p) && scaling_rq(rq))
 				continue;
-
-#ifdef CONFIG_SMT_NICE
-			if (!smt_should_schedule(p, cpu))
+			dl = p->deadline << locality_diff(tcpu, rq);
+			if (unlikely(!deadline_before(dl, earliest_deadline)))
 				continue;
-#endif
-			/*
-			 * Soft affinity happens here by not scheduling a task
-			 * with its sticky flag set that ran on a different CPU
-			 * last when the CPU is scaling, or by greatly biasing
-			 * against its deadline when not, based on cpu cache
-			 * locality.
-			 */
-			if (sched_interactive)
-				dl = p->deadline;
-			else {
-				int tcpu = task_cpu(p);
-
-				if (tcpu != cpu && task_sticky(p) && scaling_rq(rq))
-					continue;
-				dl = p->deadline << locality_diff(tcpu, rq);
-			}
-
-			if (deadline_before(dl, earliest_deadline)) {
-				earliest_deadline = dl;
-				edt = p;
-			}
+			earliest_deadline = dl;
+			edt = p;
+			/* We continue even though we've found the earliest
+			 * deadline task as the locality offset means there
+			 * may be a better candidate after it. */
+			continue;
 		}
-	} while (!edt);
-
-out_take:
-	take_task(cpu, edt);
+		/* This wouldn't happen if we encountered a better deadline from
+		 * another CPU and have already set edt. */
+		if (likely(p->deadline < earliest_deadline))
+			edt = p;
+		break;
+	}
+	if (likely(edt != idle))
+		take_task(cpu, edt);
 	return edt;
 }
-
 
 /*
  * Print scheduling while atomic bug:
 */
@@ -7257,6 +7237,9 @@ void __init sched_init(void)
 	grq.iso_ticks = 0;
 	grq.iso_refractory = false;
 	grq.noc = 1;
+	grq.node = skiplist_init();
+	grq.sl = new_skiplist(grq.node);
+
 #ifdef CONFIG_SMP
 	init_defrootdomain();
 	grq.qnr = grq.idle_cpus = 0;
@@ -7308,11 +7291,6 @@ void __init sched_init(void)
 	}
 #endif
 
-	for (i = 0; i < PRIO_LIMIT; i++)
-		INIT_LIST_HEAD(grq.queue + i);
-	/* delimiter for bitsearch */
-	__set_bit(PRIO_LIMIT, grq.prio_bitmap);
-
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	INIT_HLIST_HEAD(&init_task.preempt_notifiers);
 #endif
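For readers following the enqueue_task() change, here is a standalone user-space
sketch of the sl_id key mapping its comment describes. The priority constants
below are assumed to follow BFS's usual layout (RT priorities below ISO_PRIO,
NORMAL_PRIO and IDLE_PRIO above it) and the printf scaffolding is illustration
only, not code from this patch:

/* sl_id_demo.c - sketch of the skiplist sort key used by enqueue_task() */
#include <stdio.h>
#include <stdint.h>

#define ISO_PRIO	100	/* assumed values, not taken from this patch */
#define NORMAL_PRIO	101
#define IDLE_PRIO	102

/* Map (prio, deadline) to the skiplist sort key, as enqueue_task() does. */
static uint64_t sl_id(int prio, uint64_t deadline)
{
	uint64_t key;

	if (prio <= ISO_PRIO) {
		/* RT and ISO run FIFO within a priority: key is the priority. */
		key = prio;
	} else {
		/* Normal and batch sort by deadline... */
		key = deadline;
		/* ...idleprio is pushed past any realistic deadline. */
		if (prio == IDLE_PRIO)
			key |= 0xF000000000000000ULL;
	}
	return key;
}

int main(void)
{
	printf("rt50     -> %#llx\n", (unsigned long long)sl_id(50, 12345));
	printf("iso      -> %#llx\n", (unsigned long long)sl_id(ISO_PRIO, 12345));
	printf("normal   -> %#llx\n", (unsigned long long)sl_id(NORMAL_PRIO, 98765));
	printf("idleprio -> %#llx\n", (unsigned long long)sl_id(IDLE_PRIO, 98765));
	return 0;
}

Because realtime and ISO keys are small priority numbers while normal tasks use
large deadline values, RT/ISO entries always sort ahead of everything else, and
the high-bit offset pushes idleprio tasks to the tail, giving the "landscape"
the comment describes.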
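Likewise, a minimal sketch of why dequeue_task() can delete in O(k) through the
node reference stored in the task struct: if each node keeps next and prev
pointers for every level it occupies, unlinking is one splice per level with no
lookup. The structure layout and MAXLEVEL constant here are assumptions for
illustration, not the kernel's skip list implementation:

/* delnode_demo.c - O(k) unlink via a stored node reference */
#include <stdio.h>
#include <stdint.h>

#define MAXLEVEL 16	/* matches the "max 16" levels mentioned above */

struct sl_node {
	uint64_t key;
	int level;			/* highest level this node occupies */
	struct sl_node *next[MAXLEVEL];
	struct sl_node *prev[MAXLEVEL];
};

/* Unlink @node from every level it is on: O(node->level), no search. */
static void sl_delnode(struct sl_node *node)
{
	int i;

	for (i = 0; i <= node->level; i++) {
		node->prev[i]->next[i] = node->next[i];
		node->next[i]->prev[i] = node->prev[i];
	}
}

int main(void)
{
	struct sl_node head, a, b;
	int i;

	/* Tiny two-entry list at level 0 only: head <-> a <-> b <-> head. */
	for (i = 0; i < MAXLEVEL; i++)
		head.next[i] = head.prev[i] = &head;
	head.level = 0;
	a = (struct sl_node){ .key = 1, .level = 0 };
	b = (struct sl_node){ .key = 2, .level = 0 };
	head.next[0] = &a; a.prev[0] = &head;
	a.next[0] = &b;    b.prev[0] = &a;
	b.next[0] = &head; head.prev[0] = &b;

	sl_delnode(&a);	/* removes "a" without walking the list */
	printf("first key after delete: %llu\n",
	       (unsigned long long)head.next[0]->key);	/* prints 2 */
	return 0;
}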
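Finally, a minimal sketch of the selection walk in earliest_deadline_task():
because the level-0 ring is already sorted by sl_id, the first entry off the
header whose CPU affinity fits is normally the task to run. The fake_node type
and pick_next() helper are illustrative stand-ins, not the kernel's skip list
API, and the locality/sticky deadline biasing is omitted for brevity:

/* pick_demo.c - first-suitable-task walk over a sorted level-0 ring */
#include <stdio.h>
#include <stdint.h>

struct fake_node {
	uint64_t key;			/* sl_id: priority or deadline based */
	int task_id;			/* stand-in for a task_struct pointer */
	unsigned int cpus_allowed;	/* bitmask of CPUs this task may use */
	struct fake_node *next;		/* level-0 forward pointer only */
};

/* Return the first runnable entry for @cpu, or NULL if none is suitable. */
static struct fake_node *pick_next(struct fake_node *head, int cpu)
{
	struct fake_node *node;

	/* The list is sorted by key, so the first entry whose affinity
	 * fits is the best choice - usually the very first one. */
	for (node = head->next; node != head; node = node->next) {
		if (node->cpus_allowed & (1U << cpu))
			return node;
	}
	return NULL;
}

int main(void)
{
	struct fake_node head = { .next = &head };
	struct fake_node t1 = { .key = 100, .task_id = 1, .cpus_allowed = 0x2 }; /* cpu1 only */
	struct fake_node t2 = { .key = 500, .task_id = 2, .cpus_allowed = 0x3 }; /* cpu0 or cpu1 */

	/* Build the sorted ring by hand: head -> t1 -> t2 -> head. */
	head.next = &t1;
	t1.next = &t2;
	t2.next = &head;

	struct fake_node *pick = pick_next(&head, 0);
	printf("cpu0 picks task %d\n", pick ? pick->task_id : -1);	/* task 2 */
	pick = pick_next(&head, 1);
	printf("cpu1 picks task %d\n", pick ? pick->task_id : -1);	/* task 1 */
	return 0;
}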