diff --git a/Documentation/scheduler/sched-MuQSS.txt b/Documentation/scheduler/sched-MuQSS.txt index 2bdcc98..c0841e2 100644 --- a/Documentation/scheduler/sched-MuQSS.txt +++ b/Documentation/scheduler/sched-MuQSS.txt @@ -4,4 +4,75 @@ See sched-BFS.txt for basic design; MuQSS is a per-cpu runqueue variant with one 8 level skiplist per runqueue, and fine grained locking for much more scalability. +Goals. + +The goal of the Multiple Queue Skiplist Scheduler, referred to as MuQSS from +here on (pronounced mux), is to completely do away with the complex designs of +the past for the cpu process scheduler and instead implement one that is very +simple in basic design. The main focus of MuQSS is to achieve excellent desktop +interactivity and responsiveness without heuristics and tuning knobs that are +difficult to understand, impossible to model and predict the effect of, and +which, when tuned for one workload, cause massive detriment to another, while +still being scalable to many CPUs and processes. + + +Design summary. + +MuQSS is best described as a per-cpu multiple runqueue, O(log n) insertion, O(1) +lookup, earliest effective virtual deadline first design, loosely based on EEVDF +(earliest eligible virtual deadline first) and my previous Staircase Deadline +scheduler, and evolved from the single runqueue O(n) BFS scheduler. Each +component shall be described in turn to explain its significance and the +reasoning behind it. + + +Design reasoning. + +In BFS, the use of a single runqueue across all CPUs meant that each CPU would +need to scan the entire runqueue looking for the process with the earliest +deadline and schedule that next, regardless of which CPU it originally came +from. This made BFS deterministic with respect to latency and provided +guaranteed latencies dependent on the number of processes and CPUs. The single +runqueue, however, meant that all CPUs would compete for the single lock +protecting it, which would lead to increasing lock contention as the number of +CPUs rose and appeared to limit scalability of common workloads beyond 16 +logical CPUs. Additionally, the O(n) lookup of the runqueue list obviously +increased overhead proportional to the number of queued processes and led to +cache thrashing while iterating over the linked list. + +MuQSS is an evolution of BFS, designed to maintain the same scheduling +decision mechanism and be virtually deterministic without relying on the +constrained design of the single runqueue, by splitting out the single runqueue +to be per-CPU and using skiplists instead of linked lists. + +The original reason for going back to a single runqueue design for BFS was that +once multiple runqueues are introduced, per-CPU or otherwise, there will be +complex interactions as each runqueue will be responsible for the scheduling +latency and fairness of the tasks only on its own runqueue, and to achieve +fairness and low latency across multiple CPUs, any advantage in throughput of +having CPU local tasks causes other disadvantages. This is because a very +complex balancing system is required to achieve, at best, some semblance of +fairness across CPUs, and it can only maintain relatively low latency for tasks +bound to the same CPUs, not across them. To improve said fairness and latency +across CPUs, the advantage of local runqueue locking, which makes for better +scalability, is lost due to having to grab multiple locks.
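+
+As a simplified illustration only (not the actual kernel code, and the helper
+name below is made up for this document), the per-runqueue skip list keeps its
+tasks sorted by a key derived from priority and virtual deadline, so the best
+candidate is always the first node and can be found without walking the list:
+
+	/* Sketch: peek at the best queued task on a runqueue, O(1). */
+	static inline struct task_struct *best_queued_task(struct rq *rq)
+	{
+		/* The first node after the skip list header has the lowest key. */
+		return rq->node.next[0]->value;
+	}
+
+Insertion into the skip list is O(log n), and the lowest key is also cached in
+the runqueue (best_key) so that other CPUs can glance at it locklessly and only
+take the remote runqueue lock when that key looks better than the best one they
+have already found.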
+ +MuQSS works around the problems inherent in multiple runqueue designs by +making its skip lists priority ordered and, through novel use of lockless +examination of every other runqueue, deciding whether it should take the earliest +deadline task from another runqueue for latency reasons, or for CPU balancing +reasons. It still does not have a balancing system, instead letting the +next task scheduling decision and the task wakeup CPU choice provide the +balancing by virtue of those selections. + + +Design: + +MuQSS is an 8 level skip list per runqueue variant of BFS. + +See sched-BFS.txt for some of the shared design details. + +Documentation yet to be completed. + + Con Kolivas Sun, 2nd October 2016 diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c index 58d8c54..1700be5 100644 --- a/kernel/sched/MuQSS.c +++ b/kernel/sched/MuQSS.c @@ -98,7 +98,6 @@ #define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) #define rt_task(p) rt_prio((p)->prio) -#define rt_queue(rq) rt_prio((rq)->rq_prio) #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) #define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ (policy) == SCHED_RR) @@ -107,19 +106,15 @@ #define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO) #define idleprio_task(p) unlikely(is_idle_policy((p)->policy)) #define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO) -#define idle_queue(rq) (unlikely(is_idle_policy((rq)->rq_policy))) #define is_iso_policy(policy) ((policy) == SCHED_ISO) #define iso_task(p) unlikely(is_iso_policy((p)->policy)) -#define iso_queue(rq) unlikely(is_iso_policy((rq)->rq_policy)) #define task_running_iso(p) unlikely((p)->prio == ISO_PRIO) -#define rq_running_iso(rq) ((rq)->rq_prio == ISO_PRIO) #define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) #define ISO_PERIOD (5 * HZ) -#define SCHED_PRIO(p) ((p) + MAX_RT_PRIO) #define STOP_PRIO (MAX_RT_PRIO - 1) /* @@ -139,7 +134,7 @@ void print_scheduler_version(void) { - printk(KERN_INFO "MuQSS CPU scheduler v0.111 by Con Kolivas.\n"); + printk(KERN_INFO "MuQSS CPU scheduler v0.112 by Con Kolivas.\n"); } /* @@ -670,6 +665,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) * remote lock we're migrating it to before enabling them. */ if (unlikely(task_on_rq_migrating(prev))) { + sched_info_dequeued(rq, prev); /* * We move the ownership of prev to the new cpu now.
ttwu can't * activate prev to the wrong cpu since it has to grab this @@ -780,6 +776,7 @@ static void update_load_avg(struct rq *rq) static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { skiplist_delete(rq->sl, &p->node); + rq->best_key = rq->node.next[0]->key; update_clocks(rq); if (!(flags & DEQUEUE_SAVE)) sched_info_dequeued(task_rq(p), p); @@ -862,6 +859,7 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) sched_info_queued(rq, p); randseed = (rq->niffies >> 10) & 0xFFFFFFFF; skiplist_insert(rq->sl, &p->node, sl_id, p, randseed); + rq->best_key = rq->node.next[0]->key; update_load_avg(rq); } @@ -1036,6 +1034,8 @@ static void resched_curr(struct rq *rq) if (test_tsk_need_resched(rq->curr)) return; + rq->preempt = rq->curr; + /* We're doing this without holding the rq lock if it's not task_rq */ set_tsk_need_resched(rq->curr); @@ -1122,6 +1122,24 @@ bool cpus_share_cache(int this_cpu, int that_cpu) return (this_rq->cpu_locality[that_cpu] < 3); } +/* As per resched_curr but only will resched idle task */ +static inline void resched_idle(struct rq *rq) +{ + if (test_tsk_need_resched(rq->idle)) + return; + + rq->preempt = rq->idle; + + set_tsk_need_resched(rq->idle); + + if (rq_local(rq)) { + set_preempt_need_resched(); + return; + } + + smp_send_reschedule(rq->cpu); +} + static struct rq *resched_best_idle(struct task_struct *p, int cpu) { cpumask_t tmpmask; @@ -1133,13 +1151,7 @@ static struct rq *resched_best_idle(struct task_struct *p, int cpu) rq = cpu_rq(best_cpu); if (!smt_schedule(p, rq)) return NULL; - /* - * Given we do this lockless, do one last check that the rq is still - * idle by the time we get here - */ - if (unlikely(!rq_idle(rq))) - return NULL; - resched_curr(rq); + resched_idle(rq); return rq; } @@ -1265,6 +1277,7 @@ static inline void deactivate_task(struct task_struct *p, struct rq *rq) p->on_rq = 0; atomic_dec(&grq.nr_running); + sched_info_dequeued(rq, p); } #ifdef CONFIG_SMP @@ -1284,7 +1297,7 @@ void set_task_cpu(struct task_struct *p, unsigned int cpu) WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || lockdep_is_held(&task_rq(p)->lock))); #endif - if (task_cpu(p) == cpu) + if (p->wake_cpu == cpu) return; trace_sched_migrate_task(p, cpu); perf_event_task_migrate(p); @@ -1296,7 +1309,7 @@ void set_task_cpu(struct task_struct *p, unsigned int cpu) */ smp_wmb(); - if (task_running(rq, p) && rq->online) { + if (task_running(rq, p)) { /* * We should only be calling this on a running task if we're * holding rq lock. @@ -1327,7 +1340,13 @@ void set_task_cpu(struct task_struct *p, unsigned int cpu) */ static inline void take_task(struct rq *rq, int cpu, struct task_struct *p) { - dequeue_task(task_rq(p), p, 0); + struct rq *p_rq = task_rq(p); + + dequeue_task(p_rq, p, DEQUEUE_SAVE); + if (p_rq != rq) { + sched_info_dequeued(p_rq, p); + sched_info_queued(rq, p); + } set_task_cpu(p, cpu); dec_qnr(); } @@ -1354,7 +1373,7 @@ static inline void return_task(struct task_struct *p, struct rq *rq, p->on_rq = TASK_ON_RQ_MIGRATING; else #endif - enqueue_task(rq, p, 0); + enqueue_task(rq, p, ENQUEUE_RESTORE); } } @@ -1539,23 +1558,6 @@ static inline bool needs_other_cpu(struct task_struct *p, int cpu) return false; } #define cpu_online_map (*(cpumask_t *)cpu_online_mask) -#ifdef CONFIG_HOTPLUG_CPU -/* - * Check to see if there is a task that is affined only to offline CPUs but - * still wants runtime. This happens to kernel threads during suspend/halt and - * disabling of CPUs. 
- */ -static inline bool online_cpus(struct task_struct *p) -{ - return (likely(cpumask_intersects(&cpu_online_map, &p->cpus_allowed))); -} -#else /* CONFIG_HOTPLUG_CPU */ -/* All available CPUs are always online without hotplug. */ -static inline bool online_cpus(struct task_struct *p) -{ - return true; -} -#endif static void try_preempt(struct task_struct *p, struct rq *this_rq) { @@ -1741,6 +1743,12 @@ void scheduler_ipi(void) * this IPI. */ preempt_fold_need_resched(); + + if (!idle_cpu(smp_processor_id()) || need_resched()) + return; + + irq_enter(); + irq_exit(); } static int valid_task_cpu(struct task_struct *p) @@ -2169,16 +2177,21 @@ static inline void init_schedstats(void) {} void wake_up_new_task(struct task_struct *p) { struct task_struct *parent, *rq_curr; + struct rq *rq, *new_rq; unsigned long flags; - struct rq *rq; parent = p->parent; raw_spin_lock_irqsave(&p->pi_lock, flags); p->state = TASK_RUNNING; - if (unlikely(needs_other_cpu(p, task_cpu(p)))) + /* Task_rq can't change yet on a new task */ + new_rq = rq = task_rq(p); + if (unlikely(needs_other_cpu(p, task_cpu(p)))) { set_task_cpu(p, valid_task_cpu(p)); - rq = __task_rq_lock(p); + new_rq = task_rq(p); + } + + double_rq_lock(rq, new_rq); update_clocks(rq); rq_curr = rq->curr; @@ -2194,29 +2207,30 @@ void wake_up_new_task(struct task_struct *p) * Share the timeslice between parent and child, thus the * total amount of pending timeslices in the system doesn't change, * resulting in more scheduling fairness. If it's negative, it won't - * matter since that's the same as being 0. current's time_slice is - * actually in rq_time_slice when it's running, as is its last_ran - * value. rq->rq_deadline is only modified within schedule() so it - * is always equal to current->deadline. + * matter since that's the same as being 0. rq->rq_deadline is only + * modified within schedule() so it is always equal to + * current->deadline. */ - p->last_ran = rq->rq_last_ran; + p->last_ran = rq_curr->last_ran; if (likely(rq_curr->policy != SCHED_FIFO)) { - rq->rq_time_slice /= 2; - if (unlikely(rq->rq_time_slice < RESCHED_US)) { + rq_curr->time_slice /= 2; + if (unlikely(rq_curr->time_slice < RESCHED_US)) { /* * Forking task has run out of timeslice. Reschedule it and * start its child with a new time slice and deadline. The * child will end up running first because its deadline will * be slightly earlier. */ - rq->rq_time_slice = 0; + rq_curr->time_slice = 0; __set_tsk_resched(rq_curr); - time_slice_expired(p, rq); + time_slice_expired(p, new_rq); if (suitable_idle_cpus(p)) resched_best_idle(p, task_cpu(p)); + else if (unlikely(rq != new_rq)) + try_preempt(p, new_rq); } else { - p->time_slice = rq->rq_time_slice; - if (rq_curr == parent && !suitable_idle_cpus(p)) { + p->time_slice = rq_curr->time_slice; + if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) { /* * The VM isn't cloned, so we're in a good position to * do child-runs-first in anticipation of an exec. 
This @@ -2224,13 +2238,14 @@ void wake_up_new_task(struct task_struct *p) */ __set_tsk_resched(rq_curr); } else - try_preempt(p, rq); + try_preempt(p, new_rq); } } else { - time_slice_expired(p, rq); - try_preempt(p, rq); + time_slice_expired(p, new_rq); + try_preempt(p, new_rq); } - task_rq_unlock(rq, p, &flags); + double_rq_unlock(rq, new_rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); } #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -3006,7 +3021,7 @@ static void pc_user_time(struct rq *rq, struct task_struct *p, static void update_cpu_clock_tick(struct rq *rq, struct task_struct *p) { - long account_ns = rq->clock_task - rq->rq_last_ran; + long account_ns = rq->clock_task - p->last_ran; struct task_struct *idle = rq->idle; unsigned long account_pc; @@ -3029,14 +3044,14 @@ update_cpu_clock_tick(struct rq *rq, struct task_struct *p) ts_account: /* time_slice accounting is done in usecs to avoid overflow on 32bit */ - if (rq->rq_policy != SCHED_FIFO && p != idle) { + if (p->policy != SCHED_FIFO && p != idle) { s64 time_diff = rq->clock - rq->timekeep_clock; niffy_diff(&time_diff, 1); - rq->rq_time_slice -= NS_TO_US(time_diff); + p->time_slice -= NS_TO_US(time_diff); } - rq->rq_last_ran = rq->clock_task; + p->last_ran = rq->clock_task; rq->timekeep_clock = rq->clock; } @@ -3048,7 +3063,7 @@ ts_account: static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p) { - long account_ns = rq->clock_task - rq->rq_last_ran; + long account_ns = rq->clock_task - p->last_ran; struct task_struct *idle = rq->idle; unsigned long account_pc; @@ -3066,14 +3081,14 @@ update_cpu_clock_switch(struct rq *rq, struct task_struct *p) ts_account: /* time_slice accounting is done in usecs to avoid overflow on 32bit */ - if (rq->rq_policy != SCHED_FIFO && p != idle) { + if (p->policy != SCHED_FIFO && p != idle) { s64 time_diff = rq->clock - rq->timekeep_clock; niffy_diff(&time_diff, 1); - rq->rq_time_slice -= NS_TO_US(time_diff); + p->time_slice -= NS_TO_US(time_diff); } - rq->rq_last_ran = rq->clock_task; + p->last_ran = rq->clock_task; rq->timekeep_clock = rq->clock; } @@ -3094,7 +3109,7 @@ static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) */ if (p == rq->curr && task_on_rq_queued(p)) { update_rq_clock(rq); - ns = rq->clock_task - rq->rq_last_ran; + ns = rq->clock_task - p->last_ran; if (unlikely((s64)ns < 0)) ns = 0; } @@ -3305,31 +3320,31 @@ static inline void no_iso_tick(struct rq *rq, int ticks) /* This manages tasks that have run out of timeslice during a scheduler_tick */ static void task_running_tick(struct rq *rq) { - struct task_struct *p; + struct task_struct *p = rq->curr; /* * If a SCHED_ISO task is running we increment the iso_ticks. In * order to prevent SCHED_ISO tasks from causing starvation in the * presence of true RT tasks we account those as iso_ticks as well. */ - if (rt_queue(rq) || rq_running_iso(rq)) + if (rt_task(p) || task_running_iso(p)) iso_tick(rq); else no_iso_tick(rq, 1); /* SCHED_FIFO tasks never run out of timeslice. */ - if (rq->rq_policy == SCHED_FIFO) + if (p->policy == SCHED_FIFO) return; - if (iso_queue(rq)) { - if (rq_running_iso(rq)) { + if (iso_task(p)) { + if (task_running_iso(p)) { if (rq->iso_refractory) { - /* - * SCHED_ISO task is running as RT and limit - * has been hit. Force it to reschedule as - * SCHED_NORMAL by zeroing its time_slice - */ - rq->rq_time_slice = 0; + /* + * SCHED_ISO task is running as RT and limit + * has been hit. 
Force it to reschedule as + * SCHED_NORMAL by zeroing its time_slice + */ + p->time_slice = 0; } } else if (!rq->iso_refractory) { /* Can now run again ISO. Reschedule to pick up prio */ @@ -3343,16 +3358,9 @@ static void task_running_tick(struct rq *rq) * run out of time slice in the interim. Otherwise, if they have * less than RESCHED_US μs of time slice left they will be rescheduled. */ - if (rq->dither) { - if (rq->rq_time_slice > HALF_JIFFY_US) - return; - else - rq->rq_time_slice = 0; - } else if (rq->rq_time_slice >= RESCHED_US) - return; + if (p->time_slice - rq->dither >= RESCHED_US) + return; out_resched: - p = rq->curr; - rq_lock(rq); __set_tsk_resched(p); rq_unlock(rq); @@ -3510,24 +3518,29 @@ static inline void check_deadline(struct task_struct *p, struct rq *rq) * is thus done here in an extremely simple first come best fit manner. * * This iterates over runqueues in cache locality order. In interactive mode - * it iterates over all CPUs and finds the task with the earliest deadline. + * it iterates over all CPUs and finds the task with the best key/deadline. * In non-interactive mode it will only take a task if it's from the current * runqueue or a runqueue with more tasks than the current one with a better - * deadline. + * key/deadline. */ static inline struct task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) { struct task_struct *edt = idle; - u64 earliest_deadline = ~0ULL; struct rq *locked = NULL; int i, best_entries = 0; + u64 best_key = ~0ULL; for (i = 0; i < num_possible_cpus(); i++) { struct rq *other_rq = rq_order(rq, i); int entries = other_rq->sl->entries; struct task_struct *p; + u64 key; + /* + * Check for queued entries lockless first. The local runqueue + * is locked so entries will always be accurate.
+ */ if (!sched_interactive) { if (entries <= best_entries) continue; @@ -3536,8 +3549,13 @@ task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct * /* if (i) implies other_rq != rq */ if (i) { + /* Check for best id queued lockless first */ + if (other_rq->best_key >= best_key) + continue; + if (unlikely(!trylock_rq(rq, other_rq))) continue; + /* Need to reevaluate entries after locking */ entries = other_rq->sl->entries; if (unlikely(!entries)) { @@ -3545,14 +3563,15 @@ task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct * continue; } } - p = other_rq->node.next[0]->value; - - if (!deadline_before(p->deadline, earliest_deadline)) { + key = other_rq->node.next[0]->key; + /* Reevaluate key after locking */ + if (unlikely(key >= best_key)) { if (i) unlock_rq(other_rq); continue; } + p = other_rq->node.next[0]->value; if (!smt_schedule(p, rq)) { if (i) unlock_rq(other_rq); @@ -3571,7 +3590,7 @@ task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct * } best_entries = entries; - earliest_deadline = p->deadline; + best_key = key; edt = p; } @@ -3637,10 +3656,8 @@ static inline void schedule_debug(struct task_struct *prev) */ static inline void set_rq_task(struct rq *rq, struct task_struct *p) { - rq->rq_time_slice = p->time_slice; rq->rq_deadline = p->deadline; - rq->rq_last_ran = p->last_ran = rq->clock_task; - rq->rq_policy = p->policy; + p->last_ran = rq->clock_task; rq->rq_prio = p->prio; #ifdef CONFIG_SMT_NICE rq->rq_mm = p->mm; @@ -3651,7 +3668,6 @@ static inline void set_rq_task(struct rq *rq, struct task_struct *p) static void reset_rq_task(struct rq *rq, struct task_struct *p) { rq->rq_deadline = p->deadline; - rq->rq_policy = p->policy; rq->rq_prio = p->prio; #ifdef CONFIG_SMT_NICE rq->rq_smt_bias = p->smt_bias; @@ -3763,6 +3779,7 @@ static void __sched notrace __schedule(bool preempt) cpu = smp_processor_id(); rq = cpu_rq(cpu); prev = rq->curr; + idle = rq->idle; /* * do_exit() calls schedule() with preemption disabled as an exception; @@ -3787,6 +3804,22 @@ static void __sched notrace __schedule(bool preempt) */ smp_mb__before_spinlock(); rq_lock(rq); +#ifdef CONFIG_SMP + if (rq->preempt) { + /* + * Make sure resched_curr hasn't triggered a preemption + * locklessly on a task that has since scheduled away. Spurious + * wakeup of idle is okay though. 
+ */ + if (unlikely(preempt && prev != idle && !test_tsk_need_resched(prev))) { + rq->preempt = NULL; + clear_preempt_need_resched(); + rq_unlock_irq(rq); + return; + } + rq->preempt = NULL; + } +#endif switch_count = &prev->nivcsw; if (!preempt && prev->state) { @@ -3821,20 +3854,15 @@ static void __sched notrace __schedule(bool preempt) update_clocks(rq); update_cpu_clock_switch(rq, prev); if (rq->clock - rq->last_tick > HALF_JIFFY_NS) - rq->dither = false; + rq->dither = 0; else - rq->dither = true; + rq->dither = HALF_JIFFY_US; clear_tsk_need_resched(prev); clear_preempt_need_resched(); - idle = rq->idle; if (idle != prev) { - /* Update all the information stored on struct rq */ - prev->time_slice = rq->rq_time_slice; - prev->deadline = rq->rq_deadline; check_deadline(prev, rq); - prev->last_ran = rq->clock_task; return_task(prev, rq, cpu, deactivate); } @@ -5059,6 +5087,7 @@ SYSCALL_DEFINE0(sched_yield) p = current; rq = this_rq_lock(); + time_slice_expired(p, rq); schedstat_inc(task_rq(p), yld_count); /* @@ -5173,6 +5202,7 @@ EXPORT_SYMBOL(yield); */ int __sched yield_to(struct task_struct *p, bool preempt) { + struct task_struct *rq_p; struct rq *rq, *p_rq; unsigned long flags; int yielded = 0; @@ -5192,18 +5222,19 @@ again: } double_rq_lock(rq, p_rq); - if (task_rq(p) != p_rq) { + if (unlikely(task_rq(p) != p_rq)) { double_rq_unlock(rq, p_rq); goto again; } yielded = 1; - if (p->deadline > rq->rq_deadline) - p->deadline = rq->rq_deadline; - p->time_slice += rq->rq_time_slice; - rq->rq_time_slice = 0; + rq_p = rq->curr; + if (p->deadline > rq_p->deadline) + p->deadline = rq_p->deadline; + p->time_slice += rq_p->time_slice; if (p->time_slice > timeslice()) p->time_slice = timeslice(); + time_slice_expired(rq_p, rq); if (preempt && rq != p_rq) resched_task(p_rq->curr); double_rq_unlock(rq, p_rq); @@ -5818,15 +5849,17 @@ static void bind_zero(int src_cpu) do_each_thread(t, p) { if (cpumask_test_cpu(src_cpu, tsk_cpus_allowed(p))) { - cpumask_clear_cpu(src_cpu, tsk_cpus_allowed(p)); - cpumask_set_cpu(0, tsk_cpus_allowed(p)); + bool local = (task_cpu(p) == src_cpu); + + /* task_running is the cpu stopper thread */ + if (local && task_running(task_rq(p), p)) + continue; + atomic_clear_cpu(src_cpu, tsk_cpus_allowed(p)); + atomic_set_cpu(0, tsk_cpus_allowed(p)); p->zerobound = true; bound++; - if (task_cpu(p) == src_cpu) { + if (local) set_task_cpu(p, 0); - if (task_running(task_rq(p), p)) - resched_task(p); - } } } while_each_thread(t, p); @@ -7552,7 +7585,7 @@ void __init sched_init(void) rq->last_jiffy = jiffies; rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc = rq->iowait_pc = rq->idle_pc = 0; - rq->dither = false; + rq->dither = 0; set_rq_task(rq, &init_task); rq->iso_ticks = 0; rq->iso_refractory = false; @@ -7800,21 +7833,7 @@ void vtime_account_system_irqsafe(struct task_struct *tsk) local_irq_restore(flags); } EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe); - -#ifndef __ARCH_HAS_VTIME_TASK_SWITCH -void vtime_task_switch(struct task_struct *prev) -{ - if (is_idle_task(prev)) - vtime_account_idle(prev); - else - vtime_account_system(prev); - - vtime_account_user(prev); - arch_vtime_task_switch(prev); -} -#endif - -#else +#else /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ /* * Perform (stime * rtime) / total, but avoid multiplication overflow by * losing precision when the numbers are big. 
@@ -7936,7 +7955,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime thread_group_cputime(p, &cputime); cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); } -#endif +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ void init_idle_bootup_task(struct task_struct *idle) {} diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h index d2d6696..f8d0d58 100644 --- a/kernel/sched/MuQSS.h +++ b/kernel/sched/MuQSS.h @@ -22,11 +22,11 @@ struct rq { /* Stored data about rq->curr to work outside rq lock */ u64 rq_deadline; - unsigned int rq_policy; - int rq_time_slice; - u64 rq_last_ran; int rq_prio; + /* Best queued id for use outside lock */ + u64 best_key; + unsigned long last_scheduler_tick; /* Last jiffy this RQ ticked */ unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */ u64 niffies; /* Last time this RQ updated rq clock */ @@ -47,6 +47,8 @@ struct rq { skiplist_node node; skiplist *sl; #ifdef CONFIG_SMP + struct task_struct *preempt; /* Preempt triggered on this task */ + int cpu; /* cpu of this runqueue */ bool online; @@ -78,7 +80,7 @@ struct rq { u64 clock, old_clock, last_tick; u64 clock_task; - bool dither; + int dither; int iso_ticks; bool iso_refractory;