diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c index 4346276..f3ee97b 100644 --- a/kernel/sched/MuQSS.c +++ b/kernel/sched/MuQSS.c @@ -117,7 +117,7 @@ #define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) -#define ISO_PERIOD ((5 * HZ * num_online_cpus()) + 1) +#define ISO_PERIOD (5 * HZ) #define SCHED_PRIO(p) ((p) + MAX_RT_PRIO) #define STOP_PRIO (MAX_RT_PRIO - 1) @@ -139,7 +139,7 @@ void print_scheduler_version(void) { - printk(KERN_INFO "MuQSS CPU scheduler v0.108 by Con Kolivas.\n"); + printk(KERN_INFO "MuQSS CPU scheduler v0.110 by Con Kolivas.\n"); } /* @@ -177,7 +177,7 @@ static inline int timeslice(void) /* * The global runqueue data that all CPUs work off. Contains either atomic - * variables or iso variables protected by iso_lock. + * variables and a cpu bitmap set atomically. */ struct global_rq { atomic_t nr_running; @@ -186,11 +186,7 @@ struct global_rq { atomic_t qnr; /* queued not running */ #ifdef CONFIG_SMP cpumask_t cpu_idle_map; - bool idle_cpus; #endif - raw_spinlock_t iso_lock; - int iso_ticks; - bool iso_refractory; }; #ifdef CONFIG_SMP @@ -261,8 +257,6 @@ int __weak arch_sd_sibling_asym_packing(void) struct rq *uprq; #endif /* CONFIG_SMP */ -static inline void update_rq_clock(struct rq *rq); - /* * Sanity check should sched_clock return bogus values. We make sure it does * not appear to go backwards, and use jiffies to determine the maximum and @@ -296,27 +290,6 @@ static inline int cpu_of(struct rq *rq) } #endif -/* - * Niffies are a globally increasing nanosecond counter. Whenever a runqueue - * clock is updated with the rq->lock held, it is an opportunity to update the - * niffies value. Any CPU can update it by adding how much its clock has - * increased since it last updated niffies, minus any added niffies by other - * CPUs. - */ -static inline void update_clocks(struct rq *rq) -{ - s64 ndiff; - long jdiff; - - update_rq_clock(rq); - ndiff = rq->clock - rq->old_clock; - rq->old_clock = rq->clock; - jdiff = jiffies - rq->last_jiffy; - niffy_diff(&ndiff, jdiff); - rq->last_jiffy += jdiff; - rq->niffies += ndiff; -} - #include "stats.h" #ifndef prepare_arch_switch @@ -347,6 +320,30 @@ static inline void update_rq_clock(struct rq *rq) update_rq_clock_task(rq, delta); } +/* + * Niffies are a globally increasing nanosecond counter. They're only used by + * update_load_avg and time_slice_expired, however deadlines are based on them + * across CPUs. Update them whenever we will call one of those functions, and + * synchronise them across CPUs whenever we hold both runqueue locks. + */ +static inline void update_clocks(struct rq *rq) +{ + s64 ndiff; + long jdiff; + + update_rq_clock(rq); + ndiff = rq->clock - rq->old_clock; + if (unlikely(!ndiff)) + return; + rq->old_clock = rq->clock; + ndiff -= rq->niffies - rq->last_niffy; + jdiff = jiffies - rq->last_jiffy; + niffy_diff(&ndiff, jdiff); + rq->last_jiffy += jdiff; + rq->niffies += ndiff; + rq->last_niffy = rq->niffies; +} + static inline int task_current(struct rq *rq, struct task_struct *p) { return rq->curr == p; @@ -402,6 +399,19 @@ static inline struct rq *this_rq_lock(void) } /* + * Any time we have two runqueues locked we use that as an opportunity to + * synchronise niffies to the highest value as idle ticks may have artificially + * kept niffies low on one CPU and the truth can only be later. 
+ */ +static inline void synchronise_niffies(struct rq *rq1, struct rq *rq2) +{ + if (rq1->niffies > rq2->niffies) + rq2->niffies = rq1->niffies; + else + rq1->niffies = rq2->niffies; +} + +/* * double_rq_lock - safely lock two runqueues * * Note this does not disable interrupts like task_rq_lock, @@ -432,6 +442,7 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) __acquire(rq2->lock); /* Fake it out ;) */ } else __double_rq_lock(rq1, rq2); + synchronise_niffies(rq1, rq2); } /* @@ -462,6 +473,7 @@ static inline void lock_second_rq(struct rq *rq1, struct rq *rq2) raw_spin_unlock(&rq1->lock); __double_rq_lock(rq1, rq2); } + synchronise_niffies(rq1, rq2); } static inline void lock_all_rqs(void) @@ -489,11 +501,12 @@ static inline void unlock_all_rqs(void) } /* Specially nest trylock an rq */ -static inline bool trylock_rq(struct rq *rq) +static inline bool trylock_rq(struct rq *this_rq, struct rq *rq) { if (unlikely(!do_raw_spin_trylock(&rq->lock))) return false; spin_acquire(&rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); + synchronise_niffies(this_rq, rq); return true; } @@ -510,12 +523,6 @@ static inline void rq_lock_irq(struct rq *rq) raw_spin_lock_irq(&rq->lock); } -static inline void time_lock_rq(struct rq *rq) -{ - rq_lock(rq); - update_clocks(rq); -} - static inline void rq_unlock_irq(struct rq *rq) __releases(rq->lock) { @@ -553,15 +560,6 @@ static inline struct rq return rq; } -static inline struct rq -*time_task_rq_lock(struct task_struct *p, unsigned long *flags) -{ - struct rq *rq = task_rq_lock(p, flags); - - update_clocks(rq); - return rq; -} - static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) __releases(rq->lock) __releases(p->pi_lock) @@ -626,13 +624,6 @@ void resched_task(struct task_struct *p) smp_send_reschedule(cpu); } -/* Entered with rq locked */ -static inline void resched_if_idle(struct rq *rq) -{ - if (rq_idle(rq)) - resched_task(rq->curr); -} - /* * A task that is not running or queued will not have a node set. * A task that is queued but not running will have a node set. @@ -643,7 +634,8 @@ static inline bool task_queued(struct task_struct *p) return !skiplist_node_empty(&p->node); } -static void enqueue_task(struct task_struct *p, struct rq *rq); +static void enqueue_task(struct rq *rq, struct task_struct *p, int flags); +static inline void resched_if_idle(struct rq *rq); static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { @@ -678,24 +670,27 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) * remote lock we're migrating it to before enabling them. */ if (unlikely(task_on_rq_migrating(prev))) { - struct rq *rq2 = task_rq(prev); - + /* + * We move the ownership of prev to the new cpu now. ttwu can't + * activate prev to the wrong cpu since it has to grab this + * runqueue in ttwu_remote. 
+ */ + task_thread_info(prev)->cpu = prev->wake_cpu; raw_spin_unlock(&rq->lock); - rq_lock(rq2); + raw_spin_lock(&prev->pi_lock); + rq = __task_rq_lock(prev); /* Check that someone else hasn't already queued prev */ - if (likely(task_on_rq_migrating(prev) && !task_queued(prev))) { - enqueue_task(prev, rq2); + if (likely(!task_queued(prev))) { + enqueue_task(rq, prev, 0); prev->on_rq = TASK_ON_RQ_QUEUED; /* Wake up the CPU if it's not already running */ - resched_if_idle(rq2); + resched_if_idle(rq); } - rq_unlock(rq2); - - local_irq_enable(); - } else + raw_spin_unlock(&prev->pi_lock); + } #endif - raw_spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); } static inline bool deadline_before(u64 deadline, u64 time) @@ -747,10 +742,14 @@ static inline int rq_load(struct rq *rq) return rq->sl->entries + !rq_idle(rq); } +static inline bool rq_local(struct rq *rq); + /* * Update the load average for feeding into cpu frequency governors. Use a * rough estimate of a rolling average with ~ time constant of 32ms. * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144 + * Make sure a call to update_clocks has been made before calling this to get + * an updated rq->niffies. */ static void update_load_avg(struct rq *rq) { @@ -764,9 +763,11 @@ static void update_load_avg(struct rq *rq) load = 0; load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; rq->load_avg = load; - } + } else + return; + rq->load_update = rq->clock; - if (likely(rq->cpu == smp_processor_id())) + if (likely(rq_local(rq))) cpufreq_trigger(rq->niffies, rq->load_avg); } @@ -776,10 +777,12 @@ static void update_load_avg(struct rq *rq) * and does not require a full look up. Thus it occurs in O(k) time where k * is the "level" of the list the task was stored at - usually < 4, max 8. */ -static void dequeue_task(struct task_struct *p, struct rq *rq) +static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { skiplist_delete(rq->sl, &p->node); - sched_info_dequeued(task_rq(p), p); + update_clocks(rq); + if (!(flags & DEQUEUE_SAVE)) + sched_info_dequeued(task_rq(p), p); update_load_avg(rq); } @@ -806,26 +809,15 @@ static bool idleprio_suitable(struct task_struct *p) * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check * that the iso_refractory flag is not set. */ -static bool isoprio_suitable(void) +static inline bool isoprio_suitable(struct rq *rq) { - return !grq.iso_refractory; -} - -/* - * Check to see if p can run on cpu, and if not, whether there are any online - * CPUs it can run on instead. - */ -static inline bool needs_other_cpu(struct task_struct *p, int cpu) -{ - if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed))) - return true; - return false; + return !rq->iso_refractory; } /* * Adding to the runqueue. Enter with rq locked. */ -static void enqueue_task(struct task_struct *p, struct rq *rq) +static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { unsigned int randseed; u64 sl_id; @@ -833,7 +825,7 @@ static void enqueue_task(struct task_struct *p, struct rq *rq) if (!rt_task(p)) { /* Check it hasn't gotten rt from PI */ if ((idleprio_task(p) && idleprio_suitable(p)) || - (iso_task(p) && isoprio_suitable())) + (iso_task(p) && isoprio_suitable(rq))) p->prio = p->normal_prio; else p->prio = NORMAL_PRIO; @@ -865,17 +857,14 @@ static void enqueue_task(struct task_struct *p, struct rq *rq) * Some architectures don't have better than microsecond resolution * so mask out ~microseconds as the random seed for skiplist insertion. 
*/ + update_clocks(rq); + if (!(flags & ENQUEUE_RESTORE)) + sched_info_queued(rq, p); randseed = (rq->niffies >> 10) & 0xFFFFFFFF; skiplist_insert(rq->sl, &p->node, sl_id, p, randseed); - sched_info_queued(rq, p); update_load_avg(rq); } -static inline void requeue_task(struct task_struct *p) -{ - sched_info_queued(task_rq(p), p); -} - /* * Returns the relative length of deadline all compared to the shortest * deadline which is that of nice -20. @@ -916,6 +905,17 @@ static inline int queued_notrunning(void) } #ifdef CONFIG_SMP +/* Entered with rq locked */ +static inline void resched_if_idle(struct rq *rq) +{ + if (rq_idle(rq) && rq->online) + resched_task(rq->curr); +} + +static inline bool rq_local(struct rq *rq) +{ + return (rq->cpu == smp_processor_id()); +} #ifdef CONFIG_SMT_NICE static const cpumask_t *thread_cpumask(int cpu); @@ -1039,7 +1039,7 @@ static void resched_curr(struct rq *rq) /* We're doing this without holding the rq lock if it's not task_rq */ set_tsk_need_resched(rq->curr); - if (rq->cpu == smp_processor_id()) { + if (rq_local(rq)) { set_preempt_need_resched(); return; } @@ -1122,31 +1122,36 @@ bool cpus_share_cache(int this_cpu, int that_cpu) return (this_rq->cpu_locality[that_cpu] < 3); } -static bool resched_best_idle(struct task_struct *p) +static struct rq *resched_best_idle(struct task_struct *p, int cpu) { cpumask_t tmpmask; struct rq *rq; int best_cpu; cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map); - best_cpu = best_mask_cpu(task_cpu(p), task_rq(p), &tmpmask); + best_cpu = best_mask_cpu(cpu, task_rq(p), &tmpmask); rq = cpu_rq(best_cpu); if (!smt_schedule(p, rq)) - return false; + return NULL; /* * Given we do this lockless, do one last check that the rq is still * idle by the time we get here */ if (unlikely(!rq_idle(rq))) - return false; + return NULL; resched_curr(rq); - return true; + return rq; } static inline void resched_suitable_idle(struct task_struct *p) { if (suitable_idle_cpus(p)) - resched_best_idle(p); + resched_best_idle(p, task_cpu(p)); +} + +static inline struct rq *rq_order(struct rq *rq, int cpu) +{ + return rq->rq_order[cpu]; } #else /* CONFIG_SMP */ static inline void set_cpuidle_map(int cpu) @@ -1170,6 +1175,25 @@ static inline void resched_curr(struct rq *rq) { resched_task(rq->curr); } + +static inline void resched_if_idle(struct rq *rq) +{ +} + +static inline bool rq_local(struct rq *rq) +{ + return true; +} + +static inline struct rq *rq_order(struct rq *rq, int cpu) +{ + return rq; +} + +static inline bool smt_schedule(struct task_struct *p, struct rq *rq) +{ + return true; +} #endif /* CONFIG_SMP */ static inline int normal_prio(struct task_struct *p) @@ -1207,7 +1231,7 @@ static int effective_prio(struct task_struct *p) */ static void activate_task(struct task_struct *p, struct rq *rq) { - update_clocks(rq); + resched_if_idle(rq); /* * Sleep time is in units of nanosecs, so shift by 20 to get a @@ -1224,7 +1248,7 @@ static void activate_task(struct task_struct *p, struct rq *rq) if (task_contributes_to_load(p)) atomic_dec(&grq.nr_uninterruptible); - enqueue_task(p, rq); + enqueue_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; atomic_inc(&grq.nr_running); inc_qnr(); @@ -1241,7 +1265,6 @@ static inline void deactivate_task(struct task_struct *p, struct rq *rq) p->on_rq = 0; atomic_dec(&grq.nr_running); - update_load_avg(rq); } #ifdef CONFIG_SMP @@ -1273,15 +1296,28 @@ void set_task_cpu(struct task_struct *p, unsigned int cpu) */ smp_wmb(); + if (task_running(rq, p)) { + /* + * We should only be calling this on a running task 
if we're + * holding rq lock. + */ + lockdep_assert_held(&rq->lock); + + /* + * We can't change the task_thread_info cpu on a running task + * as p will still be protected by the rq lock of the cpu it + * is still running on so we set the wake_cpu for it to be + * lazily updated once off the cpu. + */ + p->wake_cpu = cpu; + return; + } + if ((queued = task_queued(p))) - dequeue_task(p, rq); - /* - * If a task is running here it will have the cpu updated but will - * have to be forced onto another runqueue within return_task. - */ - task_thread_info(p)->cpu = cpu; + dequeue_task(rq, p, 0); + task_thread_info(p)->cpu = p->wake_cpu = cpu; if (queued) - enqueue_task(p, cpu_rq(cpu)); + enqueue_task(cpu_rq(cpu), p, 0); } #endif /* CONFIG_SMP */ @@ -1291,7 +1327,7 @@ void set_task_cpu(struct task_struct *p, unsigned int cpu) */ static inline void take_task(struct rq *rq, int cpu, struct task_struct *p) { - dequeue_task(p, task_rq(p)); + dequeue_task(task_rq(p), p, 0); set_task_cpu(p, cpu); dec_qnr(); } @@ -1307,18 +1343,18 @@ static inline void return_task(struct task_struct *p, struct rq *rq, deactivate_task(p, rq); else { inc_qnr(); -#if CONFIG_SMP +#ifdef CONFIG_SMP /* * set_task_cpu was called on the running task that doesn't * want to deactivate so it has to be enqueued to a different * CPU and we need its lock. Tag it to be moved with as the * lock is dropped in finish_lock_switch. */ - if (unlikely(task_cpu(p) != cpu)) + if (unlikely(p->wake_cpu != cpu)) p->on_rq = TASK_ON_RQ_MIGRATING; else #endif - enqueue_task(p, rq); + enqueue_task(rq, p, 0); } } @@ -1492,6 +1528,16 @@ can_preempt(struct task_struct *p, int prio, u64 deadline) } #ifdef CONFIG_SMP +/* + * Check to see if p can run on cpu, and if not, whether there are any online + * CPUs it can run on instead. 
+ */ +static inline bool needs_other_cpu(struct task_struct *p, int cpu) +{ + if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed))) + return true; + return false; +} #define cpu_online_map (*(cpumask_t *)cpu_online_mask) #ifdef CONFIG_HOTPLUG_CPU /* @@ -1516,7 +1562,7 @@ static void try_preempt(struct task_struct *p, struct rq *this_rq) int i, this_entries = rq_load(this_rq); cpumask_t tmp; - if (suitable_idle_cpus(p) && resched_best_idle(p)) + if (suitable_idle_cpus(p) && resched_best_idle(p, task_cpu(p))) return; /* IDLEPRIO tasks never preempt anything but idle */ @@ -1655,7 +1701,6 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) rq = __task_rq_lock(p); if (likely(task_on_rq_queued(p))) { - update_clocks(rq); ttwu_do_wakeup(rq, p, wake_flags); ret = 1; } @@ -1695,7 +1740,6 @@ void scheduler_ipi(void) */ preempt_fold_need_resched(); } -#endif /* * For a task that's just being woken up we have a valuable balancing @@ -1704,28 +1748,48 @@ void scheduler_ipi(void) */ static inline int select_best_cpu(struct task_struct *p) { - struct rq *rq = task_rq(p), *best_rq = rq; unsigned int idlest = ~0U; + struct rq *rq, *best_rq; int i; + if (suitable_idle_cpus(p)) { + int cpu = task_cpu(p); + + if (unlikely(needs_other_cpu(p, cpu))) + cpu = cpumask_any(tsk_cpus_allowed(p)); + rq = resched_best_idle(p, cpu); + if (likely(rq)) + return rq->cpu; + } + best_rq = rq = task_rq(p); + for (i = 0; i < num_possible_cpus(); i++) { struct rq *other_rq = rq->rq_order[i]; int entries; if (!other_rq->online) continue; - if (other_rq != rq && needs_other_cpu(p, other_rq->cpu)) + if (needs_other_cpu(p, other_rq->cpu)) continue; entries = rq_load(other_rq); if (entries >= idlest) continue; idlest = entries; best_rq = other_rq; - if (!idlest) - break; } return best_rq->cpu; } +#else /* CONFIG_SMP */ +static inline int select_best_cpu(struct task_struct *p) +{ + return 0; +} + +static struct rq *resched_best_idle(struct task_struct *p, int cpu) +{ + return NULL; +} +#endif /* CONFIG_SMP */ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) { @@ -2091,15 +2155,10 @@ void wake_up_new_task(struct task_struct *p) if (unlikely(needs_other_cpu(p, task_cpu(p)))) set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p))); rq = __task_rq_lock(p); + update_clocks(rq); rq_curr = rq->curr; /* - * Reinit new task deadline as its creator deadline could have changed - * since call to dup_task_struct(). - */ - p->deadline = rq->rq_deadline; - - /* * Make sure we do not leak PI boosting priority to the child. */ p->prio = rq_curr->normal_prio; @@ -2130,7 +2189,7 @@ void wake_up_new_task(struct task_struct *p) __set_tsk_resched(rq_curr); time_slice_expired(p, rq); if (suitable_idle_cpus(p)) - resched_best_idle(p); + resched_best_idle(p, task_cpu(p)); } else { p->time_slice = rq->rq_time_slice; if (rq_curr == parent && !suitable_idle_cpus(p)) { @@ -3010,7 +3069,7 @@ static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) * thread, breaking clock_gettime(). 
*/ if (p == rq->curr && task_on_rq_queued(p)) { - update_clocks(rq); + update_rq_clock(rq); ns = rq->clock_task - rq->rq_last_ran; if (unlikely((s64)ns < 0)) ns = 0; @@ -3182,37 +3241,12 @@ void account_idle_ticks(unsigned long ticks) } #endif -static inline void grq_iso_lock(void) - __acquires(grq.iso_lock) -{ - raw_spin_lock(&grq.iso_lock); -} - -static inline void grq_iso_unlock(void) - __releases(grq.iso_lock) -{ - raw_spin_unlock(&grq.iso_lock); -} - /* * Functions to test for when SCHED_ISO tasks have used their allocated - * quota as real time scheduling and convert them back to SCHED_NORMAL. - * Where possible, the data is tested lockless, to avoid grabbing iso_lock - * because the occasional inaccurate result won't matter. However the - * tick data is only ever modified under lock. iso_refractory is only simply - * set to 0 or 1 so it's not worth grabbing the lock yet again for that. + * quota as real time scheduling and convert them back to SCHED_NORMAL. All + * data is modified only by the local runqueue during scheduler_tick with + * interrupts disabled. */ -static bool set_iso_refractory(void) -{ - grq.iso_refractory = true; - return grq.iso_refractory; -} - -static bool clear_iso_refractory(void) -{ - grq.iso_refractory = false; - return grq.iso_refractory; -} /* * Test if SCHED_ISO tasks have run longer than their alloted period as RT @@ -3220,35 +3254,27 @@ static bool clear_iso_refractory(void) * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a * slow division. */ -static bool test_ret_isorefractory(struct rq *rq) +static inline void iso_tick(struct rq *rq) { - if (likely(!grq.iso_refractory)) { - if (grq.iso_ticks > ISO_PERIOD * sched_iso_cpu) - return set_iso_refractory(); - } else { - if (grq.iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) - return clear_iso_refractory(); + rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; + rq->iso_ticks += 100; + if (rq->iso_ticks > ISO_PERIOD * sched_iso_cpu) { + rq->iso_refractory = true; + if (unlikely(rq->iso_ticks > ISO_PERIOD * 100)) + rq->iso_ticks = ISO_PERIOD * 100; } - return grq.iso_refractory; -} - -static void iso_tick(void) -{ - grq_iso_lock(); - grq.iso_ticks += 100; - grq_iso_unlock(); } /* No SCHED_ISO task was running so decrease rq->iso_ticks */ -static inline void no_iso_tick(void) -{ - if (grq.iso_ticks) { - grq_iso_lock(); - grq.iso_ticks -= grq.iso_ticks / ISO_PERIOD + 1; - if (unlikely(grq.iso_refractory && grq.iso_ticks < - ISO_PERIOD * (sched_iso_cpu * 115 / 128))) - clear_iso_refractory(); - grq_iso_unlock(); +static inline void no_iso_tick(struct rq *rq, int ticks) +{ + if (rq->iso_ticks > 0 || rq->iso_refractory) { + rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD; + if (rq->iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) { + rq->iso_refractory = false; + if (unlikely(rq->iso_ticks < 0)) + rq->iso_ticks = 0; + } } } @@ -3262,28 +3288,31 @@ static void task_running_tick(struct rq *rq) * order to prevent SCHED_ISO tasks from causing starvation in the * presence of true RT tasks we account those as iso_ticks as well. */ - if ((rt_queue(rq) || (iso_queue(rq) && !grq.iso_refractory))) { - if (grq.iso_ticks <= (ISO_PERIOD * 128) - 128) - iso_tick(); - } else - no_iso_tick(); + if (rt_queue(rq) || rq_running_iso(rq)) + iso_tick(rq); + else + no_iso_tick(rq, 1); + + /* SCHED_FIFO tasks never run out of timeslice. 
*/ + if (rq->rq_policy == SCHED_FIFO) + return; if (iso_queue(rq)) { - if (unlikely(test_ret_isorefractory(rq))) { - if (rq_running_iso(rq)) { - /* - * SCHED_ISO task is running as RT and limit - * has been hit. Force it to reschedule as - * SCHED_NORMAL by zeroing its time_slice - */ - rq->rq_time_slice = 0; + if (rq_running_iso(rq)) { + if (rq->iso_refractory) { + /* + * SCHED_ISO task is running as RT and limit + * has been hit. Force it to reschedule as + * SCHED_NORMAL by zeroing its time_slice + */ + rq->rq_time_slice = 0; } + } else if (!rq->iso_refractory) { + /* Can now run again ISO. Reschedule to pick up prio */ + goto out_resched; } } - /* SCHED_FIFO tasks never run out of timeslice. */ - if (rq->rq_policy == SCHED_FIFO) - return; /* * Tasks that were scheduled in the first half of a tick are not * allowed to run into the 2nd half of the next tick if they will @@ -3297,11 +3326,10 @@ static void task_running_tick(struct rq *rq) rq->rq_time_slice = 0; } else if (rq->rq_time_slice >= RESCHED_US) return; - +out_resched: p = rq->curr; rq_lock(rq); - requeue_task(p); __set_tsk_resched(p); rq_unlock(rq); } @@ -3317,12 +3345,13 @@ void scheduler_tick(void) sched_clock_tick(); update_rq_clock(rq); - update_cpu_clock_tick(rq, rq->curr); update_load_avg(rq); + update_cpu_clock_tick(rq, rq->curr); if (!rq_idle(rq)) task_running_tick(rq); else - no_iso_tick(); + no_iso_tick(rq, rq->last_scheduler_tick - rq->last_jiffy); + rq->last_scheduler_tick = rq->last_jiffy; rq->last_tick = rq->clock; perf_event_task_tick(); } @@ -3405,7 +3434,8 @@ static inline void preempt_latency_stop(int val) { } /* * The time_slice is only refilled when it is empty and that is when we set a - * new deadline. + * new deadline. Make sure update_clocks has been called recently to update + * rq->niffies. 
*/ static void time_slice_expired(struct task_struct *p, struct rq *rq) { @@ -3470,7 +3500,7 @@ task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct * int i, best_entries = 0; for (i = 0; i < num_possible_cpus(); i++) { - struct rq *other_rq = rq->rq_order[i]; + struct rq *other_rq = rq_order(rq, i); int entries = other_rq->sl->entries; struct task_struct *p; @@ -3482,7 +3512,7 @@ task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct * /* if (i) implies other_rq != rq */ if (i) { - if (unlikely(!trylock_rq(other_rq))) + if (unlikely(!trylock_rq(rq, other_rq))) continue; /* Need to reevaluate entries after locking */ entries = other_rq->sl->entries; @@ -3596,6 +3626,7 @@ static inline void set_rq_task(struct rq *rq, struct task_struct *p) static void reset_rq_task(struct rq *rq, struct task_struct *p) { + rq->rq_deadline = p->deadline; rq->rq_policy = p->policy; rq->rq_prio = p->prio; #ifdef CONFIG_SMT_NICE @@ -3787,12 +3818,15 @@ static void __sched notrace __schedule(bool preempt) next = idle; schedstat_inc(rq, sched_goidle); set_cpuidle_map(cpu); + update_load_avg(rq); } else { next = earliest_deadline_task(rq, cpu, idle); if (likely(next->prio != PRIO_LIMIT)) clear_cpuidle_map(cpu); - else + else { set_cpuidle_map(cpu); + update_load_avg(rq); + } } if (likely(prev != next)) { @@ -4061,8 +4095,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (prio > oldprio) resched_task(p); } else if (task_queued(p)) { - dequeue_task(p, rq); - enqueue_task(p, rq); + dequeue_task(rq, p, DEQUEUE_SAVE); + enqueue_task(rq, p, ENQUEUE_RESTORE); if (prio < oldprio) try_preempt(p, rq); } @@ -4094,7 +4128,7 @@ void set_user_nice(struct task_struct *p, long nice) * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. 
*/ - rq = time_task_rq_lock(p, &flags); + rq = task_rq_lock(p, &flags); /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -4112,8 +4146,8 @@ void set_user_nice(struct task_struct *p, long nice) p->prio = effective_prio(p); if (task_queued(p)) { - dequeue_task(p, rq); - enqueue_task(p, rq); + dequeue_task(rq, p, DEQUEUE_SAVE); + enqueue_task(rq, p, ENQUEUE_RESTORE); if (new_static < old_static) try_preempt(p, rq); } else if (task_running(rq, p)) { @@ -4267,8 +4301,8 @@ static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, reset_rq_task(rq, p); resched_task(p); } else if (task_queued(p)) { - dequeue_task(p, rq); - enqueue_task(p, rq); + dequeue_task(rq, p, DEQUEUE_SAVE); + enqueue_task(rq, p, ENQUEUE_RESTORE); if (p->prio < oldprio || p->rt_priority > oldrtprio) try_preempt(p, rq); } @@ -4370,7 +4404,7 @@ recheck: case SCHED_ISO: if (policy == SCHED_ISO) goto out; - if (policy == SCHED_NORMAL) + if (policy != SCHED_NORMAL) return -EPERM; break; case SCHED_BATCH: @@ -4435,7 +4469,6 @@ recheck: task_rq_unlock(rq, p, &flags); goto recheck; } - update_clocks(rq); p->sched_reset_on_fork = reset_on_fork; __setscheduler(p, rq, policy, param->sched_priority, pi); @@ -5003,7 +5036,6 @@ SYSCALL_DEFINE0(sched_yield) p = current; rq = this_rq_lock(); schedstat_inc(task_rq(p), yld_count); - requeue_task(p); /* * Since we are going to call schedule() anyway, there's @@ -5718,9 +5750,12 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, int dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); struct rq *dest_rq = cpu_rq(dest_cpu); + /* Switch rq locks here */ lock_second_rq(rq, dest_rq); set_task_cpu(p, dest_cpu); - rq_unlock(dest_rq); + rq_unlock(rq); + + rq = dest_rq; } out: if (queued && !cpumask_subset(new_mask, &old_mask)) @@ -7469,9 +7504,6 @@ void __init sched_init(void) atomic_set(&grq.nr_running, 0); atomic_set(&grq.nr_uninterruptible, 0); atomic64_set(&grq.nr_switches, 0); - raw_spin_lock_init(&grq.iso_lock); - grq.iso_ticks = 0; - grq.iso_refractory = false; skiplist_node_init(&init_task.node); #ifdef CONFIG_SMP @@ -7500,6 +7532,8 @@ void __init sched_init(void) rq->iowait_pc = rq->idle_pc = 0; rq->dither = false; set_rq_task(rq, &init_task); + rq->iso_ticks = 0; + rq->iso_refractory = false; #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h index 3b5d5db..d2d6696 100644 --- a/kernel/sched/MuQSS.h +++ b/kernel/sched/MuQSS.h @@ -20,13 +20,18 @@ struct rq { raw_spinlock_t lock; - /* Stored data about rq->curr to work outside grq lock */ + /* Stored data about rq->curr to work outside rq lock */ u64 rq_deadline; unsigned int rq_policy; int rq_time_slice; u64 rq_last_ran; int rq_prio; + unsigned long last_scheduler_tick; /* Last jiffy this RQ ticked */ + unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */ + u64 niffies; /* Last time this RQ updated rq clock */ + u64 last_niffy; /* Last niffies as updated by local clock */ + u64 load_update; /* When we last updated load */ unsigned long load_avg; /* Rolling load average */ #ifdef CONFIG_SMT_NICE @@ -50,8 +55,6 @@ struct rq { int *cpu_locality; /* CPU relative cache distance */ struct rq **rq_order; /* RQs ordered by relative cache distance */ - unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */ - u64 niffies; /* Last time this RQ updated rq clock */ #ifdef CONFIG_SCHED_SMT cpumask_t thread_mask; bool (*siblings_idle)(struct rq *rq); @@ -77,6 +80,9 @@ 
struct rq { u64 clock_task; bool dither; + int iso_ticks; + bool iso_refractory; + #ifdef CONFIG_SCHEDSTATS /* latency stats */ @@ -119,6 +125,41 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define raw_rq() raw_cpu_ptr(&runqueues) #endif /* CONFIG_SMP */ +/* + * {de,en}queue flags: + * + * DEQUEUE_SLEEP - task is no longer runnable + * ENQUEUE_WAKEUP - task just became runnable + * + * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks + * are in a known state which allows modification. Such pairs + * should preserve as much state as possible. + * + * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location + * in the runqueue. + * + * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) + * ENQUEUE_MIGRATED - the task was migrated during wakeup + * + */ + +#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ + +#define ENQUEUE_WAKEUP 0x01 +#define ENQUEUE_RESTORE 0x02 +#define ENQUEUE_MOVE 0x04 + +#define ENQUEUE_HEAD 0x08 +#define ENQUEUE_REPLENISH 0x10 +#ifdef CONFIG_SMP +#define ENQUEUE_MIGRATED 0x20 +#else +#define ENQUEUE_MIGRATED 0x00 +#endif + static inline u64 __rq_clock_broken(struct rq *rq) { return READ_ONCE(rq->clock); diff --git a/kernel/smpboot.c b/kernel/smpboot.c index fc0d8270..13bc43d 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -122,12 +122,12 @@ static int smpboot_thread_fn(void *data) if (kthread_should_park()) { __set_current_state(TASK_RUNNING); + preempt_enable(); if (ht->park && td->status == HP_THREAD_ACTIVE) { BUG_ON(td->cpu != smp_processor_id()); ht->park(td->cpu); td->status = HP_THREAD_PARKED; } - preempt_enable(); kthread_parkme(); /* We might have been woken for stop */ continue;
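
The niffies rework above means synchronise_niffies() can raise a runqueue's niffies while that CPU's own clock has not advanced, which is why update_clocks() now subtracts (niffies - last_niffy) before accumulating. The standalone userspace model below is not part of the patch: the toy_rq type is invented for illustration and the jiffy clamping done by niffy_diff() in the real code is omitted. It only shows that, with the last_niffy discount, an idle runqueue that gets bumped by a sync ends up at the same niffies value as the busy one instead of overshooting.

/*
 * Simplified userspace model of the niffies bookkeeping in this patch.
 * update_clocks() discounts (niffies - last_niffy) so nanoseconds already
 * contributed by synchronise_niffies() are not counted a second time.
 * niffy_diff()/jiffies clamping from the real code is left out for brevity.
 */
#include <stdio.h>
#include <stdint.h>

struct toy_rq {
	uint64_t clock;		/* local sched_clock value, ns */
	uint64_t old_clock;	/* clock at the last update_clocks() */
	uint64_t niffies;	/* monotonic per-rq nanosecond counter */
	uint64_t last_niffy;	/* niffies after the last local update */
};

static void update_clocks(struct toy_rq *rq, uint64_t now)
{
	int64_t ndiff;

	rq->clock = now;
	ndiff = rq->clock - rq->old_clock;
	if (!ndiff)
		return;
	rq->old_clock = rq->clock;
	/* Discount whatever synchronise_niffies() already added. */
	ndiff -= (int64_t)(rq->niffies - rq->last_niffy);
	if (ndiff > 0)
		rq->niffies += ndiff;
	rq->last_niffy = rq->niffies;
}

static void synchronise_niffies(struct toy_rq *rq1, struct toy_rq *rq2)
{
	if (rq1->niffies > rq2->niffies)
		rq2->niffies = rq1->niffies;
	else
		rq1->niffies = rq2->niffies;
}

int main(void)
{
	struct toy_rq busy = { 0 }, idle = { 0 };

	update_clocks(&busy, 10000000);		/* busy CPU updated at 10ms */
	update_clocks(&idle, 1000000);		/* idle CPU last updated at 1ms */
	synchronise_niffies(&busy, &idle);	/* both rqs locked: idle jumps ahead */
	update_clocks(&idle, 10000000);		/* idle CPU finally updates locally */

	/* Without the last_niffy discount, idle.niffies would be ~19ms here. */
	printf("busy %llu ns, idle %llu ns\n",
	       (unsigned long long)busy.niffies,
	       (unsigned long long)idle.niffies);
	return 0;
}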
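
The update_load_avg() comment in the patch describes a rolling average with a time constant of roughly 32ms, noting that 80/128 ~ 0.63 and 80 / 32768 / 128 == 5 / 262144. The sketch below is only a userspace reproduction of that integer arithmetic, with arbitrary load values and SCHED_CAPACITY_SCALE hard-coded to the kernel's 1024; it confirms that a 32768us interval decays the old load by exactly 80/128.

/*
 * Userspace check of the decay arithmetic used by update_load_avg() in
 * this patch.  Values are illustrative only.
 */
#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024L

int main(void)
{
	long load = 1024;		/* arbitrary starting load_avg */
	long curload = 2;		/* pretend two tasks are runnable */
	long us_interval = 32768;	/* ~32ms since the last update */
	long decayed;

	/* Same integer arithmetic as the patch: decay the old value... */
	decayed = load - (load * us_interval * 5 / 262144);
	/* ...then add the contribution of the current load. */
	decayed += curload * curload * SCHED_CAPACITY_SCALE *
		   us_interval * 5 / 262144;

	/* 32768 * 5 / 262144 == 80/128, the ~0.63 noted in the comment. */
	printf("decay over 32ms: %ld/128 of the old load\n",
	       us_interval * 5 * 128 / 262144);
	printf("load_avg: %ld -> %ld\n", load, decayed);
	return 0;
}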
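
With iso_lock gone, the SCHED_ISO budget is tracked per runqueue by iso_tick()/no_iso_tick() and ISO_PERIOD drops to 5 * HZ. The toy program below mirrors those two functions in userspace to illustrate the hysteresis: the refractory flag trips once the decayed tick count exceeds sched_iso_cpu per cent of the period and clears again below roughly 90% of that (the 115/128 shift). HZ=100 and sched_iso_cpu=70 are assumptions made for the demonstration, not values taken from the patch.

/*
 * Userspace simulation of the per-runqueue SCHED_ISO accounting added by
 * this patch.  HZ and sched_iso_cpu are assumed values for illustration.
 */
#include <stdio.h>
#include <stdbool.h>

#define HZ		100
#define ISO_PERIOD	(5 * HZ)

static int sched_iso_cpu = 70;
static int iso_ticks;
static bool iso_refractory;

static void iso_tick(void)
{
	iso_ticks = iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD;
	iso_ticks += 100;
	if (iso_ticks > ISO_PERIOD * sched_iso_cpu) {
		iso_refractory = true;
		if (iso_ticks > ISO_PERIOD * 100)
			iso_ticks = ISO_PERIOD * 100;
	}
}

static void no_iso_tick(int ticks)
{
	if (iso_ticks > 0 || iso_refractory) {
		iso_ticks = iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD;
		/* 115/128 ~ 90/100, the cheap-shift hysteresis from the patch */
		if (iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) {
			iso_refractory = false;
			if (iso_ticks < 0)
				iso_ticks = 0;
		}
	}
}

int main(void)
{
	int t = 0;

	/* Saturate the CPU with ISO work until the refractory flag trips. */
	while (!iso_refractory) {
		iso_tick();
		t++;
	}
	printf("refractory after %d busy ticks (iso_ticks=%d)\n", t, iso_ticks);

	/* Then stay idle until the hysteresis point clears it again. */
	for (t = 0; iso_refractory; t++)
		no_iso_tick(1);
	printf("cleared after %d idle ticks (iso_ticks=%d)\n", t, iso_ticks);
	return 0;
}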