diff --git a/Documentation/scheduler/sched-MuQSS.txt b/Documentation/scheduler/sched-MuQSS.txt
index c0841e2..2521d1a 100644
--- a/Documentation/scheduler/sched-MuQSS.txt
+++ b/Documentation/scheduler/sched-MuQSS.txt
@@ -33,7 +33,7 @@ need to scan the entire runqueue looking for the process with the earliest
 deadline and schedule that next, regardless of which CPU it originally came
 from. This made BFS deterministic with respect to latency and provided
 guaranteed latencies dependent on number of processes and CPUs. The single
-runqueue, however, meant that all CPUs would complete for the single lock
+runqueue, however, meant that all CPUs would compete for the single lock
 protecting it, which would lead to increasing lock contention as the number
 of CPUs rose and appeared to limit scalability of common workloads beyond
 16 logical CPUs. Additionally, the O(n) lookup of the runqueue list obviously
diff --git a/include/linux/sched.h b/include/linux/sched.h
index de6a4e9..7b9e719 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1619,7 +1619,8 @@ struct task_struct {
 	cputime_t utime, stime, utimescaled, stimescaled;
 #ifdef CONFIG_SCHED_MUQSS
-	unsigned long utime_pc, stime_pc;
+	/* Unbanked cpu time */
+	unsigned long utime_ns, stime_ns;
 #endif
 	cputime_t gtime;
 	struct prev_cputime prev_cputime;
diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c
index 1700be5..99f02a4 100644
--- a/kernel/sched/MuQSS.c
+++ b/kernel/sched/MuQSS.c
@@ -121,10 +121,11 @@
  * Some helpers for converting to/from various scales. Use shifts to get
  * approximate multiples of ten for less overhead.
  */
-#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
-#define JIFFY_NS		(1000000000 / HZ)
-#define HALF_JIFFY_NS		(1000000000 / HZ / 2)
-#define HALF_JIFFY_US		(1000000 / HZ / 2)
+#define JIFFIES_TO_NS(TIME)	((TIME) * (1073741824 / HZ))
+#define JIFFY_NS		(1073741824 / HZ)
+#define NS_TO_JIFFIES(TIME)	((TIME) / JIFFY_NS)
+#define HALF_JIFFY_NS		(1073741824 / HZ / 2)
+#define HALF_JIFFY_US		(1048576 / HZ / 2)
 #define MS_TO_NS(TIME)		((TIME) << 20)
 #define MS_TO_US(TIME)		((TIME) << 10)
 #define NS_TO_MS(TIME)		((TIME) >> 20)
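The conversion macros above now use 2^30 in place of 10^9 (and 2^20/2^10 in the MS/US helpers), so all time units share one base-2 scale and every conversion stays a shift or a single divide. A minimal standalone sketch of what the approximation costs, assuming HZ = 250 purely for illustration:

#include <stdio.h>

#define HZ 250
#define JIFFY_NS_BINARY		(1073741824 / HZ)	/* 2^30 / HZ, as in the patch */
#define JIFFY_NS_DECIMAL	(1000000000 / HZ)	/* true nanoseconds */

int main(void)
{
	/* A "binary" jiffy is ~7.4% longer than a decimal one... */
	printf("binary=%d decimal=%d ratio=%.3f\n", JIFFY_NS_BINARY,
	       JIFFY_NS_DECIMAL, (double)JIFFY_NS_BINARY / JIFFY_NS_DECIMAL);
	return 0;
}

The ~7% stretch is harmless because every producer and consumer of these values uses the same scale; only a conversion to real wall time would need the exact decimal constant.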
@@ -134,7 +135,7 @@

 void print_scheduler_version(void)
 {
-	printk(KERN_INFO "MuQSS CPU scheduler v0.112 by Con Kolivas.\n");
+	printk(KERN_INFO "MuQSS CPU scheduler v0.114 by Con Kolivas.\n");
 }

 /*
@@ -175,10 +176,17 @@ static inline int timeslice(void)
  * variables and a cpu bitmap set atomically.
  */
 struct global_rq {
-	atomic_t nr_running;
-	atomic_t nr_uninterruptible;
-	atomic64_t nr_switches;
-	atomic_t qnr; /* queued not running */
+#ifdef CONFIG_SMP
+	atomic_t nr_running ____cacheline_aligned_in_smp;
+	atomic_t nr_uninterruptible ____cacheline_aligned_in_smp;
+	atomic64_t nr_switches ____cacheline_aligned_in_smp;
+	atomic_t qnr ____cacheline_aligned_in_smp; /* queued not running */
+#else
+	atomic_t nr_running ____cacheline_aligned;
+	atomic_t nr_uninterruptible ____cacheline_aligned;
+	atomic64_t nr_switches ____cacheline_aligned;
+	atomic_t qnr ____cacheline_aligned; /* queued not running */
+#endif
 #ifdef CONFIG_SMP
 	cpumask_t cpu_idle_map;
 #endif
@@ -217,7 +225,11 @@ static struct root_domain def_root_domain;
 #endif /* CONFIG_SMP */

 /* There can be only one */
-static struct global_rq grq;
+#ifdef CONFIG_SMP
+static struct global_rq grq ____cacheline_aligned_in_smp;
+#else
+static struct global_rq grq ____cacheline_aligned;
+#endif

 static DEFINE_MUTEX(sched_hotcpu_mutex);
@@ -252,27 +264,6 @@ int __weak arch_sd_sibling_asym_packing(void)
 struct rq *uprq;
 #endif /* CONFIG_SMP */

-/*
- * Sanity check should sched_clock return bogus values. We make sure it does
- * not appear to go backwards, and use jiffies to determine the maximum and
- * minimum it could possibly have increased, and round down to the nearest
- * jiffy when it falls outside this.
- */
-static inline void niffy_diff(s64 *niff_diff, int jiff_diff)
-{
-	unsigned long min_diff, max_diff;
-
-	if (jiff_diff > 1)
-		min_diff = JIFFIES_TO_NS(jiff_diff - 1);
-	else
-		min_diff = 1;
-	/* Round up to the nearest tick for maximum */
-	max_diff = JIFFIES_TO_NS(jiff_diff + 1);
-
-	if (unlikely(*niff_diff < min_diff || *niff_diff > max_diff))
-		*niff_diff = min_diff;
-}
-
 #ifdef CONFIG_SMP
 static inline int cpu_of(struct rq *rq)
 {
@@ -323,20 +314,26 @@ static inline void update_rq_clock(struct rq *rq)
  */
 static inline void update_clocks(struct rq *rq)
 {
-	s64 ndiff;
+	s64 ndiff, minndiff;
 	long jdiff;

 	update_rq_clock(rq);
 	ndiff = rq->clock - rq->old_clock;
-	if (unlikely(!ndiff))
-		return;
 	rq->old_clock = rq->clock;
-	ndiff -= rq->niffies - rq->last_niffy;
 	jdiff = jiffies - rq->last_jiffy;
-	niffy_diff(&ndiff, jdiff);
-	rq->last_jiffy += jdiff;
+
+	/* Subtract any niffies added by balancing with other rqs */
+	ndiff -= rq->niffies - rq->last_niffy;
+	minndiff = JIFFIES_TO_NS(jdiff) - rq->niffies + rq->last_jiffy_niffies;
+	if (minndiff < 0)
+		minndiff = 0;
+	ndiff = max(ndiff, minndiff);
 	rq->niffies += ndiff;
 	rq->last_niffy = rq->niffies;
+	if (jdiff) {
+		rq->last_jiffy += jdiff;
+		rq->last_jiffy_niffies = rq->niffies;
+	}
 }

 static inline int task_current(struct rq *rq, struct task_struct *p)
@@ -585,6 +582,121 @@ static inline void __task_rq_unlock(struct rq *rq)
 	rq_unlock(rq);
 }

+/*
+ * cmpxchg based fetch_or, macro so it works for different integer types
+ */
+#define fetch_or(ptr, mask)						\
+({									\
+	typeof(ptr) _ptr = (ptr);					\
+	typeof(mask) _mask = (mask);					\
+	typeof(*_ptr) _old, _val = *_ptr;				\
+									\
+	for (;;) {							\
+		_old = cmpxchg(_ptr, _val, _val | _mask);		\
+		if (_old == _val)					\
+			break;						\
+		_val = _old;						\
+	}								\
+	_old;								\
+})
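fetch_or() returns the value the flags word held before the OR, which is what lets a caller set TIF_NEED_RESCHED and learn in the same atomic step whether the target was polling. C11's atomic_fetch_or() has the same contract; a userspace sketch of the loop, not kernel code:

#include <stdatomic.h>
#include <stdio.h>

#define FLAG_NEED_RESCHED	0x1
#define FLAG_POLLING		0x2

static unsigned int fetch_or_sketch(atomic_uint *ptr, unsigned int mask)
{
	unsigned int val = atomic_load(ptr);

	/* Same shape as the macro above: retry the compare-and-swap until
	 * val | mask is installed, then hand back the pre-OR value. */
	while (!atomic_compare_exchange_weak(ptr, &val, val | mask))
		;
	return val;
}

int main(void)
{
	atomic_uint flags = FLAG_POLLING;
	unsigned int old = fetch_or_sketch(&flags, FLAG_NEED_RESCHED);

	/* old still shows POLLING, so no IPI would have been needed. */
	printf("old=%#x polling=%d\n", old, !!(old & FLAG_POLLING));
	return 0;
}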
+
+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
+/*
+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
+ * this avoids any races wrt polling state changes and thereby avoids
+ * spurious IPIs.
+ */
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+	struct thread_info *ti = task_thread_info(p);
+
+	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+}
+
+/*
+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
+ *
+ * If this returns true, then the idle task promises to call
+ * sched_ttwu_pending() and reschedule soon.
+ */
+static bool set_nr_if_polling(struct task_struct *p)
+{
+	struct thread_info *ti = task_thread_info(p);
+	typeof(ti->flags) old, val = READ_ONCE(ti->flags);
+
+	for (;;) {
+		if (!(val & _TIF_POLLING_NRFLAG))
+			return false;
+		if (val & _TIF_NEED_RESCHED)
+			return true;
+		old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
+		if (old == val)
+			break;
+		val = old;
+	}
+	return true;
+}
+
+#else
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+	set_tsk_need_resched(p);
+	return true;
+}
+
+#ifdef CONFIG_SMP
+static bool set_nr_if_polling(struct task_struct *p)
+{
+	return false;
+}
+#endif
+#endif
+
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+	struct wake_q_node *node = &task->wake_q;
+
+	/*
+	 * Atomically grab the task, if ->wake_q is !nil already it means
+	 * its already queued (either by us or someone else) and will get the
+	 * wakeup due to that.
+	 *
+	 * This cmpxchg() implies a full barrier, which pairs with the write
+	 * barrier implied by the wakeup in wake_up_q().
+	 */
+	if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
+		return;
+
+	get_task_struct(task);
+
+	/*
+	 * The head is context local, there can be no concurrency.
+	 */
+	*head->lastp = node;
+	head->lastp = &node->next;
+}
+
+void wake_up_q(struct wake_q_head *head)
+{
+	struct wake_q_node *node = head->first;
+
+	while (node != WAKE_Q_TAIL) {
+		struct task_struct *task;
+
+		task = container_of(node, struct task_struct, wake_q);
+		BUG_ON(!task);
+		/* task can safely be re-inserted now */
+		node = node->next;
+		task->wake_q.next = NULL;
+
+		/*
+		 * wake_up_process() implies a wmb() to pair with the queueing
+		 * in wake_q_add() so as not to miss wakeups.
+		 */
+		wake_up_process(task);
+		put_task_struct(task);
+	}
+}
+
 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 {
 	next->on_cpu = 1;
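set_nr_if_polling() is the waker-side half of the handshake: it only sets TIF_NEED_RESCHED while the target is still polling, and reports whether the caller may skip the IPI. A single-threaded sketch of the three outcomes (names illustrative, not kernel API; the real function does this with cmpxchg):

#include <stdbool.h>
#include <stdio.h>

#define NEED_RESCHED	0x1
#define POLLING		0x2

/* Returns true if no IPI is needed: the target polls its flags word
 * and will notice NEED_RESCHED itself (or it was already set). */
static bool set_nr_if_polling_sketch(unsigned int *flags)
{
	if (!(*flags & POLLING))
		return false;		/* not polling: caller must IPI */
	if (*flags & NEED_RESCHED)
		return true;		/* already flagged: nothing to do */
	*flags |= NEED_RESCHED;		/* the cmpxchg step, unraced here */
	return true;
}

int main(void)
{
	unsigned int a = POLLING, b = 0, c = POLLING | NEED_RESCHED;

	printf("polling idle: skip IPI=%d\n", set_nr_if_polling_sketch(&a));
	printf("running task: skip IPI=%d\n", set_nr_if_polling_sketch(&b));
	printf("already set:  skip IPI=%d\n", set_nr_if_polling_sketch(&c));
	return 0;
}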
@@ -608,15 +720,17 @@ void resched_task(struct task_struct *p)
 	if (test_tsk_need_resched(p))
 		return;

-	set_tsk_need_resched(p);
-
 	cpu = task_cpu(p);
 	if (cpu == smp_processor_id()) {
+		set_tsk_need_resched(p);
 		set_preempt_need_resched();
 		return;
 	}

-	smp_send_reschedule(cpu);
+	if (set_nr_and_not_polling(p))
+		smp_send_reschedule(cpu);
+	else
+		trace_sched_wake_idle_without_ipi(cpu);
 }

 /*
@@ -694,11 +808,6 @@ static inline bool deadline_before(u64 deadline, u64 time)
 	return (deadline < time);
 }

-static inline bool deadline_after(u64 deadline, u64 time)
-{
-	return (deadline > time);
-}
-
 /*
  * Deadline is "now" in niffies + (offset by priority). Setting the deadline
  * is the key to everything. It distributes cpu fairly amongst tasks of the
@@ -906,7 +1015,7 @@ static inline int queued_notrunning(void)
 /* Entered with rq locked */
 static inline void resched_if_idle(struct rq *rq)
 {
-	if (rq_idle(rq) && rq->online)
+	if (rq_idle(rq))
 		resched_task(rq->curr);
 }
@@ -1031,20 +1140,26 @@ static bool suitable_idle_cpus(struct task_struct *p)
  */
 static void resched_curr(struct rq *rq)
 {
+	int cpu;
+
 	if (test_tsk_need_resched(rq->curr))
 		return;

 	rq->preempt = rq->curr;
+	cpu = rq->cpu;
 	/* We're doing this without holding the rq lock if it's not task_rq */
-	set_tsk_need_resched(rq->curr);
-	if (rq_local(rq)) {
+	if (cpu == smp_processor_id()) {
+		set_tsk_need_resched(rq->curr);
 		set_preempt_need_resched();
 		return;
 	}

-	smp_send_reschedule(rq->cpu);
+	if (set_nr_and_not_polling(rq->curr))
+		smp_send_reschedule(cpu);
+	else
+		trace_sched_wake_idle_without_ipi(cpu);
 }

 #define CPUIDLE_DIFF_THREAD	(1)
@@ -1253,7 +1368,7 @@ static void activate_task(struct task_struct *p, struct rq *rq)
 	if (unlikely(prof_on == SLEEP_PROFILING)) {
 		if (p->state == TASK_UNINTERRUPTIBLE)
 			profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
-				     (rq->clock_task - p->last_ran) >> 20);
	}

 	p->prio = effective_prio(p);
@@ -1711,30 +1826,33 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
 	return ret;
 }

+#ifdef CONFIG_SMP
+static bool sched_smp_initialized __read_mostly;

-void wake_up_if_idle(int cpu)
+void sched_ttwu_pending(void)
 {
-	struct rq *rq = cpu_rq(cpu);
+	struct rq *rq = this_rq();
+	struct llist_node *llist = llist_del_all(&rq->wake_list);
+	struct task_struct *p;
 	unsigned long flags;

-	rcu_read_lock();
+	if (!llist)
+		return;

-	if (!is_idle_task(rcu_dereference(rq->curr)))
-		goto out;
+	raw_spin_lock_irqsave(&rq->lock, flags);

-	rq_lock_irqsave(rq, &flags);
-	if (likely(is_idle_task(rq->curr)))
-		smp_send_reschedule(cpu);
-	/* Else cpu is not in idle, do nothing here */
-	rq_unlock_irqrestore(rq, &flags);
+	while (llist) {
+		int wake_flags = 0;

-out:
-	rcu_read_unlock();
-}
+		p = llist_entry(llist, struct task_struct, wake_entry);
+		llist = llist_next(llist);

-static bool sched_smp_initialized __read_mostly;
+		ttwu_do_activate(rq, p, wake_flags);
+	}
+
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
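sched_ttwu_pending() drains a lock-free list that remote wakers push wakeups onto; only the drain takes the rq lock, and only the pusher who found the list empty needs to kick the target CPU. A userspace sketch of that llist_add()/llist_del_all() pattern (not kernel code):

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct node {
	struct node *next;
	int task_id;
};

static _Atomic(struct node *) wake_list;

/* llist_add() analogue: returns nonzero if the list was empty, i.e.
 * this waker is the one who must send the IPI. */
static int push(struct node *n)
{
	struct node *first = atomic_load(&wake_list);

	do {
		n->next = first;
	} while (!atomic_compare_exchange_weak(&wake_list, &first, n));
	return first == NULL;
}

/* llist_del_all() analogue: the IPI handler grabs the whole chain. */
static struct node *pop_all(void)
{
	return atomic_exchange(&wake_list, NULL);
}

int main(void)
{
	struct node a = { .task_id = 1 }, b = { .task_id = 2 };

	printf("kick cpu? %d\n", push(&a));	/* 1: list was empty */
	printf("kick cpu? %d\n", push(&b));	/* 0: kick already pending */
	for (struct node *n = pop_all(); n; n = n->next)
		printf("activate task %d\n", n->task_id);
	return 0;
}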

-#ifdef CONFIG_SMP
 void scheduler_ipi(void)
 {
 	/*
@@ -1744,13 +1862,63 @@ void scheduler_ipi(void)
 	 */
 	preempt_fold_need_resched();

-	if (!idle_cpu(smp_processor_id()) || need_resched())
+	if (llist_empty(&this_rq()->wake_list) && (!idle_cpu(smp_processor_id()) || need_resched()))
 		return;

+	/*
+	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
+	 * traditionally all their work was done from the interrupt return
+	 * path. Now that we actually do some work, we need to make sure
+	 * we do call them.
+	 *
+	 * Some archs already do call them, luckily irq_enter/exit nest
+	 * properly.
+	 *
+	 * Arguably we should visit all archs and update all handlers,
+	 * however a fair share of IPIs are still resched only so this would
+	 * somewhat pessimize the simple resched case.
+	 */
 	irq_enter();
+	sched_ttwu_pending();
 	irq_exit();
 }

+static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
+		if (!set_nr_if_polling(rq->idle))
+			smp_send_reschedule(cpu);
+		else
+			trace_sched_wake_idle_without_ipi(cpu);
+	}
+}
+
+void wake_up_if_idle(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
+
+	rcu_read_lock();
+
+	if (!is_idle_task(rcu_dereference(rq->curr)))
+		goto out;
+
+	if (set_nr_if_polling(rq->idle)) {
+		trace_sched_wake_idle_without_ipi(cpu);
+	} else {
+		rq_lock_irqsave(rq, &flags);
+		if (likely(is_idle_task(rq->curr)))
+			smp_send_reschedule(cpu);
+		/* Else cpu is not in idle, do nothing here */
+		rq_unlock_irqrestore(rq, &flags);
+	}
+
+out:
+	rcu_read_unlock();
+}
+
 static int valid_task_cpu(struct task_struct *p)
 {
 	cpumask_t valid_mask;
@@ -1761,8 +1929,9 @@ static int valid_task_cpu(struct task_struct *p)
 	cpumask_and(&valid_mask, tsk_cpus_allowed(p), cpu_active_mask);

 	if (unlikely(!cpumask_weight(&valid_mask))) {
+		/* Hotplug boot threads do this before the CPU is up */
 		WARN_ON(sched_smp_initialized);
-		return smp_processor_id();
+		return cpumask_any(tsk_cpus_allowed(p));
 	}
 	return cpumask_any(&valid_mask);
 }
@@ -1827,6 +1996,13 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
 {
 	struct rq *rq = cpu_rq(cpu);

+#if defined(CONFIG_SMP)
+	if (!cpus_share_cache(smp_processor_id(), cpu)) {
+		sched_clock_cpu(cpu); /* sync clocks x-cpu */
+		ttwu_queue_remote(p, cpu, wake_flags);
+		return;
+	}
+#endif
 	rq_lock(rq);
 	ttwu_do_activate(rq, p, wake_flags);
 	rq_unlock(rq);
@@ -2048,8 +2224,8 @@ int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p)
 	p->utimescaled =
 	p->stimescaled =
 	p->sched_time =
-	p->stime_pc =
-	p->utime_pc = 0;
+	p->stime_ns =
+	p->utime_ns = 0;
 	skiplist_node_init(&p->node);

 	/*
@@ -2803,7 +2979,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 		delta -= steal;
 	}
 #endif
-
 	rq->clock_task += delta;
 }
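ttwu_queue() above only takes the remote-queueing path when the waking and target CPUs do not share a cache; within one last-level cache the rq lock line is cheap to bounce, so same-cache wakeups still lock directly. A sketch of that gate, with llc_id as an illustrative stand-in for the topology data behind cpus_share_cache():

#include <stdbool.h>
#include <stdio.h>

static const int llc_id[4] = { 0, 0, 1, 1 };	/* two CPUs per cache */

static bool share_cache(int a, int b)
{
	return llc_id[a] == llc_id[b];
}

int main(void)
{
	int this_cpu = 0;

	for (int cpu = 0; cpu < 4; cpu++) {
		if (share_cache(this_cpu, cpu))
			/* Cheap: lock the rq and activate directly. */
			printf("cpu%d: rq_lock + ttwu_do_activate\n", cpu);
		else
			/* Crossing caches: push onto wake_list and let
			 * the target drain it from scheduler_ipi(). */
			printf("cpu%d: ttwu_queue_remote\n", cpu);
	}
	return 0;
}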

@@ -2895,86 +3070,89 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 }

 /*
- * On each tick, see what percentage of that tick was attributed to each
- * component and add the percentage to the _pc values. Once a _pc value has
- * accumulated one tick's worth, account for that. This means the total
- * percentage of load components will always be 128 (pseudo 100) per tick.
+ * On each tick, add the number of nanoseconds to the unbanked variables and
+ * once one tick's worth has accumulated, account it allowing for accurate
+ * sub-tick accounting and totals.
  */
-static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long pc)
+static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long ns)
 {
 	u64 *cpustat = kcpustat_this_cpu->cpustat;
+	unsigned long ticks;

 	if (atomic_read(&rq->nr_iowait) > 0) {
-		rq->iowait_pc += pc;
-		if (rq->iowait_pc >= 128) {
-			cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy * rq->iowait_pc / 128;
-			rq->iowait_pc %= 128;
+		rq->iowait_ns += ns;
+		if (rq->iowait_ns >= JIFFY_NS) {
+			ticks = NS_TO_JIFFIES(rq->iowait_ns);
+			cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy * ticks;
+			rq->iowait_ns %= JIFFY_NS;
 		}
 	} else {
-		rq->idle_pc += pc;
-		if (rq->idle_pc >= 128) {
-			cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy * rq->idle_pc / 128;
-			rq->idle_pc %= 128;
+		rq->idle_ns += ns;
+		if (rq->idle_ns >= JIFFY_NS) {
+			ticks = NS_TO_JIFFIES(rq->idle_ns);
+			cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy * ticks;
+			rq->idle_ns %= JIFFY_NS;
 		}
 	}
 	acct_update_integrals(idle);
 }

-static void
-pc_system_time(struct rq *rq, struct task_struct *p, int hardirq_offset,
-	       unsigned long pc, unsigned long ns)
+static void pc_system_time(struct rq *rq, struct task_struct *p,
+			   int hardirq_offset, unsigned long ns)
 {
-	u64 *cpustat = kcpustat_this_cpu->cpustat;
 	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+	unsigned long ticks;

-	p->stime_pc += pc;
-	if (p->stime_pc >= 128) {
-		int jiffs = p->stime_pc / 128;
-
-		p->stime_pc %= 128;
-		p->stime += (__force u64)cputime_one_jiffy * jiffs;
-		p->stimescaled += one_jiffy_scaled * jiffs;
-		account_group_system_time(p, cputime_one_jiffy * jiffs);
+	p->stime_ns += ns;
+	if (p->stime_ns >= JIFFY_NS) {
+		ticks = NS_TO_JIFFIES(p->stime_ns);
+		p->stime_ns %= JIFFY_NS;
+		p->stime += (__force u64)cputime_one_jiffy * ticks;
+		p->stimescaled += one_jiffy_scaled * ticks;
+		account_group_system_time(p, cputime_one_jiffy * ticks);
 	}
 	p->sched_time += ns;
 	account_group_exec_runtime(p, ns);

 	if (hardirq_count() - hardirq_offset) {
-		rq->irq_pc += pc;
-		if (rq->irq_pc >= 128) {
-			cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy * rq->irq_pc / 128;
-			rq->irq_pc %= 128;
+		rq->irq_ns += ns;
+		if (rq->irq_ns >= JIFFY_NS) {
+			ticks = NS_TO_JIFFIES(rq->irq_ns);
+			cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy * ticks;
+			rq->irq_ns %= JIFFY_NS;
 		}
 	} else if (in_serving_softirq()) {
-		rq->softirq_pc += pc;
-		if (rq->softirq_pc >= 128) {
-			cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128;
-			rq->softirq_pc %= 128;
+		rq->softirq_ns += ns;
+		if (rq->softirq_ns >= JIFFY_NS) {
+			ticks = NS_TO_JIFFIES(rq->softirq_ns);
+			cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * ticks;
+			rq->softirq_ns %= JIFFY_NS;
 		}
 	} else {
-		rq->system_pc += pc;
-		if (rq->system_pc >= 128) {
-			cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy * rq->system_pc / 128;
-			rq->system_pc %= 128;
+		rq->system_ns += ns;
+		if (rq->system_ns >= JIFFY_NS) {
+			ticks = NS_TO_JIFFIES(rq->system_ns);
+			cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy * ticks;
+			rq->system_ns %= JIFFY_NS;
 		}
 	}
 	acct_update_integrals(p);
 }

-static void pc_user_time(struct rq *rq, struct task_struct *p,
-			 unsigned long pc, unsigned long ns)
+static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns)
 {
-	u64 *cpustat = kcpustat_this_cpu->cpustat;
 	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+	unsigned long ticks;

-	p->utime_pc += pc;
-	if (p->utime_pc >= 128) {
-		int jiffs = p->utime_pc / 128;
-
-		p->utime_pc %= 128;
-		p->utime += (__force u64)cputime_one_jiffy * jiffs;
-		p->utimescaled += one_jiffy_scaled * jiffs;
-		account_group_user_time(p, cputime_one_jiffy * jiffs);
+	p->utime_ns += ns;
+	if (p->utime_ns >= JIFFY_NS) {
+		ticks = NS_TO_JIFFIES(p->utime_ns);
+		p->utime_ns %= JIFFY_NS;
+		p->utime += (__force u64)cputime_one_jiffy * ticks;
+		p->utimescaled += one_jiffy_scaled * ticks;
+		account_group_user_time(p, cputime_one_jiffy * ticks);
 	}
 	p->sched_time += ns;
 	account_group_exec_runtime(p, ns);
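All the pc_*_time() helpers now share one accumulate-and-carry pattern: add raw nanoseconds to an unbanked counter and, once at least a jiffy's worth has built up, bank whole ticks and keep the remainder. A self-contained sketch of the pattern, assuming HZ = 250 for illustration:

#include <stdio.h>

#define HZ		250
#define JIFFY_NS	(1073741824 / HZ)
#define NS_TO_JIFFIES(t) ((t) / JIFFY_NS)

static unsigned long utime_ns;		/* unbanked remainder */
static unsigned long utime_ticks;	/* banked whole ticks */

static void account_user(unsigned long ns)
{
	utime_ns += ns;
	if (utime_ns >= JIFFY_NS) {
		utime_ticks += NS_TO_JIFFIES(utime_ns);
		utime_ns %= JIFFY_NS;	/* carry the sub-tick remainder */
	}
}

int main(void)
{
	/* Runs well under a tick each still bank a full tick once their
	 * sum crosses JIFFY_NS: no cpu time is lost or invented. */
	account_user(JIFFY_NS / 2);
	account_user(JIFFY_NS / 2);
	account_user(JIFFY_NS / 2);
	printf("ticks=%lu remainder=%lu\n", utime_ticks, utime_ns);
	return 0;
}

Unlike the old 128-based pseudo-percent scheme, nothing is quantised to tick fractions on entry, so sub-tick runs accumulate exactly.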

@@ -2984,36 +3162,33 @@ static void pc_user_time(struct rq *rq, struct task_struct *p,
 		 * ksoftirqd time do not get accounted in cpu_softirq_time.
 		 * So, we have to handle it separately here.
 		 */
-		rq->softirq_pc += pc;
-		if (rq->softirq_pc >= 128) {
-			cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128;
-			rq->softirq_pc %= 128;
+		rq->softirq_ns += ns;
+		if (rq->softirq_ns >= JIFFY_NS) {
+			ticks = NS_TO_JIFFIES(rq->softirq_ns);
+			cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * ticks;
+			rq->softirq_ns %= JIFFY_NS;
 		}
 	}

 	if (task_nice(p) > 0 || idleprio_task(p)) {
-		rq->nice_pc += pc;
-		if (rq->nice_pc >= 128) {
-			cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy * rq->nice_pc / 128;
-			rq->nice_pc %= 128;
+		rq->nice_ns += ns;
+		if (rq->nice_ns >= JIFFY_NS) {
+			ticks = NS_TO_JIFFIES(rq->nice_ns);
+			cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy * ticks;
+			rq->nice_ns %= JIFFY_NS;
 		}
 	} else {
-		rq->user_pc += pc;
-		if (rq->user_pc >= 128) {
-			cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy * rq->user_pc / 128;
-			rq->user_pc %= 128;
+		rq->user_ns += ns;
+		if (rq->user_ns >= JIFFY_NS) {
+			ticks = NS_TO_JIFFIES(rq->user_ns);
+			cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy * ticks;
+			rq->user_ns %= JIFFY_NS;
 		}
 	}
 	acct_update_integrals(p);
 }

 /*
- * Convert nanoseconds to pseudo percentage of one tick. Use 128 for fast
- * shifts instead of 100
- */
-#define NS_TO_PC(NS) (NS * 128 / JIFFY_NS)
-
-/*
  * This is called on clock ticks.
  * Bank in p->sched_time the ns elapsed since the last tick or switch.
  * CPU scheduler quota accounting is also performed here in microseconds.
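As the comment above says, time_slice itself is kept in (binary) microseconds: NS_TO_US() is a plain shift by 10, and a microsecond count keeps many seconds of slice inside a 32-bit int. A quick sketch of why the nanosecond scale would not fit, using only the patch's own macros:

#include <limits.h>
#include <stdio.h>

#define NS_TO_US(t)	((t) >> 10)	/* binary microseconds, as above */

int main(void)
{
	long long two_sec_ns = 2 * 1073741824LL;	/* 2 * 2^30 binary ns */

	/* Two seconds of nanoseconds already exceeds INT_MAX, while the
	 * same span in binary microseconds fits for about 35 minutes. */
	printf("2s in ns: %lld (INT_MAX=%d)\n", two_sec_ns, INT_MAX);
	printf("2s in us: %lld\n", NS_TO_US(two_sec_ns));
	return 0;
}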
@@ -3021,38 +3196,29 @@ static void pc_user_time(struct rq *rq, struct task_struct *p,
 static void update_cpu_clock_tick(struct rq *rq, struct task_struct *p)
 {
-	long account_ns = rq->clock_task - p->last_ran;
+	s64 account_ns = rq->niffies - p->last_ran;
 	struct task_struct *idle = rq->idle;
-	unsigned long account_pc;

-	if (unlikely(account_ns < 0) || steal_account_process_tick())
+	if (steal_account_process_tick())
 		goto ts_account;

-	account_pc = NS_TO_PC(account_ns);
-
 	/* Accurate tick timekeeping */
 	if (user_mode(get_irq_regs()))
-		pc_user_time(rq, p, account_pc, account_ns);
-	else if (p != idle || (irq_count() != HARDIRQ_OFFSET))
-		pc_system_time(rq, p, HARDIRQ_OFFSET,
-			       account_pc, account_ns);
-	else
-		pc_idle_time(rq, idle, account_pc);
+		pc_user_time(rq, p, account_ns);
+	else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) {
+		pc_system_time(rq, p, HARDIRQ_OFFSET, account_ns);
+	} else
+		pc_idle_time(rq, idle, account_ns);

 	if (sched_clock_irqtime)
 		irqtime_account_hi_si();

ts_account:
 	/* time_slice accounting is done in usecs to avoid overflow on 32bit */
-	if (p->policy != SCHED_FIFO && p != idle) {
-		s64 time_diff = rq->clock - rq->timekeep_clock;
-
-		niffy_diff(&time_diff, 1);
-		p->time_slice -= NS_TO_US(time_diff);
-	}
+	if (p->policy != SCHED_FIFO && p != idle)
+		p->time_slice -= NS_TO_US(account_ns);

-	p->last_ran = rq->clock_task;
-	rq->timekeep_clock = rq->clock;
+	p->last_ran = rq->niffies;
 }

 /*
@@ -3063,33 +3229,18 @@ static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p)
 {
-	long account_ns = rq->clock_task - p->last_ran;
+	s64 account_ns = rq->niffies - p->last_ran;
 	struct task_struct *idle = rq->idle;
-	unsigned long account_pc;
-
-	if (unlikely(account_ns < 0))
-		goto ts_account;
-
-	account_pc = NS_TO_PC(account_ns);

 	/* Accurate subtick timekeeping */
-	if (p != idle) {
-		pc_user_time(rq, p, account_pc, account_ns);
-	}
+	if (p != idle)
+		pc_user_time(rq, p, account_ns);
 	else
-		pc_idle_time(rq, idle, account_pc);
+		pc_idle_time(rq, idle, account_ns);

-ts_account:
 	/* time_slice accounting is done in usecs to avoid overflow on 32bit */
-	if (p->policy != SCHED_FIFO && p != idle) {
-		s64 time_diff = rq->clock - rq->timekeep_clock;
-
-		niffy_diff(&time_diff, 1);
-		p->time_slice -= NS_TO_US(time_diff);
-	}
-
-	p->last_ran = rq->clock_task;
-	rq->timekeep_clock = rq->clock;
+	if (p->policy != SCHED_FIFO && p != idle)
+		p->time_slice -= NS_TO_US(account_ns);
 }

 /*
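Both the tick path and the switch path above bank rq->niffies - p->last_ran and then advance last_ran (the switch path now gets its value from __schedule(), further down), so every interval is consumed exactly once. A sketch of why the two paths cannot double-count:

#include <stdio.h>

static unsigned long long niffies;	/* monotonic rq clock */
static unsigned long long last_ran;
static unsigned long long banked;

static void bank(const char *who)
{
	unsigned long long delta = niffies - last_ran;

	banked += delta;
	last_ran = niffies;	/* the next bank starts where this ended */
	printf("%s banked %llu (total %llu)\n", who, delta, banked);
}

int main(void)
{
	niffies = 1000; bank("tick");	/* accounts [0, 1000) */
	niffies = 1400; bank("switch");	/* accounts [1000, 1400) */
	niffies = 2400; bank("tick");	/* accounts [1400, 2400) */
	return 0;
}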
@@ -3108,10 +3259,8 @@ static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
 	 * thread, breaking clock_gettime().
 	 */
 	if (p == rq->curr && task_on_rq_queued(p)) {
-		update_rq_clock(rq);
-		ns = rq->clock_task - p->last_ran;
-		if (unlikely((s64)ns < 0))
-			ns = 0;
+		update_clocks(rq);
+		ns = rq->niffies - p->last_ran;
 	}

 	return ns;
@@ -3657,7 +3806,6 @@ static inline void schedule_debug(struct task_struct *prev)
 static inline void set_rq_task(struct rq *rq, struct task_struct *p)
 {
 	rq->rq_deadline = p->deadline;
-	p->last_ran = rq->clock_task;
 	rq->rq_prio = p->prio;
 #ifdef CONFIG_SMT_NICE
 	rq->rq_mm = p->mm;
@@ -3665,15 +3813,6 @@ static inline void set_rq_task(struct rq *rq, struct task_struct *p)
 #endif
 }

-static void reset_rq_task(struct rq *rq, struct task_struct *p)
-{
-	rq->rq_deadline = p->deadline;
-	rq->rq_prio = p->prio;
-#ifdef CONFIG_SMT_NICE
-	rq->rq_smt_bias = p->smt_bias;
-#endif
-}
-
 #ifdef CONFIG_SMT_NICE
 static void check_no_siblings(struct rq __maybe_unused *this_rq) {}
 static void wake_no_siblings(struct rq __maybe_unused *this_rq) {}
@@ -3774,6 +3913,7 @@ static void __sched notrace __schedule(bool preempt)
 	unsigned long *switch_count;
 	bool deactivate = false;
 	struct rq *rq;
+	u64 niffies;
 	int cpu;

 	cpu = smp_processor_id();
@@ -3851,7 +3991,13 @@ static void __sched notrace __schedule(bool preempt)
 			switch_count = &prev->nvcsw;
 	}

+	/*
+	 * Store the niffy value here for use by the next task's last_ran
+	 * below to avoid losing niffies due to update_clocks being called
+	 * again after this point.
+	 */
 	update_clocks(rq);
+	niffies = rq->niffies;
 	update_cpu_clock_switch(rq, prev);
 	if (rq->clock - rq->last_tick > HALF_JIFFY_NS)
 		rq->dither = 0;
@@ -3881,13 +4027,15 @@ static void __sched notrace __schedule(bool preempt)
 		}
 	}

+	set_rq_task(rq, next);
+	next->last_ran = niffies;
+
 	if (likely(prev != next)) {
 		/*
 		 * Don't reschedule an idle task or deactivated tasks
 		 */
 		if (prev != idle && !deactivate)
 			resched_suitable_idle(prev);
-		set_rq_task(rq, next);
 		if (next != idle)
 			check_siblings(rq);
 		else
@@ -4203,7 +4351,7 @@ void set_user_nice(struct task_struct *p, long nice)
 		if (new_static < old_static)
 			try_preempt(p, rq);
 	} else if (task_running(rq, p)) {
-		reset_rq_task(rq, p);
+		set_rq_task(rq, p);
 		if (old_static < new_static)
 			resched_task(p);
 	}
@@ -4350,7 +4498,7 @@ static void __setscheduler(struct task_struct *p, struct rq *rq, int policy,
 	p->prio = p->normal_prio;

 	if (task_running(rq, p)) {
-		reset_rq_task(rq, p);
+		set_rq_task(rq, p);
 		resched_task(p);
 	} else if (task_queued(p)) {
 		dequeue_task(rq, p, DEQUEUE_SAVE);
@@ -5497,7 +5645,7 @@ void init_idle(struct task_struct *idle, int cpu)
 	raw_spin_lock_irqsave(&idle->pi_lock, flags);
 	raw_spin_lock(&rq->lock);

-	idle->last_ran = rq->clock_task;
+	idle->last_ran = rq->niffies;
 	idle->state = TASK_RUNNING;
 	/* Setting prio to illegal value shouldn't matter when never queued */
 	idle->prio = PRIO_LIMIT;
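The wake_q helpers removed below are the ones relocated earlier in this file; their contract is unchanged. For reference, the intended calling pattern is to collect wakeups under a lock and issue them after dropping it, roughly as in this kernel-context sketch (my_lock, waiters and wait_node are illustrative, not kernel symbols):

static DEFINE_SPINLOCK(my_lock);
static LIST_HEAD(waiters);

struct waiter {
	struct list_head wait_node;
	struct task_struct *task;
};

static void wake_all_waiters(void)
{
	WAKE_Q(wake_q);
	struct waiter *w;

	spin_lock(&my_lock);
	list_for_each_entry(w, &waiters, wait_node)
		wake_q_add(&wake_q, w->task);	/* refs taken, no wakeup yet */
	spin_unlock(&my_lock);

	wake_up_q(&wake_q);	/* wake each task, drop the references */
}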
@@ -5563,52 +5711,6 @@ int task_can_attach(struct task_struct *p,
 	return ret;
 }

-void wake_q_add(struct wake_q_head *head, struct task_struct *task)
-{
-	struct wake_q_node *node = &task->wake_q;
-
-	/*
-	 * Atomically grab the task, if ->wake_q is !nil already it means
-	 * its already queued (either by us or someone else) and will get the
-	 * wakeup due to that.
-	 *
-	 * This cmpxchg() implies a full barrier, which pairs with the write
-	 * barrier implied by the wakeup in wake_up_q().
-	 */
-	if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
-		return;
-
-	get_task_struct(task);
-
-	/*
-	 * The head is context local, there can be no concurrency.
-	 */
-	*head->lastp = node;
-	head->lastp = &node->next;
-}
-
-void wake_up_q(struct wake_q_head *head)
-{
-	struct wake_q_node *node = head->first;
-
-	while (node != WAKE_Q_TAIL) {
-		struct task_struct *task;
-
-		task = container_of(node, struct task_struct, wake_q);
-		BUG_ON(!task);
-		/* task can safely be re-inserted now */
-		node = node->next;
-		task->wake_q.next = NULL;
-
-		/*
-		 * wake_up_process() implies a wmb() to pair with the queueing
-		 * in wake_q_add() so as not to miss wakeups.
-		 */
-		wake_up_process(task);
-		put_task_struct(task);
-	}
-}
-
 void resched_cpu(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -5719,8 +5821,10 @@ void wake_up_idle_cpu(int cpu)
 	if (cpu == smp_processor_id())
 		return;

-	set_tsk_need_resched(cpu_rq(cpu)->idle);
-	smp_send_reschedule(cpu);
+	if (set_nr_and_not_polling(cpu_rq(cpu)->idle))
+		smp_send_reschedule(cpu);
+	else
+		trace_sched_wake_idle_without_ipi(cpu);
 }

 void wake_up_nohz_cpu(int cpu)
@@ -7581,10 +7685,10 @@ void __init sched_init(void)
 		skiplist_init(&rq->node);
 		rq->sl = new_skiplist(&rq->node);
 		raw_spin_lock_init(&rq->lock);
-		rq->niffies = 0;
+		rq->clock = rq->old_clock = rq->last_niffy = rq->niffies = 0;
 		rq->last_jiffy = jiffies;
-		rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc =
-			      rq->iowait_pc = rq->idle_pc = 0;
+		rq->user_ns = rq->nice_ns = rq->softirq_ns = rq->system_ns =
+			      rq->iowait_ns = rq->idle_ns = 0;
 		rq->dither = 0;
 		set_rq_task(rq, &init_task);
 		rq->iso_ticks = 0;
@@ -7806,6 +7910,25 @@ void set_curr_task(int cpu, struct task_struct *p)
 /*
  * Use precise platform statistics if available:
  */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+
+#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
+void vtime_common_task_switch(struct task_struct *prev)
+{
+	if (is_idle_task(prev))
+		vtime_account_idle(prev);
+	else
+		vtime_account_system(prev);
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+	vtime_account_user(prev);
+#endif
+	arch_vtime_task_switch(prev);
+}
+#endif
+
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
@@ -7833,6 +7956,26 @@ void vtime_account_system_irqsafe(struct task_struct *tsk)
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
+
+/*
+ * Archs that account the whole time spent in the idle task
+ * (outside irq) as idle time can rely on this and just implement
+ * vtime_account_system() and vtime_account_idle(). Archs that
+ * have other meaning of the idle time (s390 only includes the
+ * time spent by the CPU when it's in low power mode) must override
+ * vtime_account().
+ */
+#ifndef __ARCH_HAS_VTIME_ACCOUNT
+void vtime_account_irq_enter(struct task_struct *tsk)
+{
+	if (!in_interrupt() && is_idle_task(tsk))
+		vtime_account_idle(tsk);
+	else
+		vtime_account_system(tsk);
+}
+EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
+#endif /* __ARCH_HAS_VTIME_ACCOUNT */
+
 #else /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 /*
  * Perform (stime * rtime) / total, but avoid multiplication overflow by
diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h
index f8d0d58..4e3115d 100644
--- a/kernel/sched/MuQSS.h
+++ b/kernel/sched/MuQSS.h
@@ -31,6 +31,7 @@ struct rq {
 	unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */
 	u64 niffies; /* Last time this RQ updated rq clock */
 	u64 last_niffy; /* Last niffies as updated by local clock */
+	u64 last_jiffy_niffies; /* Niffies @ last_jiffy */

 	u64 load_update; /* When we last updated load */
 	unsigned long load_avg; /* Rolling load average */
@@ -39,9 +40,8 @@ struct rq {
 	int rq_smt_bias; /* Policy/nice level bias across smt siblings */
 #endif
 	/* Accurate timekeeping data */
-	u64 timekeep_clock;
-	unsigned long user_pc, nice_pc, irq_pc, softirq_pc, system_pc,
-		      iowait_pc, idle_pc;
+	unsigned long user_ns, nice_ns, irq_ns, softirq_ns, system_ns,
+		      iowait_ns, idle_ns;
 	atomic_t nr_iowait;

 	skiplist_node node;
@@ -104,6 +104,11 @@ struct rq {
 	unsigned int ttwu_count;
 	unsigned int ttwu_local;
 #endif /* CONFIG_SCHEDSTATS */
+
+#ifdef CONFIG_SMP
+	struct llist_head wake_list;
+#endif
+
 #ifdef CONFIG_CPU_IDLE
 	/* Must be inspected within a rcu lock section */
 	struct cpuidle_state *idle_state;
@@ -208,12 +213,11 @@ static inline void unregister_sched_domain_sysctl(void)
 }
 #endif

-static inline void sched_ttwu_pending(void) { }
-
 #ifdef CONFIG_SMP
-
+extern void sched_ttwu_pending(void);
 extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
-
+#else
+static inline void sched_ttwu_pending(void) { }
 #endif

 #ifdef CONFIG_CPU_IDLE