From ae81014a29538e34d2acc2d70c83b89ca255b01b Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Fri, 21 Oct 2016 10:49:23 +1100 Subject: [PATCH 73/80] Convert sub-tick accounting to nanosecond precision to cope with tasks that run for less than 100us. --- include/linux/sched.h | 3 +- kernel/sched/MuQSS.c | 166 ++++++++++++++++++++++++-------------------------- kernel/sched/MuQSS.h | 4 +- 3 files changed, 84 insertions(+), 89 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index de6a4e9..7b9e719 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1619,7 +1619,8 @@ struct task_struct { cputime_t utime, stime, utimescaled, stimescaled; #ifdef CONFIG_SCHED_MUQSS - unsigned long utime_pc, stime_pc; + /* Unbanked cpu time */ + unsigned long utime_ns, stime_ns; #endif cputime_t gtime; struct prev_cputime prev_cputime; diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c index 1403437..eb28c35 100644 --- a/kernel/sched/MuQSS.c +++ b/kernel/sched/MuQSS.c @@ -121,10 +121,11 @@ * Some helpers for converting to/from various scales. Use shifts to get * approximate multiples of ten for less overhead. */ -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) -#define JIFFY_NS (1000000000 / HZ) -#define HALF_JIFFY_NS (1000000000 / HZ / 2) -#define HALF_JIFFY_US (1000000 / HZ / 2) +#define JIFFIES_TO_NS(TIME) ((TIME) * (1073741824 / HZ)) +#define JIFFY_NS (1073741824 / HZ) +#define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS) +#define HALF_JIFFY_NS (1073741824 / HZ / 2) +#define HALF_JIFFY_US (1048576 / HZ / 2) #define MS_TO_NS(TIME) ((TIME) << 20) #define MS_TO_US(TIME) ((TIME) << 10) #define NS_TO_MS(TIME) ((TIME) >> 20) @@ -2223,8 +2224,8 @@ int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) p->utimescaled = p->stimescaled = p->sched_time = - p->stime_pc = - p->utime_pc = 0; + p->stime_ns = + p->utime_ns = 0; skiplist_node_init(&p->node); /* @@ -3069,86 +3070,89 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) } /* - * On each tick, see what percentage of that tick was attributed to each - * component and add the percentage to the _pc values. Once a _pc value has - * accumulated one tick's worth, account for that. This means the total - * percentage of load components will always be 128 (pseudo 100) per tick. + * On each tick, add the number of nanoseconds to the unbanked variables and + * once one tick's worth has accumulated, account it allowing for accurate + * sub-tick accounting and totals. 
*/ -static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long pc) +static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long ns) { u64 *cpustat = kcpustat_this_cpu->cpustat; + unsigned long ticks; if (atomic_read(&rq->nr_iowait) > 0) { - rq->iowait_pc += pc; - if (rq->iowait_pc >= 128) { - cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy * rq->iowait_pc / 128; - rq->iowait_pc %= 128; + rq->iowait_ns += ns; + if (rq->iowait_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->iowait_ns); + cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy * ticks; + rq->iowait_ns %= JIFFY_NS; } } else { - rq->idle_pc += pc; - if (rq->idle_pc >= 128) { - cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy * rq->idle_pc / 128; - rq->idle_pc %= 128; + rq->idle_ns += ns; + if (rq->idle_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->idle_ns); + cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy * ticks; + rq->idle_ns %= JIFFY_NS; } } acct_update_integrals(idle); } -static void -pc_system_time(struct rq *rq, struct task_struct *p, int hardirq_offset, - unsigned long pc, unsigned long ns) +static void pc_system_time(struct rq *rq, struct task_struct *p, + int hardirq_offset, unsigned long ns) { - u64 *cpustat = kcpustat_this_cpu->cpustat; cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + u64 *cpustat = kcpustat_this_cpu->cpustat; + unsigned long ticks; - p->stime_pc += pc; - if (p->stime_pc >= 128) { - int jiffs = p->stime_pc / 128; - - p->stime_pc %= 128; - p->stime += (__force u64)cputime_one_jiffy * jiffs; - p->stimescaled += one_jiffy_scaled * jiffs; - account_group_system_time(p, cputime_one_jiffy * jiffs); + p->stime_ns += ns; + if (p->stime_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(p->stime_ns); + p->stime_ns %= JIFFY_NS; + p->stime += (__force u64)cputime_one_jiffy * ticks; + p->stimescaled += one_jiffy_scaled * ticks; + account_group_system_time(p, cputime_one_jiffy * ticks); } p->sched_time += ns; account_group_exec_runtime(p, ns); if (hardirq_count() - hardirq_offset) { - rq->irq_pc += pc; - if (rq->irq_pc >= 128) { - cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy * rq->irq_pc / 128; - rq->irq_pc %= 128; + rq->irq_ns += ns; + if (rq->irq_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->irq_ns); + cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy * ticks; + rq->irq_ns %= JIFFY_NS; } } else if (in_serving_softirq()) { - rq->softirq_pc += pc; - if (rq->softirq_pc >= 128) { - cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128; - rq->softirq_pc %= 128; + rq->softirq_ns += ns; + if (rq->softirq_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->softirq_ns); + cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * ticks; + rq->softirq_ns %= JIFFY_NS; } } else { - rq->system_pc += pc; - if (rq->system_pc >= 128) { - cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy * rq->system_pc / 128; - rq->system_pc %= 128; + rq->system_ns += ns; + if (rq->system_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->system_ns); + cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy * ticks; + rq->system_ns %= JIFFY_NS; } } acct_update_integrals(p); } -static void pc_user_time(struct rq *rq, struct task_struct *p, - unsigned long pc, unsigned long ns) +static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns) { - u64 *cpustat = kcpustat_this_cpu->cpustat; cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + u64 *cpustat = kcpustat_this_cpu->cpustat; + unsigned 
long ticks; - p->utime_pc += pc; - if (p->utime_pc >= 128) { - int jiffs = p->utime_pc / 128; - - p->utime_pc %= 128; - p->utime += (__force u64)cputime_one_jiffy * jiffs; - p->utimescaled += one_jiffy_scaled * jiffs; - account_group_user_time(p, cputime_one_jiffy * jiffs); + p->utime_ns += ns; + if (p->utime_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(p->utime_ns); + p->utime_ns %= JIFFY_NS; + p->utime += (__force u64)cputime_one_jiffy * ticks; + p->utimescaled += one_jiffy_scaled * ticks; + account_group_user_time(p, cputime_one_jiffy * ticks); } p->sched_time += ns; account_group_exec_runtime(p, ns); @@ -3158,36 +3162,33 @@ static void pc_user_time(struct rq *rq, struct task_struct *p, * ksoftirqd time do not get accounted in cpu_softirq_time. * So, we have to handle it separately here. */ - rq->softirq_pc += pc; - if (rq->softirq_pc >= 128) { - cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128; - rq->softirq_pc %= 128; + rq->softirq_ns += ns; + if (rq->softirq_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->softirq_ns); + cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * ticks; + rq->softirq_ns %= JIFFY_NS; } } if (task_nice(p) > 0 || idleprio_task(p)) { - rq->nice_pc += pc; - if (rq->nice_pc >= 128) { - cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy * rq->nice_pc / 128; - rq->nice_pc %= 128; + rq->nice_ns += ns; + if (rq->nice_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->nice_ns); + cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy * ticks; + rq->nice_ns %= JIFFY_NS; } } else { - rq->user_pc += pc; - if (rq->user_pc >= 128) { - cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy * rq->user_pc / 128; - rq->user_pc %= 128; + rq->user_ns += ns; + if (rq->user_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->user_ns); + cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy * ticks; + rq->user_ns %= JIFFY_NS; } } acct_update_integrals(p); } /* - * Convert nanoseconds to pseudo percentage of one tick. Use 128 for fast - * shifts instead of 100 - */ -#define NS_TO_PC(NS) (NS * 128 / JIFFY_NS) - -/* * This is called on clock ticks. * Bank in p->sched_time the ns elapsed since the last tick or switch. * CPU scheduler quota accounting is also performed here in microseconds. 
@@ -3197,21 +3198,17 @@ update_cpu_clock_tick(struct rq *rq, struct task_struct *p) { s64 account_ns = rq->niffies - p->last_ran; struct task_struct *idle = rq->idle; - unsigned long account_pc; if (steal_account_process_tick()) goto ts_account; - account_pc = NS_TO_PC(account_ns); - /* Accurate tick timekeeping */ if (user_mode(get_irq_regs())) - pc_user_time(rq, p, account_pc, account_ns); + pc_user_time(rq, p, account_ns); else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) { - pc_system_time(rq, p, HARDIRQ_OFFSET, - account_pc, account_ns); + pc_system_time(rq, p, HARDIRQ_OFFSET, account_ns); } else - pc_idle_time(rq, idle, account_pc); + pc_idle_time(rq, idle, account_ns); if (sched_clock_irqtime) irqtime_account_hi_si(); @@ -3234,15 +3231,12 @@ update_cpu_clock_switch(struct rq *rq, struct task_struct *p) { s64 account_ns = rq->niffies - p->last_ran; struct task_struct *idle = rq->idle; - unsigned long account_pc; - - account_pc = NS_TO_PC(account_ns); /* Accurate subtick timekeeping */ if (p != idle) - pc_user_time(rq, p, account_pc, account_ns); + pc_user_time(rq, p, account_ns); else - pc_idle_time(rq, idle, account_pc); + pc_idle_time(rq, idle, account_ns); /* time_slice accounting is done in usecs to avoid overflow on 32bit */ if (p->policy != SCHED_FIFO && p != idle) @@ -7693,8 +7687,8 @@ void __init sched_init(void) raw_spin_lock_init(&rq->lock); rq->clock = rq->old_clock = rq->last_niffy = rq->niffies = 0; rq->last_jiffy = jiffies; - rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc = - rq->iowait_pc = rq->idle_pc = 0; + rq->user_ns = rq->nice_ns = rq->softirq_ns = rq->system_ns = + rq->iowait_ns = rq->idle_ns = 0; rq->dither = 0; set_rq_task(rq, &init_task); rq->iso_ticks = 0; diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h index 3af2ad2..4e3115d 100644 --- a/kernel/sched/MuQSS.h +++ b/kernel/sched/MuQSS.h @@ -40,8 +40,8 @@ struct rq { int rq_smt_bias; /* Policy/nice level bias across smt siblings */ #endif /* Accurate timekeeping data */ - unsigned long user_pc, nice_pc, irq_pc, softirq_pc, system_pc, - iowait_pc, idle_pc; + unsigned long user_ns, nice_ns, irq_ns, softirq_ns, system_ns, + iowait_ns, idle_ns; atomic_t nr_iowait; skiplist_node node; -- 2.7.4
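For reference, the old code converted each accounting interval into a pseudo-percentage of a tick in 128 steps (NS_TO_PC), so any interval shorter than 1/128th of a tick, roughly 78us at HZ=100, rounded down to zero and contributed nothing to the banked utime/stime/cpustat counters; that is in the same ballpark as the "less than 100us" figure in the subject line. The new scheme instead accumulates raw nanoseconds per bucket and converts to whole jiffies only once at least one tick's worth is present, carrying the remainder forward. Below is a minimal standalone sketch of that banking pattern, not the kernel code itself: JIFFY_NS mirrors the patch's definition, while struct bucket and bank_ns() are illustrative stand-ins for the per-rq/per-task fields the patch touches.

	#include <stdio.h>

	#define HZ		250
	#define JIFFY_NS	(1073741824 / HZ)	/* ~2^30 ns per second, as in the patch */

	struct bucket {
		unsigned long unbanked_ns;	/* sub-tick remainder carried between calls */
		unsigned long long jiffies;	/* whole ticks banked so far */
	};

	/* Add ns of runtime; bank whole jiffies, keep the sub-tick remainder. */
	static void bank_ns(struct bucket *b, unsigned long ns)
	{
		b->unbanked_ns += ns;
		if (b->unbanked_ns >= JIFFY_NS) {
			b->jiffies += b->unbanked_ns / JIFFY_NS;
			b->unbanked_ns %= JIFFY_NS;
		}
	}

	int main(void)
	{
		struct bucket user = { 0, 0 };
		int i;

		/* 100 short runs of 50us each still add up to whole jiffies. */
		for (i = 0; i < 100; i++)
			bank_ns(&user, 50000);

		printf("banked %llu jiffies, %lu ns unbanked\n",
		       user.jiffies, user.unbanked_ns);
		return 0;
	}

With the old pseudo-percentage scheme, runs this short could be dropped entirely; with banking, nothing is lost and whole jiffies are accounted exactly as they accumulate.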
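One further note on the constants: the jiffy conversion macros are redefined with powers of two rather than powers of ten (1073741824 = 2^30 in place of 1000000000, 1048576 = 2^20 in place of 1000000), extending the file's existing convention (MS_TO_NS/NS_TO_MS already use shifts) described by the comment about approximate multiples of ten. Because NS_TO_JIFFIES() divides by the same JIFFY_NS that the banking code compares against, the accounting remains self-consistent; the trade-off is that one banked jiffy of cputime corresponds to roughly 7% more elapsed time than a true 1/HZ tick. A small illustration, assuming HZ=100 (the macro usage mirrors the patch; the sanity check itself is illustrative only):

	#define HZ			100
	#define JIFFY_NS		(1073741824 / HZ)	/* ~10737418 ns, vs a true 10000000 ns tick */
	#define NS_TO_JIFFIES(TIME)	((TIME) / JIFFY_NS)

	/* 10ms of real runtime is not yet a full "jiffy" at this scale:    */
	/* NS_TO_JIFFIES(10000000) == 0; a jiffy is banked after ~10.74ms.  */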