diff -Naurp linux-2.6.0-test4-mm5-V10/arch/i386/kernel/smpboot.c linux-2.6.0-test4-mm5-O20/arch/i386/kernel/smpboot.c --- linux-2.6.0-test4-mm5-V10/arch/i386/kernel/smpboot.c 2003-09-04 22:21:37.000000000 +1000 +++ linux-2.6.0-test4-mm5-O20/arch/i386/kernel/smpboot.c 2003-09-04 22:37:28.000000000 +1000 @@ -915,13 +915,13 @@ static void smp_tune_scheduling (void) cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth; } - cache_decay_ticks = (long)cacheflush_time/cpu_khz * HZ / 1000; + cache_decay_ticks = (long)cacheflush_time/cpu_khz + 1; printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n", (long)cacheflush_time/(cpu_khz/1000), ((long)cacheflush_time*100/(cpu_khz/1000)) % 100); printk("task migration cache decay timeout: %ld msecs.\n", - (cache_decay_ticks + 1) * 1000 / HZ); + cache_decay_ticks); } /* diff -Naurp linux-2.6.0-test4-mm5-V10/arch/i386/kernel/timers/timer_tsc.c linux-2.6.0-test4-mm5-O20/arch/i386/kernel/timers/timer_tsc.c --- linux-2.6.0-test4-mm5-V10/arch/i386/kernel/timers/timer_tsc.c 2003-09-04 22:22:55.000000000 +1000 +++ linux-2.6.0-test4-mm5-O20/arch/i386/kernel/timers/timer_tsc.c 2003-09-04 22:38:45.000000000 +1000 @@ -125,6 +125,30 @@ static unsigned long long monotonic_cloc return base + cycles_2_ns(this_offset - last_offset); } +/* + * Scheduler clock - returns current time in nanosec units. + */ +unsigned long long sched_clock(void) +{ + unsigned long long this_offset; + + /* + * In the NUMA case we dont use the TSC as they are not + * synchronized across all CPUs. + */ +#ifndef CONFIG_NUMA + if (unlikely(!cpu_has_tsc)) +#endif + return (unsigned long long)jiffies * (1000000000 / HZ); + + /* Read the Time Stamp Counter */ + rdtscll(this_offset); + + /* return the value in ns */ + return cycles_2_ns(this_offset); +} + + static void mark_offset_tsc(void) { unsigned long lost,delay; diff -Naurp linux-2.6.0-test4-mm5-V10/arch/ia64/kernel/time.c linux-2.6.0-test4-mm5-O20/arch/ia64/kernel/time.c --- linux-2.6.0-test4-mm5-V10/arch/ia64/kernel/time.c 2003-09-04 22:21:37.000000000 +1000 +++ linux-2.6.0-test4-mm5-O20/arch/ia64/kernel/time.c 2003-09-04 22:37:28.000000000 +1000 @@ -61,6 +61,14 @@ do_profile (unsigned long ip) atomic_inc((atomic_t *) &prof_buffer[ip]); } +unsigned long long +sched_clock (void) +{ + unsigned long offset = ia64_get_itc(); + + return (offset * local_cpu_data->nsec_per_cyc) >> IA64_NSEC_PER_CYC_SHIFT; +} + static void itc_reset (void) { diff -Naurp linux-2.6.0-test4-mm5-V10/arch/ppc/kernel/time.c linux-2.6.0-test4-mm5-O20/arch/ppc/kernel/time.c --- linux-2.6.0-test4-mm5-V10/arch/ppc/kernel/time.c 2003-09-04 22:21:36.000000000 +1000 +++ linux-2.6.0-test4-mm5-O20/arch/ppc/kernel/time.c 2003-09-04 22:38:00.000000000 +1000 @@ -83,6 +83,7 @@ time_t last_rtc_update; unsigned tb_ticks_per_jiffy; unsigned tb_to_us; unsigned tb_last_stamp; +unsigned long tb_to_ns_scale; extern unsigned long wall_jiffies; @@ -309,6 +310,7 @@ void __init time_init(void) tb_to_us = 0x418937; } else { ppc_md.calibrate_decr(); + tb_to_ns_scale = mulhwu(tb_to_us, 1000 << 10); } /* Now that the decrementer is calibrated, it can be used in case the @@ -432,3 +434,26 @@ unsigned mulhwu_scale_factor(unsigned in return mlt; } +unsigned long long sched_clock(void) +{ + unsigned long lo, hi, hi2; + unsigned long long tb; + + if (!__USE_RTC()) { + do { + hi = get_tbu(); + lo = get_tbl(); + hi2 = get_tbu(); + } while (hi2 != hi); + tb = ((unsigned long long) hi << 32) | lo; + tb = (tb * tb_to_ns_scale) >> 10; + } else { + do { + hi = get_rtcu(); + lo = get_rtcl(); + hi2 = get_rtcu(); + } while (hi2 != hi); + tb = ((unsigned long long) hi) * 1000000000 + lo; + } + return tb; +} diff -Naurp linux-2.6.0-test4-mm5-V10/arch/ppc64/kernel/time.c linux-2.6.0-test4-mm5-O20/arch/ppc64/kernel/time.c --- linux-2.6.0-test4-mm5-V10/arch/ppc64/kernel/time.c 2003-09-04 22:21:39.000000000 +1000 +++ linux-2.6.0-test4-mm5-O20/arch/ppc64/kernel/time.c 2003-09-04 22:38:05.000000000 +1000 @@ -307,6 +307,15 @@ int timer_interrupt(struct pt_regs * reg return 1; } +/* + * Scheduler clock - returns current time in nanosec units. + * + * This is wrong, but my CPUs run at 1GHz, so nyer nyer. + */ +unsigned long long sched_clock(void) +{ + return get_tb(); +} /* * This version of gettimeofday has microsecond resolution. diff -Naurp linux-2.6.0-test4-mm5-V10/arch/sparc64/kernel/time.c linux-2.6.0-test4-mm5-O20/arch/sparc64/kernel/time.c --- linux-2.6.0-test4-mm5-V10/arch/sparc64/kernel/time.c 2003-09-04 22:21:40.000000000 +1000 +++ linux-2.6.0-test4-mm5-O20/arch/sparc64/kernel/time.c 2003-09-04 22:38:10.000000000 +1000 @@ -416,6 +416,7 @@ unsigned long timer_tick_offset; unsigned long timer_tick_compare; static unsigned long timer_ticks_per_usec_quotient; +static unsigned long timer_ticks_per_nsec_quotient; #define TICK_SIZE (tick_nsec / 1000) @@ -1051,12 +1052,18 @@ static struct notifier_block sparc64_cpu #endif /* The quotient formula is taken from the IA64 port. */ +#define SPARC64_USEC_PER_CYC_SHIFT 30UL +#define SPARC64_NSEC_PER_CYC_SHIFT 30UL void __init time_init(void) { unsigned long clock = sparc64_init_timers(timer_interrupt); timer_ticks_per_usec_quotient = - (((1000000UL << 30) + + (((1000000UL << SPARC64_USEC_PER_CYC_SHIFT) + + (clock / 2)) / clock); + + timer_ticks_per_nsec_quotient = + (((NSEC_PER_SEC << SPARC64_NSEC_PER_CYC_SHIFT) + (clock / 2)) / clock); #ifdef CONFIG_CPU_FREQ @@ -1072,7 +1079,16 @@ static __inline__ unsigned long do_getti ticks += timer_tick_offset; ticks -= timer_tick_compare; - return (ticks * timer_ticks_per_usec_quotient) >> 30UL; + return (ticks * timer_ticks_per_usec_quotient) + >> SPARC64_USEC_PER_CYC_SHIFT; +} + +unsigned long long sched_clock(void) +{ + unsigned long ticks = tick_ops->get_tick(); + + return (ticks * timer_ticks_per_nsec_quotient) + >> SPARC64_NSEC_PER_CYC_SHIFT; } int do_settimeofday(struct timespec *tv) diff -Naurp linux-2.6.0-test4-mm5-V10/arch/x86_64/kernel/time.c linux-2.6.0-test4-mm5-O20/arch/x86_64/kernel/time.c --- linux-2.6.0-test4-mm5-V10/arch/x86_64/kernel/time.c 2003-09-04 22:22:56.000000000 +1000 +++ linux-2.6.0-test4-mm5-O20/arch/x86_64/kernel/time.c 2003-09-04 22:38:24.000000000 +1000 @@ -370,6 +370,19 @@ static irqreturn_t timer_interrupt(int i return IRQ_HANDLED; } +/* RED-PEN: calculation is done in 32bits with multiply for performance + and could overflow, it may be better (but slower)to use an 64bit division. */ +unsigned long long sched_clock(void) +{ + unsigned long a; + + if (__vxtime.mode == VXTIME_HPET) + return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> 32; + + rdtscll(a); + return (a * vxtime.tsc_quot) >> 32; +} + unsigned long get_cmos_time(void) { unsigned int timeout, year, mon, day, hour, min, sec; diff -Naurp linux-2.6.0-test4-mm5-V10/fs/proc/array.c linux-2.6.0-test4-mm5-O20/fs/proc/array.c --- linux-2.6.0-test4-mm5-V10/fs/proc/array.c 2003-09-04 22:21:33.000000000 +1000 +++ linux-2.6.0-test4-mm5-O20/fs/proc/array.c 2003-09-04 22:38:32.000000000 +1000 @@ -154,13 +154,16 @@ static inline char * task_state(struct t read_lock(&tasklist_lock); buffer += sprintf(buffer, "State:\t%s\n" + "SleepAVG:\t%lu%%\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" "TracerPid:\t%d\n" "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", - get_task_state(p), p->tgid, + get_task_state(p), + (p->sleep_avg/1024)*100/(1000000000/1024), + p->tgid, p->pid, p->pid ? p->real_parent->pid : 0, p->pid && p->ptrace ? p->parent->pid : 0, p->uid, p->euid, p->suid, p->fsuid, diff -Naurp linux-2.6.0-test4-mm5-V10/include/asm-ppc/time.h linux-2.6.0-test4-mm5-O20/include/asm-ppc/time.h --- linux-2.6.0-test4-mm5-V10/include/asm-ppc/time.h 2003-09-04 22:21:55.000000000 +1000 +++ linux-2.6.0-test4-mm5-O20/include/asm-ppc/time.h 2003-09-04 22:38:00.000000000 +1000 @@ -97,6 +97,13 @@ extern __inline__ unsigned long get_rtcl return rtcl; } +extern __inline__ unsigned long get_rtcu(void) +{ + unsigned long rtcu; + asm volatile("mfrtcu %0" : "=r" (rtcu)); + return rtcu; +} + extern __inline__ unsigned get_native_tbl(void) { if (__USE_RTC()) return get_rtcl(); @@ -140,6 +147,7 @@ extern __inline__ unsigned binary_tbl(vo #endif /* Use mulhwu to scale processor timebase to timeval */ +/* Specifically, this computes (x * y) / 2^32. -- paulus */ #define mulhwu(x,y) \ ({unsigned z; asm ("mulhwu %0,%1,%2" : "=r" (z) : "r" (x), "r" (y)); z;}) diff -Naurp linux-2.6.0-test4-mm5-V10/include/linux/sched.h linux-2.6.0-test4-mm5-O20/include/linux/sched.h --- linux-2.6.0-test4-mm5-V10/include/linux/sched.h 2003-09-04 22:23:00.000000000 +1000 +++ linux-2.6.0-test4-mm5-O20/include/linux/sched.h 2003-09-04 22:39:35.000000000 +1000 @@ -342,17 +342,14 @@ struct task_struct { struct list_head run_list; prio_array_t *array; - /* Scheduler variables follow. kernel/sched.c */ - unsigned long array_sequence; - unsigned long timestamp; - - unsigned long total_time, sleep_time; unsigned long sleep_avg; + long interactive_credit; + unsigned long long timestamp; + int activated; unsigned long policy; cpumask_t cpus_allowed; unsigned int time_slice, first_time_slice; - unsigned int used_slice; struct list_head tasks; struct list_head ptrace_children; @@ -519,6 +516,8 @@ static inline int set_cpus_allowed(task_ } #endif +extern unsigned long long sched_clock(void); + #ifdef CONFIG_NUMA extern void sched_balance_exec(void); extern void node_nr_running_init(void); @@ -573,7 +572,6 @@ extern int FASTCALL(wake_up_state(struct extern int FASTCALL(wake_up_process(struct task_struct * tsk)); extern int FASTCALL(wake_up_process_kick(struct task_struct * tsk)); extern void FASTCALL(wake_up_forked_process(struct task_struct * tsk)); -extern void FASTCALL(sched_fork(task_t * p)); extern void FASTCALL(sched_exit(task_t * p)); asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru); diff -Naurp linux-2.6.0-test4-mm5-V10/kernel/fork.c linux-2.6.0-test4-mm5-O20/kernel/fork.c --- linux-2.6.0-test4-mm5-V10/kernel/fork.c 2003-09-04 22:23:00.000000000 +1000 +++ linux-2.6.0-test4-mm5-O20/kernel/fork.c 2003-09-04 22:37:28.000000000 +1000 @@ -923,9 +923,33 @@ struct task_struct *copy_process(unsigne p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); p->pdeath_signal = 0; - /* Perform scheduler related accounting */ - sched_fork(p); - + /* + * Share the timeslice between parent and child, thus the + * total amount of pending timeslices in the system doesn't change, + * resulting in more scheduling fairness. + */ + local_irq_disable(); + p->time_slice = (current->time_slice + 1) >> 1; + /* + * The remainder of the first timeslice might be recovered by + * the parent if the child exits early enough. + */ + p->first_time_slice = 1; + current->time_slice >>= 1; + p->timestamp = sched_clock(); + if (!current->time_slice) { + /* + * This case is rare, it happens when the parent has only + * a single jiffy left from its timeslice. Taking the + * runqueue lock is not a problem. + */ + current->time_slice = 1; + preempt_disable(); + scheduler_tick(0, 0); + local_irq_enable(); + preempt_enable(); + } else + local_irq_enable(); /* * Ok, add it to the run-queues and make it * visible to the rest of the system. diff -Naurp linux-2.6.0-test4-mm5-V10/kernel/sched.c linux-2.6.0-test4-mm5-O20/kernel/sched.c --- linux-2.6.0-test4-mm5-V10/kernel/sched.c 2003-09-04 22:23:00.000000000 +1000 +++ linux-2.6.0-test4-mm5-O20/kernel/sched.c 2003-09-04 22:41:07.000000000 +1000 @@ -58,52 +58,119 @@ #define USER_PRIO(p) ((p)-MAX_RT_PRIO) #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) +#define AVG_TIMESLICE (MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) *\ + (MAX_PRIO-1-NICE_TO_PRIO(0))/(MAX_USER_PRIO - 1))) /* - * MIN_TIMESLICE is the timeslice that a minimum priority process gets if there - * is a maximum priority process runnable. MAX_TIMESLICE is derived from the - * formula in task_timeslice. It cannot be changed here. It is the timesilce - * that the maximum priority process will get. Larger timeslices are attainable - * by low priority processes however. + * Some helpers for converting nanosecond timing to jiffy resolution */ -#define MIN_TIMESLICE ((1 * HZ / 1000) ? 1 * HZ / 1000 : 1) -#define MAX_TIMESLICE (40 * MIN_TIMESLICE) /* do not change */ - -/* Maximum amount of history that will be used to calculate priority */ -#define MAX_SLEEP (HZ) +#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) +#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) /* - * Maximum affect that 1 block of activity (run/sleep/etc) can have. This is - * will moderate dicard freak events (eg. SIGSTOP) - */ -#define MAX_SLEEP_AFFECT (MAX_SLEEP/2) + * These are the 'tuning knobs' of the scheduler: + * + * Minimum timeslice is 10 msecs, default timeslice is 100 msecs, + * maximum timeslice is 200 msecs. Timeslices get refilled after + * they expire. + */ +#define MIN_TIMESLICE ( 10 * HZ / 1000) +#define MAX_TIMESLICE (200 * HZ / 1000) +#define ON_RUNQUEUE_WEIGHT 30 +#define CHILD_PENALTY 95 +#define PARENT_PENALTY 100 +#define EXIT_WEIGHT 3 +#define PRIO_BONUS_RATIO 25 +#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) +#define INTERACTIVE_DELTA 2 +#define MAX_SLEEP_AVG (AVG_TIMESLICE * MAX_BONUS) +#define STARVATION_LIMIT (MAX_SLEEP_AVG) +#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) +#define NODE_THRESHOLD 125 +#define CREDIT_LIMIT 100 /* - * The amount of history can be decreased (on fork for example). This puts a - * lower bound on it. + * If a task is 'interactive' then we reinsert it in the active + * array after it has expired its current timeslice. (it will not + * continue to run immediately, it will still roundrobin with + * other interactive tasks.) + * + * This part scales the interactivity limit depending on niceness. + * + * We scale it linearly, offset by the INTERACTIVE_DELTA delta. + * Here are a few examples of different nice levels: + * + * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] + * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] + * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] + * + * (the X axis represents the possible -5 ... 0 ... +5 dynamic + * priority range a task can explore, a value of '1' means the + * task is rated interactive.) + * + * Ie. nice +19 tasks can never get 'interactive' enough to be + * reinserted into the active array. And only heavily CPU-hog nice -20 + * tasks will be expired. Default nice 0 tasks are somewhere between, + * it takes some effort for them to get interactive, but it's not + * too hard. */ -#define MIN_HISTORY (HZ/20) -/* - * SLEEP_FACTOR is a fixed point factor used to scale history tracking things. - * In particular: total_time, sleep_time, sleep_avg. - */ -#define SLEEP_FACTOR (1024) +#define CURRENT_BONUS(p) \ + (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ + MAX_SLEEP_AVG) -#define NODE_THRESHOLD 125 +#ifdef CONFIG_SMP +#define TIMESLICE_GRANULARITY(p) \ + (MIN_TIMESLICE * (1 << (MAX_BONUS - CURRENT_BONUS(p))) * \ + num_online_cpus()) +#else +#define TIMESLICE_GRANULARITY(p) \ + (MIN_TIMESLICE * (1 << (MAX_BONUS - CURRENT_BONUS(p)))) +#endif + +#define SCALE(v1,v1_max,v2_max) \ + (v1) * (v2_max) / (v1_max) + +#define DELTA(p) \ + (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \ + INTERACTIVE_DELTA) + +#define TASK_INTERACTIVE(p) \ + ((p)->prio <= (p)->static_prio - DELTA(p)) + +#define JUST_INTERACTIVE_SLEEP(p) \ + (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ + (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) + +#define HIGH_CREDIT(p) \ + ((p)->interactive_credit > CREDIT_LIMIT) + +#define LOW_CREDIT(p) \ + ((p)->interactive_credit < -CREDIT_LIMIT) + +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio) /* - * The scheduler classifies a process as performing one of the following - * activities + * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ] + * to time slice values. + * + * The higher a thread's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority thread gets MIN_TIMESLICE worth of execution time. + * + * task_timeslice() is the interface that is used by the scheduler. */ -#define STIME_SLEEP 1 /* Sleeping */ -#define STIME_RUN 2 /* Using CPU */ -#define STIME_WAIT 3 /* Waiting to be run */ - -#define TASK_PREEMPTS_CURR(p, rq) \ - ( (p)->prio < (rq)->curr->prio \ - || ((p)->prio == (rq)->curr->prio \ - && (p)->static_prio < (rq)->curr->static_prio) ) + +#define BASE_TIMESLICE(p) (MIN_TIMESLICE + \ + ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/(MAX_USER_PRIO - 1))) + +static inline unsigned int task_timeslice(task_t *p) +{ + return BASE_TIMESLICE(p); +} /* * These are the runqueue data structures: @@ -128,8 +195,7 @@ struct prio_array { */ struct runqueue { spinlock_t lock; - unsigned long array_sequence; - unsigned long nr_running, nr_switches, + unsigned long nr_running, nr_switches, expired_timestamp, nr_uninterruptible; task_t *curr, *idle; struct mm_struct *prev_mm; @@ -270,94 +336,33 @@ static inline void enqueue_task(struct t } /* - * add_task_time updates a task @p after @time of doing the specified @type - * of activity. See STIME_*. This is used for priority calculation. - */ -static void add_task_time(task_t *p, unsigned long time, unsigned long type) -{ - unsigned long r; - - if (time == 0) - return; - - if (time > MAX_SLEEP_AFFECT) - time = MAX_SLEEP_AFFECT; - - r = MAX_SLEEP - time; - p->total_time = (r*p->total_time + MAX_SLEEP/2) / MAX_SLEEP; - p->sleep_time = (r*p->sleep_time + MAX_SLEEP/2) / MAX_SLEEP; - - if (type != STIME_WAIT) { - p->total_time += SLEEP_FACTOR * time; - if (type == STIME_SLEEP) - p->sleep_time += SLEEP_FACTOR * time; - - p->sleep_avg = (SLEEP_FACTOR * p->sleep_time) / p->total_time; - } - - if (p->total_time < SLEEP_FACTOR * MIN_HISTORY) { - p->total_time = SLEEP_FACTOR * MIN_HISTORY; - p->sleep_time = p->total_time * p->sleep_avg / SLEEP_FACTOR; - } -} - -/* - * The higher a thread's priority, the bigger timeslices - * it gets during one round of execution. But even the lowest - * priority thread gets MIN_TIMESLICE worth of execution time. + * effective_prio - return the priority that is based on the static + * priority but is modified by bonuses/penalties. * - * Timeslices are scaled, so if only low priority processes are running, - * they will all get long timeslices. - */ -static unsigned int task_timeslice(task_t *p, runqueue_t *rq) -{ - int idx, delta; - unsigned int base, timeslice; - - if (rt_task(p)) - return MAX_TIMESLICE; - - idx = min(find_next_bit(rq->active->bitmap, MAX_PRIO, MAX_RT_PRIO), - find_next_bit(rq->expired->bitmap, MAX_PRIO, MAX_RT_PRIO)); - idx = min(idx, p->prio); - delta = p->prio - idx; - - /* - * This is a bit subtle. The first line establishes a timeslice based - * on how far this task is from being the highest priority runnable. - * The second line scales this result so low priority tasks will get - * big timeslices if higher priority ones are not running. - */ - base = MIN_TIMESLICE * (MAX_USER_PRIO + 1) / (delta + 2); - timeslice = base * (USER_PRIO(idx) + 8) / 8; - - if (timeslice <= 0) - timeslice = 1; - - return timeslice; -} - -/* - * task_priority: calculates a task's priority based on previous running - * history (see add_task_time). The priority is just a simple linear function - * based on sleep_avg and static_prio. + * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] + * into the -5 ... 0 ... +5 bonus/penalty range. + * + * We use 25% of the full 0...39 priority range so that: + * + * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. + * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. + * + * Both properties are important to certain workloads. */ -static unsigned long task_priority(task_t *p) +static int effective_prio(task_t *p) { int bonus, prio; if (rt_task(p)) return p->prio; - bonus = (MAX_USER_PRIO * p->sleep_avg) / SLEEP_FACTOR / 2; - prio = USER_PRIO(p->static_prio) / 2 + (MAX_USER_PRIO / 2); + bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; - prio = MAX_RT_PRIO + prio - bonus; + prio = p->static_prio - bonus; if (prio < MAX_RT_PRIO) prio = MAX_RT_PRIO; if (prio > MAX_PRIO-1) prio = MAX_PRIO-1; - return prio; } @@ -370,6 +375,82 @@ static inline void __activate_task(task_ nr_running_inc(rq); } +static void recalc_task_prio(task_t *p, unsigned long long now) +{ + unsigned long long __sleep_time = now - p->timestamp; + unsigned long sleep_time; + + if (__sleep_time > NS_MAX_SLEEP_AVG) + sleep_time = NS_MAX_SLEEP_AVG; + else + sleep_time = (unsigned long)__sleep_time; + + if (likely(sleep_time > 0)) { + /* + * User tasks that sleep a long time are categorised as + * idle and will get just interactive status to stay active & + * prevent them suddenly becoming cpu hogs and starving + * other processes. + */ + if (p->mm && p->activated != -1 && + sleep_time > JUST_INTERACTIVE_SLEEP(p)){ + p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - + AVG_TIMESLICE); + if (!HIGH_CREDIT(p)) + p->interactive_credit++; + } else { + /* + * The lower the sleep avg a task has the more + * rapidly it will rise with sleep time. + */ + sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; + + /* + * Tasks with low interactive_credit are limited to + * one timeslice worth of sleep avg bonus. + */ + if (LOW_CREDIT(p) && + sleep_time > JIFFIES_TO_NS(task_timeslice(p))) + sleep_time = + JIFFIES_TO_NS(task_timeslice(p)); + + /* + * Non high_credit tasks waking from uninterruptible + * sleep are limited in their sleep_avg rise as they + * are likely to be cpu hogs waiting on I/O + */ + if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm){ + if (p->sleep_avg >= JUST_INTERACTIVE_SLEEP(p)) + sleep_time = 0; + else if (p->sleep_avg + sleep_time >= + JUST_INTERACTIVE_SLEEP(p)){ + p->sleep_avg = + JUST_INTERACTIVE_SLEEP(p); + sleep_time = 0; + } + } + + /* + * This code gives a bonus to interactive tasks. + * + * The boost works by updating the 'average sleep time' + * value here, based on ->timestamp. The more time a task + * spends sleeping, the higher the average gets - and the + * higher the priority boost gets as well. + */ + p->sleep_avg += sleep_time; + + if (p->sleep_avg > NS_MAX_SLEEP_AVG){ + p->sleep_avg = NS_MAX_SLEEP_AVG; + if (!HIGH_CREDIT(p)) + p->interactive_credit++; + } + } + } + + p->prio = effective_prio(p); +} + /* * activate_task - move a task to the runqueue and do priority recalculation * @@ -378,36 +459,32 @@ static inline void __activate_task(task_ */ static inline void activate_task(task_t *p, runqueue_t *rq) { - unsigned long sleep = jiffies - p->timestamp; - p->timestamp = jiffies; + unsigned long long now = sched_clock(); - if (sleep > MAX_SLEEP) - sleep = MAX_SLEEP; + recalc_task_prio(p, now); /* - * If woken by a userspace task, it is assumed the waker has done - * some work for us, so share some priority with it + * This checks to make sure it's not an uninterruptible task + * that is now waking up. */ - if (!in_interrupt() && current->mm) { - unsigned long boost = sleep / 2; - add_task_time(current, boost, STIME_SLEEP); - add_task_time(p, sleep - boost, STIME_SLEEP); - } else { - add_task_time(p, sleep, STIME_SLEEP); - + if (!p->activated){ + /* + * Tasks which were woken up by interrupts (ie. hw events) + * are most likely of interactive nature. So we give them + * the credit of extending their sleep time to the period + * of time they spend on the runqueue, waiting for execution + * on a CPU, first time around: + */ if (in_interrupt()) - add_task_time(p, sleep / 2, STIME_SLEEP); - } - - p->prio = task_priority(p); - - /* - * If we have slept through an active/expired array switch, restart - * our timeslice too. - */ - if (rq->array_sequence != p->array_sequence) { - p->used_slice = 0; - } + p->activated = 2; + else + /* + * Normal first-time wakeups get a credit too for on-runqueue + * time, but it will be weighted down: + */ + p->activated = 1; + } + p->timestamp = now; __activate_task(p, rq); } @@ -417,7 +494,6 @@ static inline void activate_task(task_t */ static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) { - p->array_sequence = rq->array_sequence; nr_running_dec(rq); if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; @@ -461,7 +537,7 @@ static inline void resched_task(task_t * * be called with interrupts off, or it may introduce deadlock with * smp_call_function() if an IPI is sent by the same process we are * waiting to become inactive. - n*/ + */ void wait_task_inactive(task_t * p) { unsigned long flags; @@ -530,10 +606,18 @@ repeat_lock_task: task_rq_unlock(rq, &flags); goto repeat_lock_task; } - if (old_state == TASK_UNINTERRUPTIBLE) + if (old_state == TASK_UNINTERRUPTIBLE){ rq->nr_uninterruptible--; - activate_task(p, rq); - if (!sync) { + /* + * Tasks on involuntary sleep don't earn + * sleep_avg beyond just interactive state. + */ + p->activated = -1; + } + if (sync) + __activate_task(p, rq); + else { + activate_task(p, rq); if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); } @@ -567,95 +651,42 @@ int wake_up_state(task_t *p, unsigned in } /* - * Perform scheduler related accounting for a newly forked process @p. - * @p is forked by current. - */ -void sched_fork(task_t *p) -{ - unsigned long ts; - unsigned long flags; - runqueue_t *rq; - - /* - * Share the timeslice between parent and child, thus the - * total amount of pending timeslices in the system doesn't change, - * resulting in more scheduling fairness. - */ - local_irq_disable(); - p->timestamp = jiffies; - rq = task_rq_lock(current, &flags); - ts = task_timeslice(current, rq); - task_rq_unlock(rq, &flags); - - /* - * Share half our timeslice with the child. - */ - p->used_slice = current->used_slice + (ts - current->used_slice) / 2; - current->used_slice += (ts - current->used_slice + 1) / 2; - - /* - * The remainder of the first timeslice might be recovered by - * the parent if the child exits early enough. - */ - p->first_time_slice = 1; - if (current->used_slice >= ts) { - /* - * This case is rare, it happens when the parent has only - * a single jiffy left from its timeslice. Taking the - * runqueue lock is not a problem. - */ - current->used_slice = ts - 1; - preempt_disable(); - scheduler_tick(0, 0); - local_irq_enable(); - preempt_enable(); - } else - local_irq_enable(); -} - -/* * wake_up_forked_process - wake up a freshly forked process. * * This function will do some initial scheduler statistics housekeeping * that must be done for every newly created process. */ -void wake_up_forked_process(task_t *p) +void wake_up_forked_process(task_t * p) { unsigned long flags; runqueue_t *rq = task_rq_lock(current, &flags); p->state = TASK_RUNNING; - - set_task_cpu(p, smp_processor_id()); - /* - * Get only 1/10th of the parents history, but at a much higher - * priority. Limited by MIN_HISTORY. + * We decrease the sleep average of forking parents + * and children as well, to keep max-interactive tasks + * from forking tasks that are max-interactive. */ - p->total_time = current->total_time / 10; - p->sleep_time = current->sleep_time / 10 - + (current->total_time - current->sleep_time) / 20; - if (p->total_time != 0) - p->sleep_avg = (SLEEP_FACTOR * p->sleep_time) / p->total_time; - else - p->sleep_avg = SLEEP_FACTOR / 2; + current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * + PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - if (p->total_time < SLEEP_FACTOR * MIN_HISTORY) { - p->total_time = SLEEP_FACTOR * MIN_HISTORY; - p->sleep_time = p->total_time * p->sleep_avg / SLEEP_FACTOR; - } + p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * + CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - /* - * Lose 1/4 sleep_time for forking. - */ - current->sleep_time = 3 * current->sleep_time / 4; - if (current->total_time != 0) - current->sleep_avg = (SLEEP_FACTOR * current->sleep_time) - / current->total_time; + p->interactive_credit = 0; - p->prio = task_priority(p); - __activate_task(p, rq); + p->prio = effective_prio(p); + set_task_cpu(p, smp_processor_id()); + if (unlikely(!current->array)) + __activate_task(p, rq); + else { + p->prio = current->prio; + list_add_tail(&p->run_list, ¤t->run_list); + p->array = current->array; + p->array->nr_active++; + nr_running_inc(rq); + } task_rq_unlock(rq, &flags); } @@ -673,24 +704,20 @@ void sched_exit(task_t * p) unsigned long flags; local_irq_save(flags); - - /* Regain the unused timeslice given to @p by its parent */ if (p->first_time_slice) { - unsigned long flags; - runqueue_t *rq; - rq = task_rq_lock(p, &flags); - p->parent->used_slice -= task_timeslice(p, rq) - p->used_slice; - task_rq_unlock(rq, &flags); + p->parent->time_slice += p->time_slice; + if (unlikely(p->parent->time_slice > MAX_TIMESLICE)) + p->parent->time_slice = MAX_TIMESLICE; } - - /* Apply some penalty to @p's parent if @p used a lot of CPU */ - if (p->sleep_avg < p->parent->sleep_avg) { - add_task_time(p->parent, - (p->parent->sleep_avg - p->sleep_avg)/2, - STIME_RUN); - } - local_irq_restore(flags); + /* + * If the child was a (relative-) CPU hog then decrease + * the sleep_avg of the parent as well. + */ + if (p->sleep_avg < p->parent->sleep_avg) + p->parent->sleep_avg = p->parent->sleep_avg / + (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / + (EXIT_WEIGHT + 1); } /** @@ -1097,34 +1124,25 @@ static inline void pull_task(runqueue_t } /* - * can_migrate_task - * May task @p from runqueue @rq be migrated to @this_cpu? - * @idle: Is this_cpu idle - * Returns: 1 if @p may be migrated, 0 otherwise. + * Previously: + * + * #define CAN_MIGRATE_TASK(p,rq,this_cpu) \ + * ((!idle || (NS_TO_JIFFIES(now - (p)->timestamp) > \ + * cache_decay_ticks)) && !task_running(rq, p) && \ + * cpu_isset(this_cpu, (p)->cpus_allowed)) */ + static inline int -can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, int idle) +can_migrate_task(task_t *tsk, runqueue_t *rq, int this_cpu, int idle) { - unsigned long delta; + unsigned long delta = sched_clock() - tsk->timestamp; - /* - * We do not migrate tasks that are: - * 1) running (obviously), or - * 2) cannot be migrated to this CPU due to cpus_allowed, or - * 3) are cache-hot on their current CPU. - */ - - if (task_running(rq, p)) + if (!idle && (delta <= JIFFIES_TO_NS(cache_decay_ticks))) return 0; - - if (!cpu_isset(this_cpu, p->cpus_allowed)) + if (task_running(rq, tsk)) return 0; - - /* Aggressive migration if we're idle */ - delta = jiffies - p->timestamp; - if (!idle && (delta <= cache_decay_ticks)) + if (!cpu_isset(this_cpu, tsk->cpus_allowed)) return 0; - return 1; } @@ -1186,6 +1204,13 @@ skip_bitmap: skip_queue: tmp = list_entry(curr, task_t, run_list); + /* + * We do not migrate tasks that are: + * 1) running (obviously), or + * 2) cannot be migrated to this CPU due to cpus_allowed, or + * 3) are cache-hot on their current CPU. + */ + curr = curr->prev; if (!can_migrate_task(tmp, busiest, this_cpu, idle)) { @@ -1195,8 +1220,6 @@ skip_queue: goto skip_bitmap; } pull_task(busiest, array, tmp, this_rq, this_cpu); - - /* Only migrate 1 task if we're idle */ if (!idle && --imbalance) { if (curr != head) goto skip_queue; @@ -1265,16 +1288,17 @@ static void rebalance_tick(runqueue_t *t load_balance(this_rq, idle, cpu_to_node_mask(this_cpu)); spin_unlock(&this_rq->lock); } - return; - } + + } else { #ifdef CONFIG_NUMA - if (!(j % BUSY_NODE_REBALANCE_TICK)) - balance_node(this_rq, idle, this_cpu); + if (!(j % BUSY_NODE_REBALANCE_TICK)) + balance_node(this_rq, idle, this_cpu); #endif - if (!(j % BUSY_REBALANCE_TICK)) { - spin_lock(&this_rq->lock); - load_balance(this_rq, idle, cpu_to_node_mask(this_cpu)); - spin_unlock(&this_rq->lock); + if (!(j % BUSY_REBALANCE_TICK)) { + spin_lock(&this_rq->lock); + load_balance(this_rq, idle, cpu_to_node_mask(this_cpu)); + spin_unlock(&this_rq->lock); + } } } #else @@ -1291,6 +1315,20 @@ DEFINE_PER_CPU(struct kernel_stat, kstat EXPORT_PER_CPU_SYMBOL(kstat); /* + * We place interactive tasks back into the active array, if possible. + * + * To guarantee that this does not starve expired tasks we ignore the + * interactivity of a task if the first expired task had to wait more + * than a 'reasonable' amount of time. This deadline timeout is + * load-dependent, as the frequency of array switched decreases with + * increasing number of running tasks: + */ +#define EXPIRED_STARVING(rq) \ + (STARVATION_LIMIT && ((rq)->expired_timestamp && \ + (jiffies - (rq)->expired_timestamp >= \ + STARVATION_LIMIT * ((rq)->nr_running) + 1))) + +/* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. * @@ -1307,11 +1345,17 @@ void scheduler_tick(int user_ticks, int if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_ticks); + /* note: this timer irq context must be accounted for as well */ + if (hardirq_count() - HARDIRQ_OFFSET) { + cpustat->irq += sys_ticks; + sys_ticks = 0; + } else if (softirq_count()) { + cpustat->softirq += sys_ticks; + sys_ticks = 0; + } + if (p == rq->idle) { - /* note: this timer irq context must be accounted for as well */ - if (irq_count() - HARDIRQ_OFFSET >= SOFTIRQ_OFFSET) - cpustat->system += sys_ticks; - else if (atomic_read(&rq->nr_iowait) > 0) + if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait += sys_ticks; else cpustat->idle += sys_ticks; @@ -1334,37 +1378,65 @@ void scheduler_tick(int user_ticks, int * The task was running during this tick - update the * time slice counter. Note: we do not update a thread's * priority until it either goes to sleep or uses up its - * timeslice. + * timeslice. This makes it possible for interactive tasks + * to use up their timeslices at their highest priority levels. */ if (unlikely(rt_task(p))) { /* * RR tasks need a special form of timeslice management. * FIFO tasks have no timeslices. */ - if (p->policy == SCHED_RR) { - p->used_slice++; - if (p->used_slice >= task_timeslice(p, rq)) { - p->used_slice = 0; - p->first_time_slice = 0; - set_tsk_need_resched(p); - - /* put it at the end of the queue: */ - dequeue_task(p, rq->active); - enqueue_task(p, rq->active); - } + if ((p->policy == SCHED_RR) && !--p->time_slice) { + p->time_slice = task_timeslice(p); + p->first_time_slice = 0; + set_tsk_need_resched(p); + + /* put it at the end of the queue: */ + dequeue_task(p, rq->active); + enqueue_task(p, rq->active); } goto out_unlock; } - - p->used_slice++; - if (p->used_slice >= task_timeslice(p, rq)) { + if (!--p->time_slice) { dequeue_task(p, rq->active); set_tsk_need_resched(p); - p->prio = task_priority(p); - p->used_slice = 0; + p->prio = effective_prio(p); + p->time_slice = task_timeslice(p); p->first_time_slice = 0; - enqueue_task(p, rq->expired); + if (!rq->expired_timestamp) + rq->expired_timestamp = jiffies; + if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { + enqueue_task(p, rq->expired); + } else + enqueue_task(p, rq->active); + } else { + /* + * Prevent a too long timeslice allowing a task to monopolize + * the CPU. We do this by splitting up the timeslice into + * smaller pieces. + * + * Note: this does not mean the task's timeslices expire or + * get lost in any way, they just might be preempted by + * another task of equal priority. (one with higher + * priority would have preempted this task already.) We + * requeue this task to the end of the list on this priority + * level, which is in essence a round-robin of tasks with + * equal priority. + * + * This only applies to tasks in the interactive + * delta range with at least MIN_TIMESLICE to requeue. + */ + if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - + p->time_slice) % TIMESLICE_GRANULARITY(p)) && + (p->time_slice >= (MIN_TIMESLICE * 2)) && + (p->array == rq->active)) { + + dequeue_task(p, rq->active); + set_tsk_need_resched(p); + p->prio = effective_prio(p); + enqueue_task(p, rq->active); + } } out_unlock: spin_unlock(&rq->lock); @@ -1383,7 +1455,7 @@ asmlinkage void schedule(void) runqueue_t *rq; prio_array_t *array; struct list_head *queue; - unsigned long now; + unsigned long long now; unsigned long run_time; int idx; @@ -1405,10 +1477,19 @@ need_resched: rq = this_rq(); release_kernel_lock(prev); - now = jiffies; - run_time = now - prev->timestamp; + now = sched_clock(); + if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG)) + run_time = now - prev->timestamp; + else + run_time = NS_MAX_SLEEP_AVG; - add_task_time(prev, run_time, STIME_RUN); + /* + * Tasks with interactive credits get charged less run_time + * at high sleep_avg to delay them losing their interactive + * status + */ + if (HIGH_CREDIT(prev)) + run_time /= (CURRENT_BONUS(prev) ? : 1); spin_lock_irq(&rq->lock); @@ -1440,6 +1521,7 @@ pick_next_task: goto pick_next_task; #endif next = rq->idle; + rq->expired_timestamp = 0; goto switch_tasks; } @@ -1448,24 +1530,42 @@ pick_next_task: /* * Switch the active and expired arrays. */ - rq->array_sequence++; rq->active = rq->expired; rq->expired = array; array = rq->active; + rq->expired_timestamp = 0; } idx = sched_find_first_bit(array->bitmap); queue = array->queue + idx; next = list_entry(queue->next, task_t, run_list); + if (next->activated > 0) { + unsigned long long delta = now - next->timestamp; + + if (next->activated == 1) + delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; + + array = next->array; + dequeue_task(next, array); + recalc_task_prio(next, next->timestamp + delta); + enqueue_task(next, array); + } + next->activated = 0; switch_tasks: prefetch(next); clear_tsk_need_resched(prev); RCU_qsctr(task_cpu(prev))++; + prev->sleep_avg -= run_time; + if ((long)prev->sleep_avg <= 0){ + prev->sleep_avg = 0; + if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev))) + prev->interactive_credit--; + } prev->timestamp = now; + if (likely(prev != next)) { - add_task_time(next, now - next->timestamp, STIME_WAIT); next->timestamp = now; rq->nr_switches++; rq->curr = next; @@ -2263,8 +2363,6 @@ asmlinkage long sys_sched_rr_get_interva int retval = -EINVAL; struct timespec t; task_t *p; - unsigned long flags; - runqueue_t *rq; if (pid < 0) goto out_nounlock; @@ -2279,10 +2377,8 @@ asmlinkage long sys_sched_rr_get_interva if (retval) goto out_unlock; - rq = task_rq_lock(p, &flags); jiffies_to_timespec(p->policy & SCHED_FIFO ? - 0 : task_timeslice(p, rq), &t); - task_rq_unlock(rq, &flags); + 0 : task_timeslice(p), &t); read_unlock(&tasklist_lock); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; out_nounlock: @@ -2502,6 +2598,12 @@ static void move_task_away(struct task_s local_irq_restore(flags); } +typedef struct { + int cpu; + struct completion startup_done; + task_t *task; +} migration_startup_t; + /* * migration_thread - this is a highprio system thread that performs * thread migration by bumping thread off CPU then 'pushing' onto @@ -2511,20 +2613,21 @@ static int migration_thread(void * data) { /* Marking "param" __user is ok, since we do a set_fs(KERNEL_DS); */ struct sched_param __user param = { .sched_priority = MAX_RT_PRIO-1 }; - int cpu = (long) data; + migration_startup_t *startup = data; + int cpu = startup->cpu; runqueue_t *rq; int ret; + startup->task = current; + complete(&startup->startup_done); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + + BUG_ON(smp_processor_id() != cpu); + daemonize("migration/%d", cpu); set_fs(KERNEL_DS); - /* - * Either we are running on the right CPU, or there's a a - * migration thread on this CPU, guaranteed (we're started - * serially). - */ - set_cpus_allowed(current, cpumask_of_cpu(cpu)); - ret = setscheduler(0, SCHED_FIFO, ¶m); rq = this_rq(); @@ -2560,13 +2663,30 @@ static int migration_call(struct notifie unsigned long action, void *hcpu) { + long cpu = (long) hcpu; + migration_startup_t startup; + switch (action) { case CPU_ONLINE: - printk("Starting migration thread for cpu %li\n", - (long)hcpu); - kernel_thread(migration_thread, hcpu, CLONE_KERNEL); - while (!cpu_rq((long)hcpu)->migration_thread) + + printk("Starting migration thread for cpu %li\n", cpu); + + startup.cpu = cpu; + startup.task = NULL; + init_completion(&startup.startup_done); + + kernel_thread(migration_thread, &startup, CLONE_KERNEL); + wait_for_completion(&startup.startup_done); + wait_task_inactive(startup.task); + + startup.task->thread_info->cpu = cpu; + startup.task->cpus_allowed = cpumask_of_cpu(cpu); + + wake_up_process(startup.task); + + while (!cpu_rq(cpu)->migration_thread) yield(); + break; } return NOTIFY_OK;