Index: linux-2.6.9-rc2-mm2/fs/proc/array.c =================================================================== --- linux-2.6.9-rc2-mm2.orig/fs/proc/array.c 2004-09-23 09:59:23.268850104 +1000 +++ linux-2.6.9-rc2-mm2/fs/proc/array.c 2004-09-23 10:00:41.383974800 +1000 @@ -162,6 +162,7 @@ static inline char * task_state(struct t read_lock(&tasklist_lock); buffer += sprintf(buffer, "State:\t%s\n" + "SleepAVG:\t%lu%%\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" @@ -169,6 +170,7 @@ static inline char * task_state(struct t "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), + (p->sleep_avg/1024)*100/(1020000000/1024), p->tgid, p->pid, p->pid ? p->real_parent->pid : 0, p->pid && p->ptrace ? p->parent->pid : 0, @@ -468,25 +470,3 @@ int proc_pid_statm(struct task_struct *t return sprintf(buffer,"%d %d %d %d %d %d %d\n", size, resident, shared, text, lib, data, 0); } - -int task_cpu_sched_stats(struct task_struct *p, char *buffer) -{ - struct task_sched_stats stats; - unsigned long nvcsw, nivcsw; /* context switch counts */ - - read_lock(&tasklist_lock); - get_task_sched_stats(p, &stats); - nvcsw = p->nvcsw; - nivcsw = p-> nivcsw; - read_unlock(&tasklist_lock); - return sprintf(buffer, - "%llu %llu %llu %llu %llu %llu %lu %lu @ %llu\n", - stats.total_sleep, - stats.total_cpu, - stats.total_delay, - stats.total_sinbin, - stats.cycle_count, - stats.intr_wake_ups, - nvcsw, nivcsw, - stats.timestamp); -} Index: linux-2.6.9-rc2-mm2/fs/proc/base.c =================================================================== --- linux-2.6.9-rc2-mm2.orig/fs/proc/base.c 2004-09-23 09:59:23.269849952 +1000 +++ linux-2.6.9-rc2-mm2/fs/proc/base.c 2004-09-23 10:01:00.311097440 +1000 @@ -96,10 +96,6 @@ enum pid_directory_inos { #ifdef CONFIG_CPUSETS PROC_TID_CPUSET, #endif - PROC_TID_CPU_STATS, - PROC_TID_CPU_RATE_CAP, - PROC_TID_CPU_RATE_HARD_CAP, - PROC_TID_CPU_SHARES, #ifdef CONFIG_SECURITY PROC_TID_ATTR, PROC_TID_ATTR_CURRENT, @@ -174,10 +170,6 @@ static struct pid_entry 
tid_base_stuff[] #ifdef CONFIG_CPUSETS E(PROC_TID_CPUSET, "cpuset", S_IFREG|S_IRUGO), #endif - E(PROC_TID_CPU_STATS, "cpustats", S_IFREG|S_IRUGO), - E(PROC_TID_CPU_RATE_CAP, "cpu_rate_cap", S_IFREG|S_IRUGO|S_IWUSR), - E(PROC_TID_CPU_RATE_HARD_CAP, "cpu_rate_hard_cap", S_IFREG|S_IRUGO|S_IWUSR), - E(PROC_TID_CPU_SHARES, "cpu_shares", S_IFREG|S_IRUGO|S_IWUSR), {0,0,NULL,0} }; @@ -214,7 +206,6 @@ int proc_tid_stat(struct task_struct*,ch int proc_tgid_stat(struct task_struct*,char*); int proc_pid_status(struct task_struct*,char*); int proc_pid_statm(struct task_struct*,char*); -extern int task_cpu_sched_stats(struct task_struct *p, char *buffer); static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) { @@ -586,142 +577,6 @@ static struct file_operations proc_info_ .read = proc_info_read, }; -static ssize_t cpu_rate_cap_read(struct file * file, char * buf, - size_t count, loff_t *ppos) -{ - struct task_struct *task = proc_task(file->f_dentry->d_inode); - char buffer[64]; - size_t len; - unsigned long long hcppt = proportion_to_ppt(task->cpu_rate_cap); - - if (*ppos) - return 0; - *ppos = len = sprintf(buffer, "%llu\n", hcppt); - if (copy_to_user(buf, buffer, len)) - return -EFAULT; - - return len; -} - -static ssize_t cpu_rate_cap_write(struct file * file, const char * buf, - size_t count, loff_t *ppos) -{ - struct task_struct *task = proc_task(file->f_dentry->d_inode); - char buffer[128] = ""; - char *endptr = NULL; - unsigned long long hcppt; - int res; - - - if ((count > 63) || *ppos) - return -EFBIG; - if (copy_from_user(buffer, buf, count)) - return -EFAULT; - hcppt = simple_strtoul(buffer, &endptr, 0); - if ((endptr == buffer) || (hcppt == ULONG_MAX)) - return -EINVAL; - - if ((res = set_cpu_rate_cap(task, ppt_to_proportion(hcppt))) != 0) - return res; - - return count; -} - -static struct file_operations proc_cpu_rate_cap_operations = { - read: cpu_rate_cap_read, - write: cpu_rate_cap_write, -}; - -static ssize_t 
cpu_rate_hard_cap_read(struct file * file, char * buf, - size_t count, loff_t *ppos) -{ - struct task_struct *task = proc_task(file->f_dentry->d_inode); - char buffer[64]; - size_t len; - unsigned long long hcppt = proportion_to_ppt(task->cpu_rate_hard_cap); - - if (*ppos) - return 0; - *ppos = len = sprintf(buffer, "%llu\n", hcppt); - if (copy_to_user(buf, buffer, len)) - return -EFAULT; - - return len; -} - -static ssize_t cpu_rate_hard_cap_write(struct file * file, const char * buf, - size_t count, loff_t *ppos) -{ - struct task_struct *task = proc_task(file->f_dentry->d_inode); - char buffer[128] = ""; - char *endptr = NULL; - unsigned long long hcppt; - int res; - - - if ((count > 63) || *ppos) - return -EFBIG; - if (copy_from_user(buffer, buf, count)) - return -EFAULT; - hcppt = simple_strtoul(buffer, &endptr, 0); - if ((endptr == buffer) || (hcppt == ULONG_MAX)) - return -EINVAL; - - if ((res = set_cpu_rate_hard_cap(task, ppt_to_proportion(hcppt))) != 0) - return res; - - return count; -} - -static struct file_operations proc_cpu_rate_hard_cap_operations = { - read: cpu_rate_hard_cap_read, - write: cpu_rate_hard_cap_write, -}; - -static ssize_t cpu_shares_read(struct file * file, char * buf, - size_t count, loff_t *ppos) -{ - struct task_struct *task = proc_task(file->f_dentry->d_inode); - char buffer[64]; - size_t len; - - if (*ppos) - return 0; - *ppos = len = sprintf(buffer, "%u\n", task->eb_shares); - if (copy_to_user(buf, buffer, len)) - return -EFAULT; - - return len; -} - -static ssize_t cpu_shares_write(struct file * file, const char * buf, - size_t count, loff_t *ppos) -{ - struct task_struct *task = proc_task(file->f_dentry->d_inode); - char buffer[64] = ""; - char *endptr = NULL; - unsigned long shares; - int res; - - if ((count > 63) || *ppos) - return -EFBIG; - if (copy_from_user(buffer, buf, count)) - return -EFAULT; - shares = simple_strtoul(buffer, &endptr, 0); - if ((endptr == buffer) || (shares == ULONG_MAX)) - return -EINVAL; - - if ((res 
= set_cpu_shares(task, shares)) != 0) - return res; - - return count; -} - -static struct file_operations proc_cpu_shares_operations = { - read: cpu_shares_read, - write: cpu_shares_write, -}; - static int mem_open(struct inode* inode, struct file* file) { file->private_data = (void*)((long)current->self_exec_id); @@ -1540,19 +1395,6 @@ static struct dentry *proc_pident_lookup ei->op.proc_read = proc_pid_schedstat; break; #endif - case PROC_TID_CPU_STATS: - inode->i_fop = &proc_info_file_operations; - ei->op.proc_read = task_cpu_sched_stats; - break; - case PROC_TID_CPU_RATE_CAP: - inode->i_fop = &proc_cpu_rate_cap_operations; - break; - case PROC_TID_CPU_RATE_HARD_CAP: - inode->i_fop = &proc_cpu_rate_hard_cap_operations; - break; - case PROC_TID_CPU_SHARES: - inode->i_fop = &proc_cpu_shares_operations; - break; #ifdef CONFIG_CPUSETS case PROC_TID_CPUSET: case PROC_TGID_CPUSET: Index: linux-2.6.9-rc2-mm2/fs/proc/proc_misc.c =================================================================== --- linux-2.6.9-rc2-mm2.orig/fs/proc/proc_misc.c 2004-09-23 09:59:23.270849800 +1000 +++ linux-2.6.9-rc2-mm2/fs/proc/proc_misc.c 2004-09-23 10:00:41.387974192 +1000 @@ -271,40 +271,6 @@ static struct file_operations proc_cpuin .release = seq_release, }; -static int cpustats_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int i; - int len = 0; - struct cpu_sched_stats total = {0, }; - - for_each_online_cpu(i) { - struct cpu_sched_stats stats; - - get_cpu_sched_stats(i, &stats); - len += sprintf(page + len, "cpu%02d %llu %llu %llu %llu %llu @ %llu\n", i, - stats.total_idle, - stats.total_busy, - stats.total_delay, - stats.total_sinbin, - stats.nr_switches, - stats.timestamp); - total.total_idle += stats.total_idle; - total.total_busy += stats.total_busy; - total.total_delay += stats.total_delay; - total.total_sinbin += stats.total_sinbin; - total.nr_switches += stats.nr_switches; - } - len += sprintf(page + len, "total %llu %llu %llu %llu 
%llu\n", - total.total_idle, - total.total_busy, - total.total_delay, - total.total_sinbin, - total.nr_switches); - - return proc_calc_metrics(page, start, off, count, eof, len); -} - extern struct seq_operations vmstat_op; static int vmstat_open(struct inode *inode, struct file *file) { @@ -660,7 +626,6 @@ void __init proc_misc_init(void) {"cmdline", cmdline_read_proc}, {"locks", locks_read_proc}, {"execdomains", execdomains_read_proc}, - {"cpustats", cpustats_read_proc}, {NULL,} }; for (p = simple_ones; p->name; p++) Index: linux-2.6.9-rc2-mm2/include/linux/init_task.h =================================================================== --- linux-2.6.9-rc2-mm2.orig/include/linux/init_task.h 2004-09-23 09:59:23.544808152 +1000 +++ linux-2.6.9-rc2-mm2/include/linux/init_task.h 2004-09-23 10:00:41.388974040 +1000 @@ -68,19 +68,11 @@ extern struct group_info init_groups; { \ .state = 0, \ .thread_info = &init_thread_info, \ - .rq = NULL, \ .usage = ATOMIC_INIT(2), \ .flags = 0, \ .lock_depth = -1, \ .prio = MAX_PRIO-20, \ .static_prio = MAX_PRIO-20, \ - .pre_bonus_priority = MAX_PRIO-20, \ - .eb_shares = DEFAULT_EB_SHARES, \ - .cpu_rate_cap = PROPORTION_ONE, \ - .cpu_rate_hard_cap = PROPORTION_ONE, \ - .sinbin_timer = { \ - .function = sinbin_release_fn \ - }, \ .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ .mm = NULL, \ Index: linux-2.6.9-rc2-mm2/include/linux/sched.h =================================================================== --- linux-2.6.9-rc2-mm2.orig/include/linux/sched.h 2004-09-23 09:59:23.562805416 +1000 +++ linux-2.6.9-rc2-mm2/include/linux/sched.h 2004-09-23 10:01:40.272022456 +1000 @@ -360,7 +360,7 @@ extern struct user_struct *find_user(uid extern struct user_struct root_user; #define INIT_USER (&root_user) -typedef struct runqueue runqueue_t; +typedef struct prio_array prio_array_t; struct backing_dev_info; struct reclaim_state; @@ -590,30 +590,6 @@ int set_current_groups(struct group_info struct audit_context; /* See audit.c */ 
struct mempolicy; -/* - * For entitlemnet based scheduling a task's shares will be determined from - * their "nice"ness - */ -#define EB_SHARES_PER_NICE 5 -#define DEFAULT_EB_SHARES (20 * EB_SHARES_PER_NICE) -#define MAX_EB_SHARES (DEFAULT_EB_SHARES * DEFAULT_EB_SHARES) -/* - * CPU usage rate is estimated as a proportion of a CPU using fixed denominator - * rational numbers. The denominator must be less than or equal to 2^32 - */ -#define PROPORTION_OFFSET 24 -#define PROPORTION_ONE (1ULL << PROPORTION_OFFSET) -static inline unsigned long long proportion_to_ppt(unsigned long long proportion) -{ - return (proportion * 1000) >> PROPORTION_OFFSET; -} -unsigned long long ppt_to_proportion(unsigned long long ppt); -int set_cpu_rate_cap(struct task_struct *p, unsigned long long new_cap); -int set_cpu_rate_hard_cap(struct task_struct *p, unsigned long long new_cap); -int set_cpu_shares(struct task_struct *p, unsigned int new_shares); - -void sinbin_release_fn(unsigned long arg); - struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ struct thread_info *thread_info; @@ -625,28 +601,16 @@ struct task_struct { int prio, static_prio; struct list_head run_list; - runqueue_t *rq; + prio_array_t *array; + unsigned long sleep_avg; + long interactive_credit; unsigned long long timestamp; - - unsigned long long sched_timestamp; - unsigned long long avg_sleep_per_cycle; - unsigned long long avg_delay_per_cycle; - unsigned long long avg_cpu_per_cycle; - unsigned long interactive_bonus, throughput_bonus; - unsigned long long cycle_count, total_sleep, total_cpu, total_delay; - unsigned long long sleepiness, cpu_usage_rate; - unsigned int pre_bonus_priority; - unsigned int eb_shares; - unsigned long long intr_wake_ups; - unsigned long long cpu_rate_cap; - unsigned long long cpu_rate_hard_cap; - unsigned long long total_sinbin; - struct timer_list sinbin_timer; + int activated; unsigned long policy; cpumask_t cpus_allowed; - unsigned int time_slice; + 
unsigned int time_slice, first_time_slice; #ifdef CONFIG_SCHEDSTATS struct sched_info sched_info; @@ -826,50 +790,8 @@ do { if (atomic_dec_and_test(&(tsk)->usa #define PF_SWAPOFF 0x00080000 /* I am in swapoff */ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ #define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */ -#define PF_UISLEEP 0x00400000 /* Uninterruptible sleep */ -#define PF_SINBINNED 0x00800000 /* I am sinbinned */ -#define PF_UNPRIV_RT 0x01000000 /* I wanted to be RT but had insufficient privilege*/ #define PF_BORROWED_MM 0x02000000 /* I am a kthread doing use_mm */ -/* - * Scheduling statistics for a task/thread - */ -struct task_sched_stats { - unsigned long long timestamp; - unsigned long long cycle_count; - unsigned long long total_sleep; - unsigned long long total_cpu; - unsigned long long total_delay; - unsigned long long total_sinbin; - unsigned long long intr_wake_ups; -}; - -/* - * Get "up to date" scheduling statistics for the given task - * This function should be used if reliable scheduling statistitcs are required - * outside the scheduler itself as the relevant fields in the task structure - * are not "up to date" NB the possible difference between those in the task - * structure and the correct values could be quite large for sleeping tasks. 
- */ -extern void get_task_sched_stats(const struct task_struct *tsk, struct task_sched_stats *stats); - -/* - * Scheduling statistics for a CPU - */ -struct cpu_sched_stats { - unsigned long long timestamp; - unsigned long long total_idle; - unsigned long long total_busy; - unsigned long long total_delay; - unsigned long long total_sinbin; - unsigned long long nr_switches; -}; - -/* - * Get scheduling statistics for the nominated CPU - */ -extern void get_cpu_sched_stats(unsigned int cpu, struct cpu_sched_stats *stats); - #ifdef CONFIG_SMP extern int set_cpus_allowed(task_t *p, cpumask_t new_mask); #else @@ -1252,7 +1174,10 @@ static inline unsigned int task_cpu(cons return p->thread_info->cpu; } -void set_task_cpu(struct task_struct *p, unsigned int cpu); +static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + p->thread_info->cpu = cpu; +} #else Index: linux-2.6.9-rc2-mm2/include/linux/sysctl.h =================================================================== --- linux-2.6.9-rc2-mm2.orig/include/linux/sysctl.h 2004-09-23 09:59:23.566804808 +1000 +++ linux-2.6.9-rc2-mm2/include/linux/sysctl.h 2004-09-23 10:00:41.389973888 +1000 @@ -134,7 +134,6 @@ enum KERN_SPARC_SCONS_PWROFF=64, /* int: serial console power-off halt */ KERN_HZ_TIMER=65, /* int: hz timer on or off */ KERN_UNKNOWN_NMI_PANIC=66, /* int: unknown nmi panic flag */ - KERN_CPU_SCHED=67, /* CPU scheduler stuff */ }; Index: linux-2.6.9-rc2-mm2/kernel/sched.c =================================================================== --- linux-2.6.9-rc2-mm2.orig/kernel/sched.c 2004-09-23 09:59:23.607798576 +1000 +++ linux-2.6.9-rc2-mm2/kernel/sched.c 2004-09-23 10:00:41.396972824 +1000 @@ -16,12 +16,6 @@ * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. 
* 2004-04-02 Scheduler domains code by Nick Piggin - * 2004-06-03 Single priority array, simplified interactive bonus - * mechanism, throughput bonus mechanism, hard and soft - * caps and entitlement based mode by Peter Williams - * (Courtesy of Aurema Pty Ltd, www.aurema.com) - * 2004-08-19 Unprivileged RT mode tasks (based on Con Kolivas's - * SCHED_FIFO scheduler class) by Peter Williams */ #include @@ -52,25 +46,14 @@ #include #include #include -#include - #include #include -enum sched_mode_enum { - SCHED_MODE_PRIORITY_BASED, - SCHED_MODE_ENTITLEMENT_BASED -}; - -static enum sched_mode_enum sched_mode = SCHED_MODE_PRIORITY_BASED; - -#ifdef CONFIG_SYSCTL -static const char *sched_mode_names[] = { - "pb", /* SCHED_MODE_PRIORITY_BASED */ - "eb", /* SCHED_MODE_ENTITLEMENT_BASED */ - NULL /* end of list marker */ -}; +#ifdef CONFIG_NUMA +#define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) +#else +#define cpu_to_node_mask(cpu) (cpu_online_map) #endif /* @@ -92,267 +75,128 @@ static const char *sched_mode_names[] = #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) /* - * These are the 'tuning knobs' of the scheduler: - * Making IDLE_PRIO bigger than 159 would require modification of bitmaps - */ -#define IDLE_PRIO 159 -#define BGND_PRIO (IDLE_PRIO - 1) -#define MIN_NORMAL_PRIO (MAX_RT_PRIO + 1) -#define MAX_TOTAL_BONUS (BGND_PRIO - MAX_PRIO - 1) -#define MAX_MAX_IA_BONUS ((MAX_TOTAL_BONUS + 1) / 2) -#define MAX_MAX_TPT_BONUS (MAX_TOTAL_BONUS - MAX_MAX_IA_BONUS) -#define DEFAULT_MAX_IA_BONUS ((MAX_MAX_IA_BONUS < 7) ? MAX_MAX_IA_BONUS : 7) -#define DEFAULT_MAX_TPT_BONUS ((DEFAULT_MAX_IA_BONUS - 2) ? : 1) -static unsigned int max_ia_bonus = DEFAULT_MAX_IA_BONUS; -static unsigned int initial_ia_bonus = 1; -static unsigned int max_tpt_bonus = DEFAULT_MAX_TPT_BONUS; - -/* - * Define some mini Kalman filter for estimating various averages, etc. 
- * To make it more efficient the denominator of the fixed point rational - * numbers used to store the averages and the response half life will - * be chosen so that the fixed point rational number reperesentation - * of (1 - alpha) * i (where i is an integer) will be i. - * Some of this is defined in linux/sched.h - */ - -/* - * Fixed denominator rational numbers for use by the CPU scheduler + * Some helpers for converting nanosecond timing to jiffy resolution */ -#define SCHED_AVG_OFFSET 4 -/* - * Get the rounded integer value of a scheduling statistic average field - * i.e. those fields whose names begin with avg_ - */ -#define SCHED_AVG_RND(x) \ - (((x) + (1 << (SCHED_AVG_OFFSET - 1))) >> (SCHED_AVG_OFFSET)) -#define SCHED_AVG_ALPHA ((1 << SCHED_AVG_OFFSET) - 1) -#define SCHED_AVG_ONE (1UL << SCHED_AVG_OFFSET) -#define SCHED_AVG_MUL(a, b) (((a) * (b)) >> SCHED_AVG_OFFSET) -#define SCHED_AVG_REAL(a) ((a) << SCHED_AVG_OFFSET) - -/* - * Convert nice to shares - * Proportional symmetry is aimed for: i.e. 
- * (nice_to_shares(0) / nice_to_shares(19)) == (nice_to_shares(-20) / nice_to_shares(0)) - * Make sure that this function is robust for variations of EB_SHARES_PER_NICE - */ -static inline unsigned int nice_to_shares(int nice) -{ - unsigned int result = DEFAULT_EB_SHARES; - - if (nice > 0) - result -= (nice * (20 * EB_SHARES_PER_NICE - 1)) / 19; - else if (nice < 0) - result += (nice * nice * ((20 * EB_SHARES_PER_NICE - 1) * EB_SHARES_PER_NICE)) / 20; - - return result; -} - -static inline int shares_to_nice(unsigned int shares) -{ - int result = 0; - - if (shares > DEFAULT_EB_SHARES) - result = -int_sqrt((20 * (shares - DEFAULT_EB_SHARES)) / - (EB_SHARES_PER_NICE * (20 * EB_SHARES_PER_NICE - 1))); - else if (shares < DEFAULT_EB_SHARES) - result = (19 * (DEFAULT_EB_SHARES - shares)) / - (20 * EB_SHARES_PER_NICE - 1); - - return result; -} +#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) +#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) -#define SCHED_IA_BONUS_OFFSET 8 -#define SCHED_IA_BONUS_ALPHA ((1 << SCHED_IA_BONUS_OFFSET) - 1) -#define SCHED_IA_BONUS_MUL(a, b) (((a) * (b)) >> SCHED_IA_BONUS_OFFSET) /* - * Get the rounded integer value of the interactive bonus - */ -#define SCHED_IA_BONUS_RND(x) \ - (((x) + (1 << (SCHED_IA_BONUS_OFFSET - 1))) >> (SCHED_IA_BONUS_OFFSET)) - -static inline void apply_sched_avg_decay(unsigned long long *valp) -{ - *valp *= SCHED_AVG_ALPHA; - *valp >>= SCHED_AVG_OFFSET; -} - -static inline unsigned long long sched_div_64(unsigned long long a, unsigned long long b) -{ -#if BITS_PER_LONG < 64 - /* - * Assume that there's no 64 bit divide available - */ - if (a < b) - return 0; - /* - * Scale down until b less than 32 bits so that we can do - * a divide using do_div() - */ - while (b > ULONG_MAX) { a >>= 1; b >>= 1; } - - (void)do_div(a, (unsigned long)b); + * These are the 'tuning knobs' of the scheduler: + * + * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), + * default timeslice is 100 msecs, 
maximum timeslice is 800 msecs. + * Timeslices get refilled after they expire. + */ +#define MIN_TIMESLICE max(5 * HZ / 1000, 1) +#define DEF_TIMESLICE (100 * HZ / 1000) +#define ON_RUNQUEUE_WEIGHT 30 +#define CHILD_PENALTY 95 +#define PARENT_PENALTY 100 +#define EXIT_WEIGHT 3 +#define PRIO_BONUS_RATIO 25 +#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) +#define INTERACTIVE_DELTA 2 +#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) +#define STARVATION_LIMIT (MAX_SLEEP_AVG) +#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) +#define CREDIT_LIMIT 100 + +/* + * If a task is 'interactive' then we reinsert it in the active + * array after it has expired its current timeslice. (it will not + * continue to run immediately, it will still roundrobin with + * other interactive tasks.) + * + * This part scales the interactivity limit depending on niceness. + * + * We scale it linearly, offset by the INTERACTIVE_DELTA delta. + * Here are a few examples of different nice levels: + * + * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] + * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] + * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] + * + * (the X axis represents the possible -5 ... 0 ... +5 dynamic + * priority range a task can explore, a value of '1' means the + * task is rated interactive.) + * + * Ie. nice +19 tasks can never get 'interactive' enough to be + * reinserted into the active array. And only heavily CPU-hog nice -20 + * tasks will be expired. Default nice 0 tasks are somewhere between, + * it takes some effort for them to get interactive, but it's not + * too hard. + */ + +#define CURRENT_BONUS(p) \ + (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ + MAX_SLEEP_AVG) - return a; +#ifdef CONFIG_SMP +#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ + (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? 
: 1) - 1)) * \ + num_online_cpus()) #else - return a / b; +#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ + (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) #endif -} -#define PROPORTION_OFFSET 24 -#if PROPORTION_OFFSET > 32 -#error "PROPORTION_OFFSET must be less than or equal to 32" -#endif -#define PROPORTION_OVERFLOW ((1ULL << (64 - PROPORTION_OFFSET)) - 1) -#define PROP_FM_PPT(a) (((unsigned long long)(a) * PROPORTION_ONE) / 1000) -unsigned long long ppt_to_proportion(unsigned long long ppt) -{ - return sched_div_64(ppt * PROPORTION_ONE, 1000); -} -/* - * Convert a / b to a proportion in the range 0 to PROPORTION_ONE - * Requires a <= b or may get a divide by zero exception - */ -static inline unsigned long long calc_proportion(unsigned long long a, unsigned long long b) -{ - if (unlikely(a == b)) - return PROPORTION_ONE; +#define SCALE(v1,v1_max,v2_max) \ + (v1) * (v2_max) / (v1_max) - while (a > PROPORTION_OVERFLOW) { a >>= 1; b >>= 1; } +#define DELTA(p) \ + (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) - return sched_div_64(a << PROPORTION_OFFSET, b); -} +#define TASK_INTERACTIVE(p) \ + ((p)->prio <= (p)->static_prio - DELTA(p)) -/* - * Map the given proportion to an unsigned long long in the specified range - * Requires range < PROPORTION_ONE to avoid overflow - */ -static inline unsigned long long map_proportion(unsigned long long prop, unsigned long long range) -{ - return (prop * range) >> PROPORTION_OFFSET; -} - -static inline unsigned long long map_proportion_rnd(unsigned long long prop, unsigned long long range) -{ - return map_proportion((prop >> 1), (range * 2 + 1)); -} - -/* - * Find the square root of a proportion - * Require: x <= PROPORTION_ONE - */ -static unsigned long long proportion_sqrt(unsigned long long x) -{ - unsigned long long res, b; - int bshift; +#define INTERACTIVE_SLEEP(p) \ + (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ + (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) - /* - * Take shortcut AND prevent overflow - */ 
- if (x == PROPORTION_ONE) - return PROPORTION_ONE; +#define HIGH_CREDIT(p) \ + ((p)->interactive_credit > CREDIT_LIMIT) - res = 0; - b = (1UL << (PROPORTION_OFFSET - 1)); - bshift = PROPORTION_OFFSET - 1; - x <<= PROPORTION_OFFSET; +#define LOW_CREDIT(p) \ + ((p)->interactive_credit < -CREDIT_LIMIT) - for (; x && b; b >>= 1, bshift--) { - unsigned long long temp = (((res << 1) + b) << bshift); - - if (x >= temp) { - res += b; - x -= temp; - } - } - - return res; -} - -/* - * Tasks that have a CPU usage rate greater than this threshold (in parts per - * thousand) are considered to be CPU bound and start to lose interactive bonus - * points - */ -#define DEFAULT_CPU_HOG_THRESHOLD 900 -static unsigned int cpu_hog_threshold_ppt = DEFAULT_CPU_HOG_THRESHOLD; -static unsigned long long cpu_hog_threshold = PROP_FM_PPT(DEFAULT_CPU_HOG_THRESHOLD); - -/* - * Tasks that would sleep for more than 900 parts per thousand of the time if - * they had the CPU to themselves are considered to be interactive provided - * that their average sleep duration per scheduling cycle isn't too long - */ -#define DEFAULT_IA_THRESHOLD 900 -static unsigned int ia_threshold_ppt = DEFAULT_IA_THRESHOLD; -static unsigned long long ia_threshold = PROP_FM_PPT(DEFAULT_IA_THRESHOLD); -#define LOWER_MAX_IA_SLEEP SCHED_AVG_REAL(15 * 60LL * NSEC_PER_SEC) -#define UPPER_MAX_IA_SLEEP SCHED_AVG_REAL(2 * 60 * 60LL * NSEC_PER_SEC) +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio) /* - * UNPRIV_RT tasks that have a CPU usage rate less than this threshold - * (in parts per thousand) are treated as psuedo RT tasks - */ -#define DEFAULT_UNPRIV_RT_THRESHOLD 10 -static unsigned int unpriv_rt_threshold_ppt = DEFAULT_UNPRIV_RT_THRESHOLD; -static unsigned long long unpriv_rt_threshold = PROP_FM_PPT(DEFAULT_UNPRIV_RT_THRESHOLD); - -/* - * What "base time slice" for nice 0 and "average time slice" evaluated to + * task_timeslice() scales user-nice values [ -20 ... 0 ... 
19 ] + * to time slice values: [800ms ... 100ms ... 5ms] + * + * The higher a thread's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority thread gets MIN_TIMESLICE worth of execution time. */ -#define MSECS_TO_JIFFIES(x) (((x) * (HZ * 2 + 1)) / 2000) -#define MSECS_TO_JIFFIES_MIN_1(x) (MSECS_TO_JIFFIES(x) ? MSECS_TO_JIFFIES(x) : 1) -#define DEFAULT_TIME_SLICE_MSECS 100 -#define MAX_TIME_SLICE_MSECS 1000 -#define DEFAULT_TIME_SLICE_TICKS MSECS_TO_JIFFIES_MIN_1(DEFAULT_TIME_SLICE_MSECS) -static unsigned int time_slice_ticks = DEFAULT_TIME_SLICE_TICKS; -static unsigned int sched_rr_time_slice_ticks = DEFAULT_TIME_SLICE_TICKS; -static unsigned int bgnd_time_slice_multiplier = 1; +#define SCALE_PRIO(x, prio) \ + max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) -static inline int is_bgnd_task(const task_t *p) +static unsigned int task_timeslice(task_t *p) { - return p->cpu_rate_cap == 0; -} - -static inline unsigned int task_timeslice(const task_t *p) -{ - if (unlikely(p->policy == SCHED_RR)) - return sched_rr_time_slice_ticks; - - if (unlikely(is_bgnd_task(p) && !(p->flags & PF_UISLEEP))) - return time_slice_ticks * bgnd_time_slice_multiplier; - - return time_slice_ticks; + if (p->static_prio < NICE_TO_PRIO(0)) + return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); + else + return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); } - -#define task_hot(p, sd) ((p)->rq->timestamp_last_tick - (p)->timestamp < (sd)->cache_hot_time) +#define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time) /* * These are the runqueue data structures: */ -#define NUM_PRIO_SLOTS (IDLE_PRIO + 1) -/* - * Is the run queue idle? 
- */ -#define RUNQUEUE_IDLE(rq) ((rq)->curr == (rq)->idle) +#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) -/* - * Control values for niceness - */ -#define PROSPECTIVE_BASE_PROM_INTERVAL_MSECS ((DEFAULT_TIME_SLICE_MSECS * 110) / 100) -#if (PROSPECTIVE_BASE_PROM_INTERVAL_MSECS > 0) -#define BASE_PROM_INTERVAL_MSECS PROSPECTIVE_BASE_PROM_INTERVAL_MSECS -#else -#define BASE_PROM_INTERVAL_MSECS DEFAULT_TIME_SLICE_MSECS -#endif -static unsigned int base_prom_interval_ticks = MSECS_TO_JIFFIES_MIN_1(BASE_PROM_INTERVAL_MSECS); +typedef struct runqueue runqueue_t; -struct prio_slot { - unsigned int prio; - struct list_head queue; +struct prio_array { + unsigned int nr_active; + unsigned long bitmap[BITMAP_SIZE]; + struct list_head queue[MAX_PRIO]; }; /* @@ -373,23 +217,15 @@ struct runqueue { #ifdef CONFIG_SMP unsigned long cpu_load; #endif - unsigned long avg_nr_running; unsigned long long nr_switches; - unsigned long nr_uninterruptible; + unsigned long expired_timestamp, nr_uninterruptible; unsigned long long timestamp_last_tick; - unsigned long long total_delay; - unsigned long long total_sinbin; task_t *curr, *idle; struct mm_struct *prev_mm; - DECLARE_BITMAP(bitmap, NUM_PRIO_SLOTS); - struct prio_slot queues[NUM_PRIO_SLOTS]; - unsigned long next_prom_due; - unsigned long pcount; + prio_array_t *active, *expired, arrays[2]; + int best_expired_prio; atomic_t nr_iowait; - unsigned long long eb_yardstick; - unsigned long long eb_ticks_to_decay; - #ifdef CONFIG_SMP struct sched_domain *sd; @@ -451,133 +287,41 @@ static DEFINE_PER_CPU(struct runqueue, r #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() (&__get_cpu_var(runqueues)) +#define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -#define is_idle_task(p) ((p) == (p)->rq->idle) - -#ifdef CONFIG_SMP -void set_task_cpu(struct task_struct *p, unsigned int cpu) -{ - BUG_ON(!list_empty(&p->run_list)); - - p->thread_info->cpu = cpu; - p->rq = cpu_rq(cpu); -} - 
-/* - * "p"'s runqueue and "oldrq" are locked when this is called - */ -static inline void adjust_timestamp(task_t *p, const runqueue_t *oldrq) -{ - p->timestamp += (p->rq->timestamp_last_tick - oldrq->timestamp_last_tick); -} - -/* - * adjust_sched_timestamp() is always called with p's runqueue locked but sometimes - * "oldrq" isn't locked and isn't "this_rq()" (e.g. in try_to_wake_up()) - * leading to possible (very rare) problems on systems where 64 bit reads are - * not atomic. - * - * We'll handle this problem by reading their "timestamp_last_tick"s until we - * get two the same. - */ -static inline void adjust_sched_timestamp(task_t *p, const runqueue_t *oldrq) -{ - unsigned long long oldrq_tlt = oldrq->timestamp_last_tick; - - if (oldrq != this_rq()) - while (unlikely(oldrq_tlt != oldrq->timestamp_last_tick)) - oldrq_tlt = oldrq->timestamp_last_tick; - - p->sched_timestamp += p->rq->timestamp_last_tick - oldrq_tlt; -} - -/* - * for use when the task may be on another CPU (to compensate for drift) - * - * This is only ever called when "p"'s runqueue is locked. - * Even though "this_rq()" may not be locked this should be safe as - * "timestamp_last_tick" is only ever changed by tasks running on the same CPU - * and so it won't be being changed while we read it. 
- */ -static inline unsigned long long adjusted_sched_clock(const task_t *p) -{ - runqueue_t *trq = this_rq(); - - return sched_clock() + (p->rq->timestamp_last_tick - trq->timestamp_last_tick); -} - -#else -#define adjust_timestamp(p, oldrq) -#define adjust_sched_timestamp(p, oldrq) -#define adjusted_sched_clock(p) sched_clock() -#endif - /* * Default context-switch locking: */ #ifndef prepare_arch_switch # define prepare_arch_switch(rq, next) do { } while (0) # define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) -# define task_is_running(p) ((p)->rq->curr == (p)) -#else -# define task_is_running(p) task_running((p)->rq, p) +# define task_running(rq, p) ((rq)->curr == (p)) #endif -#define task_is_exiting(p) (unlikely(((p)->flags & PF_EXITING) != 0)) -#define task_is_sinbinned(p) (unlikely(((p)->flags & PF_SINBINNED) != 0)) -#define task_is_unpriv_rt(p) (unlikely(((p)->flags & PF_UNPRIV_RT) != 0)) - -static inline void restart_promotions(struct runqueue *rq) -{ - rq->next_prom_due = jiffies + base_prom_interval_ticks; - rq->pcount = 1; -} - -/* make it (relatively) easy to switch to using a timer */ -static inline void stop_promotions(struct runqueue *rq) -{ -} - -static inline void decay_eb_yardstick(runqueue_t *rq) -{ - static const unsigned long long decay_per_interval = PROP_FM_PPT(990); - unsigned long long pny; /* potential new yardstick */ - - rq->eb_yardstick = map_proportion(decay_per_interval, rq->eb_yardstick); - rq->eb_ticks_to_decay = time_slice_ticks; - if (unlikely(rt_task(rq->curr) || is_bgnd_task(rq->curr))) - return; - if (rq->curr->cpu_usage_rate < rq->curr->cpu_rate_cap) - pny = sched_div_64(rq->curr->cpu_usage_rate, rq->curr->eb_shares); - else - pny = sched_div_64(rq->curr->cpu_rate_cap, rq->curr->eb_shares); - if (pny > rq->eb_yardstick) - rq->eb_yardstick = pny; -} /* * task_rq_lock - lock the runqueue a given task resides on and disable * interrupts. 
Note the ordering: we can safely lookup the task_rq without * explicitly disabling preemption. */ -static spinlock_t *task_rq_lock(const task_t *p, unsigned long *flags) +static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) { - spinlock_t *rql; + struct runqueue *rq; repeat_lock_task: local_irq_save(*flags); - rql = &p->rq->lock; - spin_lock(rql); - if (unlikely(rql != &p->rq->lock)) { - spin_unlock_irqrestore(rql, *flags); + rq = task_rq(p); + spin_lock(&rq->lock); + if (unlikely(rq != task_rq(p))) { + spin_unlock_irqrestore(&rq->lock, *flags); goto repeat_lock_task; } - return rql; + return rq; } -static inline void task_rq_unlock(spinlock_t *rql, unsigned long *flags) +static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) { - spin_unlock_irqrestore(rql, *flags); + spin_unlock_irqrestore(&rq->lock, *flags); } #ifdef CONFIG_SCHEDSTATS @@ -682,7 +426,7 @@ struct file_operations proc_schedstat_op /* * rq_lock - lock a given runqueue and disable interrupts. */ -static spinlock_t *this_rq_lock(void) +static runqueue_t *this_rq_lock(void) { runqueue_t *rq; @@ -690,7 +434,12 @@ static spinlock_t *this_rq_lock(void) rq = this_rq(); spin_lock(&rq->lock); - return &rq->lock; + return rq; +} + +static inline void rq_unlock(runqueue_t *rq) +{ + spin_unlock_irq(&rq->lock); } #ifdef CONFIG_SCHEDSTATS @@ -722,6 +471,7 @@ static inline void sched_info_dequeued(t static inline void sched_info_arrive(task_t *t) { unsigned long now = jiffies, diff = 0; + struct runqueue *rq = task_rq(t); if (t->sched_info.last_queued) diff = now - t->sched_info.last_queued; @@ -730,11 +480,11 @@ static inline void sched_info_arrive(tas t->sched_info.last_arrival = now; t->sched_info.pcnt++; - if (!t->rq) + if (!rq) return; - t->rq->rq_sched_info.run_delay += diff; - t->rq->rq_sched_info.pcnt++; + rq->rq_sched_info.run_delay += diff; + rq->rq_sched_info.pcnt++; } /* @@ -764,12 +514,13 @@ static inline void sched_info_queued(tas */ static inline void 
sched_info_depart(task_t *t) { + struct runqueue *rq = task_rq(t); unsigned long diff = jiffies - t->sched_info.last_arrival; t->sched_info.cpu_time += diff; - if (t->rq) - t->rq->rq_sched_info.cpu_time += diff; + if (rq) + rq->rq_sched_info.cpu_time += diff; } /* @@ -779,7 +530,7 @@ static inline void sched_info_depart(tas */ static inline void sched_info_switch(task_t *prev, task_t *next) { - struct runqueue *rq = prev->rq; + struct runqueue *rq = task_rq(prev); /* * prev now departs the cpu. It's not interesting to record @@ -797,43 +548,24 @@ static inline void sched_info_switch(tas #define sched_info_switch(t, next) do { } while (0) #endif /* CONFIG_SCHEDSTATS */ -static inline int task_preempts_curr(const struct task_struct *p) -{ - return (p->prio < p->rq->curr->prio) && !task_is_exiting(p->rq->curr); -} - -static inline int task_queued(const task_t *task) -{ - return !list_empty(&task->run_list); -} - /* - * Adding/removing a task to/from a runqueue: + * Adding/removing a task to/from a priority array: */ -static void dequeue_task(struct task_struct *p) +static void dequeue_task(struct task_struct *p, prio_array_t *array) { - /* - * If p is the last task in this priority slot then slotp will be - * a pointer to the head of the list in the sunqueue structure - * NB we can't use p->prio for bitmap as task may have been - * promoted - */ - struct list_head *slotp = p->run_list.next; - - /* - * Initialize after removal from the list so that list_empty() works - * as a means for testing whether the task is runnable - */ - list_del_init(&p->run_list); - if (list_empty(slotp)) - __clear_bit(list_entry(slotp, struct prio_slot, queue)->prio, p->rq->bitmap); + array->nr_active--; + list_del(&p->run_list); + if (list_empty(array->queue + p->prio)) + __clear_bit(p->prio, array->bitmap); } -static void enqueue_task(struct task_struct *p) +static void enqueue_task(struct task_struct *p, prio_array_t *array) { sched_info_queued(p); - list_add_tail(&p->run_list, 
&p->rq->queues[p->prio].queue); - __set_bit(p->prio, p->rq->bitmap); + list_add_tail(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->array = array; } /* @@ -841,447 +573,196 @@ static void enqueue_task(struct task_str * remote queue so we want these tasks to show up at the head of the * local queue: */ -static inline void enqueue_task_head(struct task_struct *p) +static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) { - list_add(&p->run_list, &p->rq->queues[p->prio].queue); - __set_bit(p->prio, p->rq->bitmap); + list_add(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->array = array; } /* * effective_prio - return the priority that is based on the static * priority but is modified by bonuses/penalties. + * + * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] + * into the -5 ... 0 ... +5 bonus/penalty range. + * + * We use 25% of the full 0...39 priority range so that: + * + * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. + * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. + * + * Both properties are important to certain workloads. 
*/ -static inline int effective_prio(const task_t *p) +static int effective_prio(task_t *p) { - unsigned int bonus_factor = 0; + int bonus, prio; if (rt_task(p)) return p->prio; - if (unlikely(is_bgnd_task(p) && !(p->flags & PF_UISLEEP))) - return BGND_PRIO; - - if (task_is_unpriv_rt(p) && (p->cpu_usage_rate < unpriv_rt_threshold)) - return MAX_RT_PRIO; - - /* - * kernel threads get maximum bonuses and tasks that are - * over their cap get no bonuses - */ - if (p->mm == NULL) - bonus_factor = MAX_TOTAL_BONUS; - else if (p->cpu_usage_rate < p->cpu_rate_cap) { - bonus_factor = SCHED_IA_BONUS_RND(p->interactive_bonus); - bonus_factor += p->throughput_bonus; - } + bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; - return p->pre_bonus_priority - bonus_factor; + prio = p->static_prio - bonus; + if (prio < MAX_RT_PRIO) + prio = MAX_RT_PRIO; + if (prio > MAX_PRIO-1) + prio = MAX_PRIO-1; + return prio; } /* * __activate_task - move a task to the runqueue. */ -static inline void __activate_task(task_t *p) +static inline void __activate_task(task_t *p, runqueue_t *rq) { - enqueue_task(p); - p->rq->nr_running++; - if (p->rq->nr_running == 2) - restart_promotions(p->rq); + enqueue_task(p, rq->active); + rq->nr_running++; } /* - * activate task on the _front_ of runqueue. + * __activate_idle_task - move idle task to the _front_ of runqueue. */ -static inline void __activate_task_head(task_t *p) +static inline void __activate_idle_task(task_t *p, runqueue_t *rq) { - enqueue_task_head(p); - p->rq->nr_running++; - if (p->rq->nr_running == 2) - restart_promotions(p->rq); + enqueue_task_head(p, rq->active); + rq->nr_running++; } -/* - * Calculate CPU usage rate and sleepiness. 
- * This never gets called on real time tasks - */ -static void decay_avgs_and_calculate_rates(task_t *p) -{ - unsigned long long bl; - - apply_sched_avg_decay(&p->avg_sleep_per_cycle); - apply_sched_avg_decay(&p->avg_delay_per_cycle); - apply_sched_avg_decay(&p->avg_cpu_per_cycle); - bl = p->avg_sleep_per_cycle + p->avg_cpu_per_cycle; - /* - * Take a shortcut and avoid possible divide by zero later - */ - if (unlikely(bl == 0)) { - p->sleepiness = PROPORTION_ONE; - p->cpu_usage_rate = 0; - } else { - p->sleepiness = calc_proportion(p->avg_sleep_per_cycle, bl); - bl += p->avg_delay_per_cycle; - p->cpu_usage_rate = calc_proportion(p->avg_cpu_per_cycle, bl); - } -} - -/* - * Calculate priority based priority (without bonuses). - * This never gets called on real time tasks - */ -static inline void calculate_pb_pre_bonus_priority(task_t *p) -{ - if (unlikely(p->cpu_usage_rate > p->cpu_rate_cap)) { - p->pre_bonus_priority = BGND_PRIO - 1; - if (p->cpu_rate_cap != 0) { - unsigned long long prop = PROPORTION_ONE; - - prop -= calc_proportion(p->cpu_rate_cap, p->cpu_usage_rate); - p->pre_bonus_priority -= map_proportion(prop, MAX_PRIO - p->static_prio); - } - } else - p->pre_bonus_priority = p->static_prio + MAX_TOTAL_BONUS; -} - -/* - * Calculate entitlement based priority (without bonuses). 
- * This never gets called on real time tasks - */ -#define EB_PAR ((MAX_PRIO - MAX_RT_PRIO - 1) / 2) -static void calculate_eb_pre_bonus_priority(task_t *p) +static void recalc_task_prio(task_t *p, unsigned long long now) { - /* - * Prevent possible divide by zero and take shortcut - */ - if (unlikely(p->cpu_rate_cap == 0)) { - p->pre_bonus_priority = BGND_PRIO - 1; - } else if (p->cpu_usage_rate > p->cpu_rate_cap) { - unsigned long long cap_per_share = sched_div_64(p->cpu_rate_cap, p->eb_shares); - unsigned long long prop = calc_proportion(p->cpu_rate_cap, p->cpu_usage_rate); - - p->pre_bonus_priority = (BGND_PRIO - 1); - p->pre_bonus_priority -= map_proportion_rnd(prop, EB_PAR + 1); - if (cap_per_share > p->rq->eb_yardstick) - p->rq->eb_yardstick = cap_per_share; - } else { - unsigned long long usage_per_share = sched_div_64(p->cpu_usage_rate, p->eb_shares); - - if (usage_per_share > p->rq->eb_yardstick) { - p->rq->eb_yardstick = usage_per_share; - p->pre_bonus_priority = MAX_RT_PRIO + MAX_TOTAL_BONUS + EB_PAR; - } else { - unsigned long long prop; + unsigned long long __sleep_time = now - p->timestamp; + unsigned long sleep_time; - prop = calc_proportion(usage_per_share, p->rq->eb_yardstick); - p->pre_bonus_priority = MAX_RT_PRIO + MAX_TOTAL_BONUS; - p->pre_bonus_priority += map_proportion_rnd(prop, EB_PAR); - } - } -} - -static inline void calculate_pre_bonus_priority(task_t *p) -{ - if (sched_mode == SCHED_MODE_ENTITLEMENT_BASED) - calculate_eb_pre_bonus_priority(p); + if (__sleep_time > NS_MAX_SLEEP_AVG) + sleep_time = NS_MAX_SLEEP_AVG; else - calculate_pb_pre_bonus_priority(p); -} - -/* - * Initialize the scheduling statistics counters - */ -static inline void initialize_stats(task_t *p) -{ - p->avg_sleep_per_cycle = 0; - p->avg_delay_per_cycle = 0; - p->avg_cpu_per_cycle = 0; - p->total_sleep = 0; - p->total_delay = 0; - p->total_cpu = 0; - p->total_sinbin = 0; - p->cycle_count = 0; - p->intr_wake_ups = 0; - p->sched_timestamp = sched_clock(); -} - -/* - * 
sched_clock() is not necessarily monotonic and this can lead to negative - * values when very small time intervals are measured using successive calls - * to sched_clock(). The "delay" statistic is the most vulnerable to this BUT - * we'll take precautions for all interval measurements. Where a time interval - * would be negative we'll treat it as zero and NOT update the timestamp either - * as this would lead to the next interval measured being to big. - */ -static inline void delta_sleep_stats(task_t *p, unsigned long long now) -{ - unsigned long long delta; - - /* sched_clock() is not guaranteed monotonic */ - if (now <= p->sched_timestamp) { - p->sched_timestamp = now; - return; - } - - delta = now - p->sched_timestamp; - p->sched_timestamp = now; - p->avg_sleep_per_cycle += delta; - p->total_sleep += delta; -} + sleep_time = (unsigned long)__sleep_time; -static inline void delta_cpu_stats(task_t *p, unsigned long long now) -{ - unsigned long long delta; + if (likely(sleep_time > 0)) { + /* + * User tasks that sleep a long time are categorised as + * idle and will get just interactive status to stay active & + * prevent them suddenly becoming cpu hogs and starving + * other processes. + */ + if (p->mm && p->activated != -1 && + sleep_time > INTERACTIVE_SLEEP(p)) { + p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - + DEF_TIMESLICE); + if (!HIGH_CREDIT(p)) + p->interactive_credit++; + } else { + /* + * The lower the sleep avg a task has the more + * rapidly it will rise with sleep time. + */ + sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; - /* sched_clock() is not guaranteed monotonic */ - if (now <= p->sched_timestamp) { - p->sched_timestamp = now; - return; - } + /* + * Tasks with low interactive_credit are limited to + * one timeslice worth of sleep avg bonus. 
+ */ + if (LOW_CREDIT(p) && + sleep_time > JIFFIES_TO_NS(task_timeslice(p))) + sleep_time = JIFFIES_TO_NS(task_timeslice(p)); - delta = now - p->sched_timestamp; - p->sched_timestamp = now; - p->avg_cpu_per_cycle += delta; - p->total_cpu += delta; -} + /* + * Non high_credit tasks waking from uninterruptible + * sleep are limited in their sleep_avg rise as they + * are likely to be cpu hogs waiting on I/O + */ + if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm) { + if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) + sleep_time = 0; + else if (p->sleep_avg + sleep_time >= + INTERACTIVE_SLEEP(p)) { + p->sleep_avg = INTERACTIVE_SLEEP(p); + sleep_time = 0; + } + } -static inline void delta_delay_stats(task_t *p, unsigned long long now) -{ - unsigned long long delta; + /* + * This code gives a bonus to interactive tasks. + * + * The boost works by updating the 'average sleep time' + * value here, based on ->timestamp. The more time a + * task spends sleeping, the higher the average gets - + * and the higher the priority boost gets as well. + */ + p->sleep_avg += sleep_time; - /* sched_clock() is not guaranteed monotonic */ - if (now <= p->sched_timestamp) { - p->sched_timestamp = now; - return; + if (p->sleep_avg > NS_MAX_SLEEP_AVG) { + p->sleep_avg = NS_MAX_SLEEP_AVG; + if (!HIGH_CREDIT(p)) + p->interactive_credit++; + } + } } - delta = now - p->sched_timestamp; - p->sched_timestamp = now; - p->avg_delay_per_cycle += delta; - p->total_delay += delta; - p->rq->total_delay += delta; - if (task_is_sinbinned(p)) { - p->total_sinbin += delta; - p->rq->total_sinbin += delta; - } + p->prio = effective_prio(p); } /* - * Update various statistics for the end of a - * ((on_run_queue :-> on_cpu)* :-> sleep) cycle. - * We can't just do this in activate_task() as every invocation of that - * function is not the genuine end of a cycle. + * activate_task - move a task to the runqueue and do priority recalculation + * + * Update all the scheduling statistics stuff. 
(sleep average + * calculation, priority modifiers, etc.) */ -static void update_stats_for_cycle(task_t *p) +static void activate_task(task_t *p, runqueue_t *rq, int local) { - unsigned long long now = adjusted_sched_clock(p); - - delta_sleep_stats(p, now); - if (in_interrupt()) - p->intr_wake_ups++; - p->cycle_count++; - if (!rt_task(p)) - decay_avgs_and_calculate_rates(p); -} - -static inline void decay_sched_ia_bonus(struct task_struct *p) -{ - p->interactive_bonus *= SCHED_IA_BONUS_ALPHA; - p->interactive_bonus >>= SCHED_IA_BONUS_OFFSET; -} + unsigned long long now; -/* - * Check whether a task with an interactive bonus still qualifies and if not - * decrease its bonus - * This never gets called on real time tasks - */ -static void reassess_cpu_boundness(task_t *p) -{ - if (max_ia_bonus == 0) { - p->interactive_bonus = 0; - return; + now = sched_clock(); +#ifdef CONFIG_SMP + if (!local) { + /* Compensate for drifting sched_clock */ + runqueue_t *this_rq = this_rq(); + now = (now - this_rq->timestamp_last_tick) + + rq->timestamp_last_tick; } - /* - * No point going any further if there's no bonus to lose - */ - if (p->interactive_bonus == 0) - return; +#endif - if (p->cpu_usage_rate > cpu_hog_threshold) - decay_sched_ia_bonus(p); -} + recalc_task_prio(p, now); -/* - * Check whether a task qualifies for an interactive bonus and if it does - * increase its bonus - * This never gets called on real time tasks - */ -static void reassess_interactiveness(task_t *p) -{ - if (max_ia_bonus == 0) { - p->interactive_bonus = 0; - return; - } /* - * No sleep means not interactive (in most cases), but + * This checks to make sure it's not an uninterruptible task + * that is now waking up. */ - if (unlikely(p->avg_sleep_per_cycle > LOWER_MAX_IA_SLEEP)) { + if (!p->activated) { /* - * Really long sleeps mean it's probably not interactive + * Tasks which were woken up by interrupts (ie. hw events) + * are most likely of interactive nature. 
So we give them + * the credit of extending their sleep time to the period + * of time they spend on the runqueue, waiting for execution + * on a CPU, first time around: */ - if (unlikely(p->avg_sleep_per_cycle > UPPER_MAX_IA_SLEEP)) - decay_sched_ia_bonus(p); - return; - } - if (p->sleepiness > ia_threshold) { - decay_sched_ia_bonus(p); - p->interactive_bonus += map_proportion_rnd(p->sleepiness, max_ia_bonus); - } -} - -/* - * Check whether a task qualifies for a throughput bonus and if it does - * give it one - * This never gets called on real time tasks - */ -static void recalc_throughput_bonus(task_t *p) -{ - unsigned long long ratio; - unsigned long long expected_delay; - unsigned long long adjusted_delay; - unsigned long long load = p->rq->avg_nr_running; - - p->throughput_bonus = 0; - if (max_tpt_bonus == 0) - return; - - if (load <= SCHED_AVG_ONE) - expected_delay = 0; - else - expected_delay = SCHED_AVG_MUL(p->avg_cpu_per_cycle, (load - SCHED_AVG_ONE)); - - /* - * No unexpected delay means no bonus, but - * NB this test also avoids a possible divide by zero error if - * cpu is also zero and negative bonuses - */ - if (p->avg_delay_per_cycle <= expected_delay) - return; - - adjusted_delay = p->avg_delay_per_cycle - expected_delay; - ratio = calc_proportion(adjusted_delay, adjusted_delay + p->avg_cpu_per_cycle); - ratio = proportion_sqrt(ratio); - p->throughput_bonus = map_proportion_rnd(ratio, max_tpt_bonus); -} - -static void recalc_task_prio(task_t *p, unsigned long long now) -{ - /* - * Throughput bonus is dependent on how busy the CPU is so do it here to - * catch any CPU changes - * Interactive bonus is updated in the wake up function. 
- */ - if (!rt_task(p)) { - recalc_throughput_bonus(p); - calculate_pre_bonus_priority(p); + if (in_interrupt()) + p->activated = 2; + else { + /* + * Normal first-time wakeups get a credit too for + * on-runqueue time, but it will be weighted down: + */ + p->activated = 1; + } } - p->prio = effective_prio(p); -} - -/* - * activate_task - move a task to the runqueue and do priority recalculation - */ -static void activate_task(task_t *p) -{ - /* Compensate for drifting sched_clock */ - unsigned long long now = adjusted_sched_clock(p); - - recalc_task_prio(p, now); p->timestamp = now; - p->time_slice = task_timeslice(p); - p->flags &= ~PF_UISLEEP; - __activate_task(p); + __activate_task(p, rq); } /* * deactivate_task - remove a task from the runqueue. */ -static void deactivate_task(struct task_struct *p) +static void deactivate_task(struct task_struct *p, runqueue_t *rq) { - p->rq->nr_running--; - if (p->state == TASK_UNINTERRUPTIBLE) { - p->flags |= PF_UISLEEP; - p->rq->nr_uninterruptible++; - } - dequeue_task(p); - if (p->rq->nr_running == 1) - stop_promotions(p->rq); -} - -/* - * p->cpu_usage_rate must be greater than p->cpu_rate_hard_cap - */ -static inline unsigned long required_sinbin_period(const task_t *p) -{ - unsigned long long acpc_jiffies, bl, tl; - - if (p->cpu_rate_hard_cap == 0) - return ULONG_MAX; - - acpc_jiffies = sched_div_64(SCHED_AVG_RND(p->avg_cpu_per_cycle) * HZ, 1000000000); - /* - * we have to be careful about overflow and/or underflow - */ - bl = p->cpu_usage_rate * p->cpu_rate_hard_cap; - tl = acpc_jiffies * (p->cpu_usage_rate - p->cpu_rate_hard_cap); - while (tl > PROPORTION_OVERFLOW) { - tl >>= 1; - if (unlikely((bl >>= 1) == 0)) - return ULONG_MAX; - } - - return sched_div_64(tl << PROPORTION_OFFSET, bl); -} - -static inline int task_needs_sinbinning(const struct task_struct *p) -{ - return (p->cpu_usage_rate > p->cpu_rate_hard_cap) && - (p->state == TASK_RUNNING) && !rt_task(p) && !task_is_exiting(p); -} - -static inline void 
put_task_in_sinbin(struct task_struct *p) -{ - unsigned long long durn = required_sinbin_period(p); - - if (durn == 0) - return; - deactivate_task(p); - p->flags |= PF_SINBINNED; - p->sinbin_timer.expires = jiffies + durn; - add_timer(&p->sinbin_timer); -} - -/* - * Release a task from the sinbin - */ -void sinbin_release_fn(unsigned long arg) -{ - unsigned long flags; - struct task_struct *p = (struct task_struct*)arg; - spinlock_t *rql = task_rq_lock(p, &flags); - - /* - * Sinbin time is included in delay time - */ - delta_delay_stats(p, adjusted_sched_clock(p)); - p->flags &= ~PF_SINBINNED; - if (!rt_task(p)) { - calculate_pre_bonus_priority(p); - p->prio = effective_prio(p); - } - __activate_task(p); - - task_rq_unlock(rql, &flags); + rq->nr_running--; + if (p->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible++; + dequeue_task(p, p->array); + p->array = NULL; } /* @@ -1296,7 +777,7 @@ static void resched_task(task_t *p) { int need_resched, nrpolling; - BUG_ON(!spin_is_locked(&p->rq->lock)); + BUG_ON(!spin_is_locked(&task_rq(p)->lock)); /* minimise the chance of sending an interrupt to poll_idle() */ nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); @@ -1313,19 +794,13 @@ static inline void resched_task(task_t * } #endif -static inline void preempt_curr_if_warranted(struct task_struct *p) -{ - if (task_preempts_curr(p)) - resched_task(p->rq->curr); -} - /** * task_curr - is this task currently executing on a CPU? * @p: the task in question. */ inline int task_curr(const task_t *p) { - return task_is_running(p); + return cpu_curr(task_cpu(p)) == p; } #ifdef CONFIG_SMP @@ -1354,18 +829,14 @@ typedef struct { */ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) { + runqueue_t *rq = task_rq(p); + /* * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. 
*/ - if (!task_queued(p) && !task_is_running(p)) { - if (task_is_sinbinned(p)) - delta_delay_stats(p, adjusted_sched_clock(p)); - else - delta_sleep_stats(p, adjusted_sched_clock(p)); + if (!p->array && !task_running(rq, p)) { set_task_cpu(p, dest_cpu); - /* time stamp was set for old queue above so fix it */ - p->sched_timestamp = adjusted_sched_clock(p); return 0; } @@ -1373,7 +844,7 @@ static int migrate_task(task_t *p, int d req->type = REQ_MOVE_TASK; req->task = p; req->dest_cpu = dest_cpu; - list_add(&req->list, &p->rq->migration_queue); + list_add(&req->list, &rq->migration_queue); return 1; } @@ -1389,22 +860,22 @@ static int migrate_task(task_t *p, int d void wait_task_inactive(task_t * p) { unsigned long flags; - spinlock_t *rql; + runqueue_t *rq; int preempted; repeat: - rql = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &flags); /* Must be off runqueue entirely, not preempted. */ - if (unlikely(task_queued(p))) { + if (unlikely(p->array)) { /* If it's preempted, we yield. It could be a while. 
*/ - preempted = !task_is_running(p); - task_rq_unlock(rql, &flags); + preempted = !task_running(rq, p); + task_rq_unlock(rq, &flags); cpu_relax(); if (preempted) yield(); goto repeat; } - task_rq_unlock(rql, &flags); + task_rq_unlock(rq, &flags); } /*** @@ -1511,34 +982,27 @@ static int try_to_wake_up(task_t * p, un int cpu, this_cpu, success = 0; unsigned long flags; long old_state; - spinlock_t *rql; - runqueue_t *old_rq; + runqueue_t *rq; #ifdef CONFIG_SMP unsigned long load, this_load; struct sched_domain *sd; int new_cpu; #endif - rql = task_rq_lock(p, &flags); - old_rq = p->rq; - schedstat_inc(p->rq, ttwu_cnt); + rq = task_rq_lock(p, &flags); + schedstat_inc(rq, ttwu_cnt); old_state = p->state; if (!(old_state & state)) goto out; - if (task_queued(p)) + if (p->array) goto out_running; - /* - * This is the end of one scheduling cycle and the start of the next - */ - update_stats_for_cycle(p); - cpu = task_cpu(p); this_cpu = smp_processor_id(); #ifdef CONFIG_SMP - if (unlikely(task_is_running(p))) + if (unlikely(task_running(rq, p))) goto out_activate; new_cpu = cpu; @@ -1575,7 +1039,7 @@ static int try_to_wake_up(task_t * p, un imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2; if ((sd->flags & SD_WAKE_AFFINE) && - !task_hot(p, sd)) { + !task_hot(p, rq->timestamp_last_tick, sd)) { /* * This domain has SD_WAKE_AFFINE and p is cache cold * in this domain. @@ -1599,19 +1063,18 @@ static int try_to_wake_up(task_t * p, un new_cpu = cpu; /* Could not wake to this_cpu. 
Wake to cpu instead */ out_set_cpu: - schedstat_inc(p->rq, ttwu_attempts); + schedstat_inc(rq, ttwu_attempts); new_cpu = wake_idle(new_cpu, p); if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) { - schedstat_inc(p->rq, ttwu_moved); + schedstat_inc(rq, ttwu_moved); set_task_cpu(p, new_cpu); - task_rq_unlock(rql, &flags); + task_rq_unlock(rq, &flags); /* might preempt at this point */ - rql = task_rq_lock(p, &flags); - adjust_sched_timestamp(p, old_rq); + rq = task_rq_lock(p, &flags); old_state = p->state; if (!(old_state & state)) goto out; - if (task_queued(p)) + if (p->array) goto out_running; this_cpu = smp_processor_id(); @@ -1620,17 +1083,16 @@ out_set_cpu: out_activate: #endif /* CONFIG_SMP */ - if (old_state == TASK_UNINTERRUPTIBLE) - old_rq->nr_uninterruptible--; + if (old_state == TASK_UNINTERRUPTIBLE) { + rq->nr_uninterruptible--; + /* + * Tasks on involuntary sleep don't earn + * sleep_avg beyond just interactive state. + */ + p->activated = -1; + } /* - * Do this here rather than in activate_task() because activate() gets - * called at times when thes calculations are unnecessary e.g. for a - * change of CPU - */ - if (!rt_task(p)) - reassess_interactiveness(p); - /* * Sync wakeups (i.e. those types of wakeups where the waker * has indicated that it will leave the CPU in short order) * don't trigger a preemption, if the woken up task will run on @@ -1638,15 +1100,17 @@ out_activate: * the waker guarantees that the freshly woken up task is going * to be considered on this CPU.) 
*/ - activate_task(p); - if (!sync || cpu != this_cpu) - preempt_curr_if_warranted(p); + activate_task(p, rq, cpu == this_cpu); + if (!sync || cpu != this_cpu) { + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } success = 1; out_running: p->state = TASK_RUNNING; out: - task_rq_unlock(rql, &flags); + task_rq_unlock(rq, &flags); return success; } @@ -1665,21 +1129,11 @@ int fastcall wake_up_state(task_t *p, un } #ifdef CONFIG_SMP -static int find_idlest_cpu(const struct task_struct *p, int this_cpu, +static int find_idlest_cpu(struct task_struct *p, int this_cpu, struct sched_domain *sd); #endif /* - * Initialize the scheduling bonuses - */ -static inline void initialize_bonuses(task_t *p) -{ - p->interactive_bonus = (max_ia_bonus >= initial_ia_bonus) ? - initial_ia_bonus : max_ia_bonus; - p->throughput_bonus = 0; -} - -/* * Perform scheduler related setup for a newly forked process p. * p is forked by current. */ @@ -1693,9 +1147,8 @@ void fastcall sched_fork(task_t *p) */ p->state = TASK_RUNNING; INIT_LIST_HEAD(&p->run_list); + p->array = NULL; spin_lock_init(&p->switch_lock); - init_timer(&p->sinbin_timer); - p->sinbin_timer.data = (unsigned long) p; #ifdef CONFIG_SCHEDSTATS memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif @@ -1709,15 +1162,32 @@ void fastcall sched_fork(task_t *p) p->thread_info->preempt_count = 1; #endif /* - * Give the child a new timeslice + * Share the timeslice between parent and child, thus the + * total amount of pending timeslices in the system doesn't change, + * resulting in more scheduling fairness. */ - p->time_slice = task_timeslice(p); - p->timestamp = sched_clock(); + local_irq_disable(); + p->time_slice = (current->time_slice + 1) >> 1; /* - * Initialize the scheduling statistics and bonus counters + * The remainder of the first timeslice might be recovered by + * the parent if the child exits early enough. 
*/ - initialize_stats(p); - initialize_bonuses(p); + p->first_time_slice = 1; + current->time_slice >>= 1; + p->timestamp = sched_clock(); + if (unlikely(!current->time_slice)) { + /* + * This case is rare, it happens when the parent has only + * a single jiffy left from its timeslice. Taking the + * runqueue lock is not a problem. + */ + current->time_slice = 1; + preempt_disable(); + scheduler_tick(0, 0); + local_irq_enable(); + preempt_enable(); + } else + local_irq_enable(); } /* @@ -1731,15 +1201,27 @@ void fastcall wake_up_new_task(task_t * { unsigned long flags; int this_cpu, cpu; - spinlock_t *rql; + runqueue_t *rq, *this_rq; - rql = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &flags); cpu = task_cpu(p); this_cpu = smp_processor_id(); BUG_ON(p->state != TASK_RUNNING); - schedstat_inc(p->rq, wunt_cnt); + schedstat_inc(rq, wunt_cnt); + /* + * We decrease the sleep average of forking parents + * and children as well, to keep max-interactive tasks + * from forking tasks that are max-interactive. The parent + * (current) is done further down, under its lock. + */ + p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * + CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + + p->interactive_credit = 0; + + p->prio = effective_prio(p); if (likely(cpu == this_cpu)) { if (!(clone_flags & CLONE_VM)) { @@ -1747,61 +1229,82 @@ void fastcall wake_up_new_task(task_t * * The VM isn't cloned, so we're in a good position to * do child-runs-first in anticipation of an exec. This * usually avoids a lot of COW overhead. - * Now that the idle task is back on the run queue - * we need extra care to make sure that its one and - * only fork() doesn't end up in the idle priority slot. - * Just testing for empty run list is no longer adequate. 
*/ - if (unlikely(!task_queued(current) || RUNQUEUE_IDLE(current->rq))) { - p->prio = effective_prio(p); - __activate_task(p); - } else { - /* - * Put the child on the same list(s) as (but - * ahead of) the parent - */ + if (unlikely(!current->array)) + __activate_task(p, rq); + else { p->prio = current->prio; list_add_tail(&p->run_list, &current->run_list); - current->rq->nr_running++; + p->array = current->array; + p->array->nr_active++; + rq->nr_running++; } set_need_resched(); - } else { + } else /* Run child last */ - p->prio = effective_prio(p); - __activate_task(p); - } + __activate_task(p, rq); + /* + * We skip the following code due to cpu == this_cpu + * + * task_rq_unlock(rq, &flags); + * this_rq = task_rq_lock(current, &flags); + */ + this_rq = rq; } else { + this_rq = cpu_rq(this_cpu); + /* * Not the local CPU - must adjust timestamp. This should * get optimised away in the !CONFIG_SMP case. */ - adjust_timestamp(p, this_rq()); - adjust_sched_timestamp(p, this_rq()); - p->prio = effective_prio(p); - __activate_task(p); - preempt_curr_if_warranted(p); - schedstat_inc(p->rq, wunt_moved); + p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) + + rq->timestamp_last_tick; + __activate_task(p, rq); + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + + schedstat_inc(rq, wunt_moved); + /* + * Parent and child are on different CPUs, now get the + * parent runqueue to update the parent's ->sleep_avg: + */ + task_rq_unlock(rq, &flags); + this_rq = task_rq_lock(current, &flags); } - task_rq_unlock(rql, &flags); + current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * + PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + task_rq_unlock(this_rq, &flags); } -/** - * (Optionally) log scheduler statistics at exit. +/* + * Potentially available exiting-child timeslices are + * retrieved here - this way the parent does not get + * penalized for creating too many threads.
+ * + * (this cannot be used to 'generate' timeslices + * artificially, because any timeslice recovered here + * was given away by the parent in the first place.) */ -static int log_at_exit = 0; void fastcall sched_exit(task_t * p) { - struct task_sched_stats stats; - - if (!log_at_exit) - return; + unsigned long flags; + runqueue_t *rq; - get_task_sched_stats(p, &stats); - printk("SCHED_EXIT[%d] (%s) %llu %llu %llu %llu %llu %llu %lu %lu\n", - p->pid, p->comm, - stats.total_sleep, stats.total_cpu, stats.total_delay, - stats.total_sinbin, stats.cycle_count, stats.intr_wake_ups, - p->nvcsw, p->nivcsw); + /* + * If the child was a (relative-) CPU hog then decrease + * the sleep_avg of the parent as well. + */ + rq = task_rq_lock(p->parent, &flags); + if (p->first_time_slice) { + p->parent->time_slice += p->time_slice; + if (unlikely(p->parent->time_slice > task_timeslice(p))) + p->parent->time_slice = task_timeslice(p); + } + if (p->sleep_avg < p->parent->sleep_avg) + p->parent->sleep_avg = p->parent->sleep_avg / + (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / + (EXIT_WEIGHT + 1); + task_rq_unlock(rq, &flags); } /** @@ -1986,7 +1489,7 @@ static void double_lock_balance(runqueue /* * find_idlest_cpu - find the least busy runqueue. 
*/ -static int find_idlest_cpu(const struct task_struct *p, int this_cpu, +static int find_idlest_cpu(struct task_struct *p, int this_cpu, struct sched_domain *sd) { unsigned long load, min_load, this_load; @@ -2037,28 +1540,28 @@ static int find_idlest_cpu(const struct static void sched_migrate_task(task_t *p, int dest_cpu) { migration_req_t req; - spinlock_t *rql; + runqueue_t *rq; unsigned long flags; - rql = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &flags); if (!cpu_isset(dest_cpu, p->cpus_allowed) || unlikely(cpu_is_offline(dest_cpu))) goto out; - schedstat_inc(p->rq, smt_cnt); + schedstat_inc(rq, smt_cnt); /* force the process onto the specified CPU */ if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread (might exit: take ref). */ - struct task_struct *mt = p->rq->migration_thread; + struct task_struct *mt = rq->migration_thread; get_task_struct(mt); - task_rq_unlock(rql, &flags); + task_rq_unlock(rq, &flags); wake_up_process(mt); put_task_struct(mt); wait_for_completion(&req.done); return; } out: - task_rq_unlock(rql, &flags); + task_rq_unlock(rq, &flags); } /* @@ -2101,26 +1604,29 @@ out: * Both runqueues must be locked. */ static inline -void pull_task(task_t *p, int this_cpu) +void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, + runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) { - runqueue_t *src_rq = p->rq; - - dequeue_task(p); + dequeue_task(p, src_array); src_rq->nr_running--; - delta_delay_stats(p, adjusted_sched_clock(p)); set_task_cpu(p, this_cpu); - p->rq->nr_running++; - enqueue_task(p); - adjust_timestamp(p, src_rq); - adjust_sched_timestamp(p, src_rq); - preempt_curr_if_warranted(p); + this_rq->nr_running++; + enqueue_task(p, this_array); + p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + + this_rq->timestamp_last_tick; + /* + * Note that idle threads have a prio of MAX_PRIO, for this test + * to be always true for them. 
+ */ + if (TASK_PREEMPTS_CURR(p, this_rq)) + resched_task(this_rq->curr); } /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static inline -int can_migrate_task(const task_t *p, int this_cpu, +int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, struct sched_domain *sd, enum idle_type idle) { /* @@ -2129,7 +1635,7 @@ int can_migrate_task(const task_t *p, in * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. */ - if (task_is_running(p)) + if (task_running(rq, p)) return 0; if (!cpu_isset(this_cpu, p->cpus_allowed)) return 0; @@ -2137,7 +1643,7 @@ int can_migrate_task(const task_t *p, in /* Aggressive migration if we've failed balancing */ if (idle == NEWLY_IDLE || sd->nr_balance_failed < sd->cache_nice_tries) { - if (task_hot(p, sd)) + if (task_hot(p, rq->timestamp_last_tick, sd)) return 0; } @@ -2155,6 +1661,7 @@ static int move_tasks(runqueue_t *this_r unsigned long max_nr_move, struct sched_domain *sd, enum idle_type idle) { + prio_array_t *array, *dst_array; struct list_head *head, *curr; int idx, pulled = 0; task_t *tmp; @@ -2162,24 +1669,45 @@ static int move_tasks(runqueue_t *this_r if (max_nr_move <= 0 || busiest->nr_running <= 1) goto out; + /* + * We first consider expired tasks. Those will likely not be + * executed in the near future, and they are most likely to + * be cache-cold, thus switching CPUs has the least effect + * on them. 
+ */ + if (busiest->expired->nr_active) { + array = busiest->expired; + dst_array = this_rq->expired; + } else { + array = busiest->active; + dst_array = this_rq->active; + } + +new_array: /* Start searching at priority 0: */ idx = 0; skip_bitmap: if (!idx) - idx = sched_find_first_bit(busiest->bitmap); + idx = sched_find_first_bit(array->bitmap); else - idx = find_next_bit(busiest->bitmap, IDLE_PRIO, idx); - if (idx >= IDLE_PRIO) + idx = find_next_bit(array->bitmap, MAX_PRIO, idx); + if (idx >= MAX_PRIO) { + if (array == busiest->expired && busiest->active->nr_active) { + array = busiest->active; + dst_array = this_rq->active; + goto new_array; + } goto out; + } - head = &busiest->queues[idx].queue; + head = array->queue + idx; curr = head->prev; skip_queue: tmp = list_entry(curr, task_t, run_list); curr = curr->prev; - if (!can_migrate_task(tmp, this_cpu, sd, idle)) { + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { if (curr != head) goto skip_queue; idx++; @@ -2194,7 +1722,7 @@ skip_queue: schedstat_inc(this_rq, pt_gained[idle]); schedstat_inc(busiest, pt_lost[idle]); - pull_task(tmp, this_cpu); + pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); pulled++; /* We only want to steal up to the prescribed number of tasks. */ @@ -2349,7 +1877,7 @@ out_balanced: /* * find_busiest_queue - find the busiest runqueue among the cpus in group. 
*/ -static runqueue_t *find_busiest_queue(const struct sched_group *group) +static runqueue_t *find_busiest_queue(struct sched_group *group) { unsigned long load, max_load = 0; runqueue_t *busiest = NULL; @@ -2646,11 +2174,6 @@ static void rebalance_tick(int this_cpu, } } } - -static inline int needs_idle_balance(const runqueue_t *rq) -{ - return rq->nr_running == 0; -} #else /* * on UP we do not need to balance between CPUs: @@ -2661,10 +2184,6 @@ static inline void rebalance_tick(int cp static inline void idle_balance(int cpu, runqueue_t *rq) { } -static inline int needs_idle_balance(const runqueue_t *rq) -{ - return 0; -} #endif static inline int wake_priority_sleeper(runqueue_t *rq) @@ -2685,54 +2204,27 @@ static inline int wake_priority_sleeper( return ret; } -/* - * Are promotions due? - */ -static inline int promotions_due(const runqueue_t *rq) -{ - return unlikely(time_after_eq(jiffies, rq->next_prom_due)) && (rq->nr_running > 1); -} - -/* - * Assume runqueue lock is NOT already held. 
- * This is not executed when current task is SCHED_FIFO - */ -static void do_promotions(runqueue_t *rq) -{ - int idx = MIN_NORMAL_PRIO; - - spin_lock(&rq->lock); - rq->pcount++; - if (rq->nr_running < rq->pcount) { - rq->next_prom_due = jiffies + base_prom_interval_ticks; - goto out_unlock; - } - for (;;) { - int new_prio; - idx = find_next_bit(rq->bitmap, IDLE_PRIO, idx + 1); - /* don't promote background tasks */ - if (idx > (BGND_PRIO - 1)) - break; - - new_prio = idx - 1; - __list_splice(&rq->queues[idx].queue, rq->queues[new_prio].queue.prev); - INIT_LIST_HEAD(&rq->queues[idx].queue); - __clear_bit(idx, rq->bitmap); - __set_bit(new_prio, rq->bitmap); - } - /* The only prio field that might need updating is the current task's */ - if (likely((rq->curr->prio > MIN_NORMAL_PRIO) && (rq->curr->prio < BGND_PRIO))) - rq->curr->prio--; - restart_promotions(rq); -out_unlock: - spin_unlock(&rq->lock); -} - DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); /* + * We place interactive tasks back into the active array, if possible. + * + * To guarantee that this does not starve expired tasks we ignore the + * interactivity of a task if the first expired task had to wait more + * than a 'reasonable' amount of time. This deadline timeout is + * load-dependent, as the frequency of array switched decreases with + * increasing number of running tasks. We also ignore the interactivity + * if a better static_prio task has expired: + */ +#define EXPIRED_STARVING(rq) \ + ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ + (jiffies - (rq)->expired_timestamp >= \ + STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ + ((rq)->curr->static_prio > (rq)->best_expired_prio)) + +/* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. 
* @@ -2743,11 +2235,10 @@ void scheduler_tick(int user_ticks, int { int cpu = smp_processor_id(); struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + runqueue_t *rq = this_rq(); task_t *p = current; - unsigned long decayed_avg_nr_running; - unsigned long long now; - now = p->rq->timestamp_last_tick = sched_clock(); + rq->timestamp_last_tick = sched_clock(); if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_ticks); @@ -2761,24 +2252,14 @@ void scheduler_tick(int user_ticks, int sys_ticks = 0; } - /* this has to be done regardless of task type but hold lock for the - * minimum possible time - */ - decayed_avg_nr_running = SCHED_AVG_MUL(p->rq->avg_nr_running, SCHED_AVG_ALPHA); - spin_lock(&p->rq->lock); - p->rq->avg_nr_running = decayed_avg_nr_running + p->rq->nr_running; - if ((sched_mode == SCHED_MODE_ENTITLEMENT_BASED) && (!--p->rq->eb_ticks_to_decay)) - decay_eb_yardstick(p->rq); - spin_unlock(&p->rq->lock); - - if (is_idle_task(p)) { - if (atomic_read(&p->rq->nr_iowait) > 0) + if (p == rq->idle) { + if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait += sys_ticks; else cpustat->idle += sys_ticks; - if (wake_priority_sleeper(p->rq)) + if (wake_priority_sleeper(rq)) goto out; - rebalance_tick(cpu, p->rq, SCHED_IDLE); + rebalance_tick(cpu, rq, SCHED_IDLE); return; } if (TASK_NICE(p) > 0) @@ -2787,38 +2268,82 @@ void scheduler_tick(int user_ticks, int cpustat->user += user_ticks; cpustat->system += sys_ticks; + /* Task might have expired already, but not scheduled off yet */ + if (p->array != rq->active) { + set_tsk_need_resched(p); + goto out; + } + spin_lock(&rq->lock); /* - * SCHED_FIFO tasks never run out of timeslice. + * The task was running during this tick - update the + * time slice counter. Note: we do not update a thread's + * priority until it either goes to sleep or uses up its + * timeslice. This makes it possible for interactive tasks + * to use up their timeslices at their highest priority levels. 
*/ - if (unlikely(p->policy == SCHED_FIFO)) - goto out; - - spin_lock(&p->rq->lock); + if (rt_task(p)) { + /* + * RR tasks need a special form of timeslice management. + * FIFO tasks have no timeslices. + */ + if ((p->policy == SCHED_RR) && !--p->time_slice) { + p->time_slice = task_timeslice(p); + p->first_time_slice = 0; + set_tsk_need_resched(p); + + /* put it at the end of the queue: */ + dequeue_task(p, rq->active); + enqueue_task(p, rq->active); + } + goto out_unlock; + } if (!--p->time_slice) { - dequeue_task(p); + dequeue_task(p, rq->active); set_tsk_need_resched(p); - if (likely(p->policy != SCHED_RR)) { - delta_cpu_stats(p, now); - decay_avgs_and_calculate_rates(p); - recalc_throughput_bonus(p); - reassess_cpu_boundness(p); - /* - * Arguably the interactive bonus should be updated here - * as well. But depends on whether we wish to encourage - * interactive tasks to maintain a high bonus or CPU bound - * tasks to lose some of there bonus? - */ - calculate_pre_bonus_priority(p); + p->prio = effective_prio(p); + p->time_slice = task_timeslice(p); + p->first_time_slice = 0; + + if (!rq->expired_timestamp) + rq->expired_timestamp = jiffies; + if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { + enqueue_task(p, rq->expired); + if (p->static_prio < rq->best_expired_prio) + rq->best_expired_prio = p->static_prio; + } else + enqueue_task(p, rq->active); + } else { + /* + * Prevent a too long timeslice allowing a task to monopolize + * the CPU. We do this by splitting up the timeslice into + * smaller pieces. + * + * Note: this does not mean the task's timeslices expire or + * get lost in any way, they just might be preempted by + * another task of equal priority. (one with higher + * priority would have preempted this task already.) We + * requeue this task to the end of the list on this priority + * level, which is in essence a round-robin of tasks with + * equal priority. 
+ * + * This only applies to tasks in the interactive + * delta range with at least TIMESLICE_GRANULARITY to requeue. + */ + if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - + p->time_slice) % TIMESLICE_GRANULARITY(p)) && + (p->time_slice >= TIMESLICE_GRANULARITY(p)) && + (p->array == rq->active)) { + + dequeue_task(p, rq->active); + set_tsk_need_resched(p); p->prio = effective_prio(p); + enqueue_task(p, rq->active); } - p->time_slice = task_timeslice(p); - enqueue_task(p); } - spin_unlock(&p->rq->lock); +out_unlock: + spin_unlock(&rq->lock); out: - rebalance_tick(cpu, p->rq, NOT_IDLE); - if (unlikely(promotions_due(p->rq))) - do_promotions(p->rq); + rebalance_tick(cpu, rq, NOT_IDLE); } #ifdef CONFIG_SCHED_SMT @@ -2871,7 +2396,8 @@ static inline int dependent_sleeper(int { struct sched_domain *sd = this_rq->sd; cpumask_t sibling_map; - int ret = 0, i, idx; + prio_array_t *array; + int ret = 0, i; task_t *p; if (!(sd->flags & SD_SHARE_CPUPOWER)) @@ -2893,11 +2419,13 @@ static inline int dependent_sleeper(int */ if (!this_rq->nr_running) goto out_unlock; + array = this_rq->active; + if (!array->nr_active) + array = this_rq->expired; + BUG_ON(!array->nr_active); - idx = sched_find_first_bit(this_rq->bitmap); - p = list_entry(this_rq->queues[idx].queue.next, task_t, run_list); - /* update prio in case p has been promoted since it was queued */ - p->prio = idx; + p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, + task_t, run_list); for_each_cpu_mask(i, sibling_map) { runqueue_t *smt_rq = cpu_rq(i); @@ -2932,16 +2460,6 @@ out_unlock: spin_unlock(&cpu_rq(i)->lock); return ret; } - -static inline int recheck_needs_idle_balance(const runqueue_t *rq) -{ - return rq->nr_running == 0; -} - -static inline int dependent_idle(const runqueue_t *rq) -{ - return rq->nr_running == 0; -} #else static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) { @@ -2951,16 +2469,6 @@ static inline int dependent_sleeper(int { return 0; } - -static 
inline int recheck_needs_idle_balance(const runqueue_t *rq) -{ - return 0; -} - -static inline int dependent_idle(const runqueue_t *rq) -{ - return 0; -} #endif /* @@ -2971,7 +2479,10 @@ asmlinkage void __sched schedule(void) long *switch_count; task_t *prev, *next; runqueue_t *rq; + prio_array_t *array; + struct list_head *queue; unsigned long long now; + unsigned long run_time; int cpu, idx; /* @@ -2991,7 +2502,7 @@ asmlinkage void __sched schedule(void) need_resched: preempt_disable(); prev = current; - rq = prev->rq; + rq = this_rq(); /* * The idle thread is not allowed to schedule! @@ -3005,6 +2516,18 @@ need_resched: release_kernel_lock(prev); schedstat_inc(rq, sched_cnt); now = sched_clock(); + if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG)) + run_time = now - prev->timestamp; + else + run_time = NS_MAX_SLEEP_AVG; + + /* + * Tasks with interactive credits get charged less run_time + * at high sleep_avg to delay them losing their interactive + * status + */ + if (HIGH_CREDIT(prev)) + run_time /= (CURRENT_BONUS(prev) ? 
: 1); spin_lock_irq(&rq->lock); @@ -3019,25 +2542,26 @@ need_resched: unlikely(signal_pending(prev)))) prev->state = TASK_RUNNING; else - deactivate_task(prev); + deactivate_task(prev, rq); } - if (unlikely(task_needs_sinbinning(prev))) - put_task_in_sinbin(prev); - cpu = smp_processor_id(); - if (unlikely(needs_idle_balance(rq))) { + if (unlikely(!rq->nr_running)) { go_idle: idle_balance(cpu, rq); - /* This code should get optimised away when CONFIG_SCHED_SMT - * is not defined - */ - if (dependent_idle(rq)) + if (!rq->nr_running) { + next = rq->idle; + rq->expired_timestamp = 0; wake_sleeping_dependent(cpu, rq); + /* + * wake_sleeping_dependent() might have released + * the runqueue, so break out if we got new + * tasks meanwhile: + */ + if (!rq->nr_running) + goto switch_tasks; + } } else { - /* This code should all get optimised away when CONFIG_SCHED_SMT - * is not defined - */ if (dependent_sleeper(cpu, rq)) { schedstat_inc(rq, sched_goidle); next = rq->idle; @@ -3048,29 +2572,55 @@ go_idle: * lock, hence go into the idle loop if the rq went * empty meanwhile: */ - if (unlikely(recheck_needs_idle_balance(rq))) + if (unlikely(!rq->nr_running)) goto go_idle; } - schedstat_inc(rq, sched_noswitch); - idx = sched_find_first_bit(rq->bitmap); - next = list_entry(rq->queues[idx].queue.next, task_t, run_list); - /* - * update prio just in case next has been promoted since it was queued - */ - next->prio = idx; + array = rq->active; + if (unlikely(!array->nr_active)) { + /* + * Switch the active and expired arrays. 
+ */ + schedstat_inc(rq, sched_switch); + rq->active = rq->expired; + rq->expired = array; + array = rq->active; + rq->expired_timestamp = 0; + rq->best_expired_prio = MAX_PRIO; + } else + schedstat_inc(rq, sched_noswitch); + idx = sched_find_first_bit(array->bitmap); + queue = array->queue + idx; + next = list_entry(queue->next, task_t, run_list); + + if (!rt_task(next) && next->activated > 0) { + unsigned long long delta = now - next->timestamp; + + if (next->activated == 1) + delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; + + array = next->array; + dequeue_task(next, array); + recalc_task_prio(next, next->timestamp + delta); + enqueue_task(next, array); + } + next->activated = 0; switch_tasks: prefetch(next); clear_tsk_need_resched(prev); rcu_qsctr_inc(task_cpu(prev)); - delta_cpu_stats(prev, now); + prev->sleep_avg -= run_time; + if ((long)prev->sleep_avg <= 0) { + prev->sleep_avg = 0; + if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev))) + prev->interactive_credit--; + } prev->timestamp = now; sched_info_switch(prev, next); if (likely(prev != next)) { - delta_delay_stats(next, now); next->timestamp = now; rq->nr_switches++; rq->curr = next; @@ -3333,7 +2883,9 @@ EXPORT_SYMBOL(sleep_on_timeout); void set_user_nice(task_t *p, long nice) { unsigned long flags; - spinlock_t *rql; + prio_array_t *array; + runqueue_t *rq; + int old_prio, new_prio, delta; if (TASK_NICE(p) == nice || nice < -20 || nice > 19) return; @@ -3341,32 +2893,38 @@ void set_user_nice(task_t *p, long nice) * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. 
*/ - rql = task_rq_lock(p, &flags); - - p->static_prio = NICE_TO_PRIO(nice); - p->eb_shares = nice_to_shares(nice); + rq = task_rq_lock(p, &flags); /* * The RT priorities are set via setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected * it wont have any effect on scheduling until the task is * not SCHED_NORMAL: */ - if (!rt_task(p) && task_queued(p)) { - int delta = -p->prio; + if (rt_task(p)) { + p->static_prio = NICE_TO_PRIO(nice); + goto out_unlock; + } + array = p->array; + if (array) + dequeue_task(p, array); + + old_prio = p->prio; + new_prio = NICE_TO_PRIO(nice); + delta = new_prio - old_prio; + p->static_prio = NICE_TO_PRIO(nice); + p->prio += delta; - dequeue_task(p); - calculate_pre_bonus_priority(p); - delta += p->prio = effective_prio(p); - enqueue_task(p); + if (array) { + enqueue_task(p, array); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: */ - if (delta < 0 || (delta > 0 && task_is_running(p))) - resched_task(p->rq->curr); + if (delta < 0 || (delta > 0 && task_running(rq, p))) + resched_task(rq->curr); } - - task_rq_unlock(rql, &flags); +out_unlock: + task_rq_unlock(rq, &flags); } EXPORT_SYMBOL(set_user_nice); @@ -3422,158 +2980,6 @@ asmlinkage long sys_nice(int increment) #endif -/* - * Require: 0 <= new_cap <= PROPORTION_ONE - */ -int set_cpu_rate_cap(struct task_struct *p, unsigned long long new_cap) -{ - int is_allowed; - unsigned long flags; - spinlock_t *rql; - long long delta; - - if (new_cap > PROPORTION_ONE) - return -EINVAL; - is_allowed = capable(CAP_SYS_NICE); - /* - * We have to be careful, if called from /proc code, - * the task might be in the middle of scheduling on another CPU. 
- */ - rql = task_rq_lock(p, &flags); - delta = new_cap - p->cpu_rate_cap; - if (!is_allowed) { - /* - * Ordinary users can set/change caps on their own tasks provided - * that the new setting is MORE constraining - */ - if (((current->euid != p->uid) && (current->uid != p->uid)) || (delta > 0)) { - task_rq_unlock(rql, &flags); - return -EPERM; - } - } - /* - * The RT tasks don't have caps, but we still allow the caps to be - * set - but as expected it wont have any effect on scheduling until the - * task becomes SCHED_NORMAL: - */ - p->cpu_rate_cap = new_cap; - if (!rt_task(p) && task_queued(p)) { - int delta = -p->prio; - - dequeue_task(p); - calculate_pre_bonus_priority(p); - delta += p->prio = effective_prio(p); - enqueue_task(p); - /* - * If the task increased its priority or is running and - * lowered its priority, then reschedule its CPU: - */ - if (delta < 0 || (delta > 0 && task_is_running(p))) - resched_task(p->rq->curr); - } - task_rq_unlock(rql, &flags); - return 0; -} - -EXPORT_SYMBOL(set_cpu_rate_cap); - -/* - * Require: 1 <= new_cap <= PROPORTION_ONE - */ -int set_cpu_rate_hard_cap(struct task_struct *p, unsigned long long new_cap) -{ - int is_allowed; - unsigned long flags; - spinlock_t *rql; - long long delta; - - if ((new_cap > PROPORTION_ONE) || (new_cap == 0)) /* zero hard caps are not allowed */ - return -EINVAL; - is_allowed = capable(CAP_SYS_NICE); - /* - * We have to be careful, if called from /proc code, - * the task might be in the middle of scheduling on another CPU. 
- */ - rql = task_rq_lock(p, &flags); - delta = new_cap - p->cpu_rate_hard_cap; - if (!is_allowed) { - /* - * Ordinary users can set/change caps on their own tasks provided - * that the new setting is MORE constraining - */ - if (((current->euid != p->uid) && (current->uid != p->uid)) || (delta > 0)) { - task_rq_unlock(rql, &flags); - return -EPERM; - } - } - /* - * The RT tasks don't have caps, but we still allow the caps to be - * set - but as expected it wont have any effect on scheduling until the - * task becomes SCHED_NORMAL: - */ - p->cpu_rate_hard_cap = new_cap; - /* (POSSIBLY) TODO: if it's sinbinned and the cap is relaxed then release - * it from the sinbin - */ - task_rq_unlock(rql, &flags); - return 0; -} - -EXPORT_SYMBOL(set_cpu_rate_hard_cap); - -int set_cpu_shares(task_t *p, unsigned int new_shares) -{ - int is_allowed; - int result = 0; - unsigned long flags; - spinlock_t *rql; - - if (p->eb_shares == new_shares) - return 0; - - if ((new_shares < 1) || (new_shares > MAX_EB_SHARES)) - return -EINVAL; - - is_allowed = capable(CAP_SYS_NICE); - /* - * We have to be careful, if called from sys_setpriority(), - * the task might be in the middle of scheduling on another CPU. 
- */ - rql = task_rq_lock(p, &flags); - if (!is_allowed && (new_shares > p->eb_shares)) { - result = -EPERM; - goto out_unlock; - } - p->static_prio = NICE_TO_PRIO(shares_to_nice(new_shares)); - p->eb_shares = new_shares; - /* - * The RT priorities are set via setscheduler(), but we still - * allow eb_shares value to be set - but as expected - * it wont have any effect on scheduling until the task is - * not SCHED_NORMAL: - */ - if (!rt_task(p) && task_queued(p)) { - int delta = -p->prio; - - dequeue_task(p); - calculate_pre_bonus_priority(p); - delta += p->prio = effective_prio(p); - enqueue_task(p); - /* - * If the task decreased its prio or is running and - * increased its prio, then reschedule its CPU: - */ - if (delta < 0 || (delta > 0 && task_is_running(p))) - resched_task(p->rq->curr); - } -out_unlock: - task_rq_unlock(rql, &flags); - - return result; -} - -EXPORT_SYMBOL(set_cpu_shares); - /** * task_prio - return the priority value of a given task. * @p: the task in question. @@ -3621,7 +3027,7 @@ static inline task_t *find_process_by_pi /* Actually do priority change: must hold rq lock. */ static void __setscheduler(struct task_struct *p, int policy, int prio) { - BUG_ON(task_queued(p)); + BUG_ON(p->array); p->policy = policy; p->rt_priority = prio; if (policy != SCHED_NORMAL) @@ -3638,9 +3044,9 @@ static int setscheduler(pid_t pid, int p struct sched_param lp; int retval = -EINVAL; int oldprio; - int queued; + prio_array_t *array; unsigned long flags; - spinlock_t *rql; + runqueue_t *rq; task_t *p; if (!param || pid < 0) @@ -3665,7 +3071,7 @@ static int setscheduler(pid_t pid, int p * To be able to change p->policy safely, the apropriate * runqueue lock must be held. 
*/ - rql = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &flags); if (policy < 0) policy = p->policy; @@ -3689,42 +3095,38 @@ static int setscheduler(pid_t pid, int p retval = -EPERM; if ((policy == SCHED_FIFO || policy == SCHED_RR) && - !capable(CAP_SYS_NICE)) { - if (current->euid == p->uid) - p->flags |= PF_UNPRIV_RT; + !capable(CAP_SYS_NICE)) goto out_unlock; - } if ((current->euid != p->euid) && (current->euid != p->uid) && !capable(CAP_SYS_NICE)) goto out_unlock; - if (policy == SCHED_NORMAL) - p->flags &= ~PF_UNPRIV_RT; retval = security_task_setscheduler(p, policy, &lp); if (retval) goto out_unlock; - if ((queued = task_queued(p))) - deactivate_task(p); + array = p->array; + if (array) + deactivate_task(p, task_rq(p)); retval = 0; oldprio = p->prio; __setscheduler(p, policy, lp.sched_priority); - if (queued) { - __activate_task(p); + if (array) { + __activate_task(p, task_rq(p)); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (task_is_running(p)) { + if (task_running(rq, p)) { if (p->prio > oldprio) - resched_task(p); - } else - preempt_curr_if_warranted(p); + resched_task(rq->curr); + } else if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); } out_unlock: - task_rq_unlock(rql, &flags); + task_rq_unlock(rq, &flags); out_unlock_tasklist: read_unlock_irq(&tasklist_lock); @@ -3954,130 +3356,45 @@ asmlinkage long sys_sched_getaffinity(pi return sizeof(cpumask_t); } -void get_task_sched_stats(const struct task_struct *tsk, struct task_sched_stats *stats) -{ - int on_runq = 0; - int on_cpu = 0; - int sinbinned = 0; - unsigned long long timestamp; - unsigned long flags; - spinlock_t *rql = task_rq_lock(tsk, &flags); - - stats->timestamp = tsk->rq->timestamp_last_tick; - stats->cycle_count = tsk->cycle_count; - stats->total_sleep = tsk->total_sleep; - stats->total_cpu = tsk->total_cpu; - stats->total_delay = 
tsk->total_delay; - stats->total_sinbin = tsk->total_sinbin; - stats->intr_wake_ups = tsk->intr_wake_ups; - timestamp = tsk->sched_timestamp; - if ((on_runq = task_queued(tsk))) - on_cpu = task_is_running(tsk); - else - sinbinned = task_is_sinbinned(tsk); - - task_rq_unlock(rql, &flags); - - /* - * Update values to the previous tick (only) - */ - if (stats->timestamp > timestamp) { - unsigned long long delta = stats->timestamp - timestamp; - - if (on_cpu) { - stats->total_cpu += delta; - } else if (on_runq || sinbinned) { - stats->total_delay += delta; - if (sinbinned) - stats->total_sinbin += delta; - } else { - stats->total_sleep += delta; - } - } -} - -EXPORT_SYMBOL(get_task_sched_stats); - -/* - * Get scheduling statistics for the nominated CPU - */ -void get_cpu_sched_stats(unsigned int cpu, struct cpu_sched_stats *stats) -{ - int idle; - unsigned long long idle_timestamp; - runqueue_t *rq = cpu_rq(cpu); - - /* - * No need to crash the whole machine if they've asked for stats for - * a non existent CPU, just send back zero. - */ - if (rq == NULL) { - stats->timestamp = 0; - stats->total_idle = 0; - stats->total_busy = 0; - stats->total_delay = 0; - stats->total_sinbin = 0; - stats->nr_switches = 0; - - return; - } - local_irq_disable(); - spin_lock(&rq->lock); - idle = rq->curr == rq->idle; - stats->timestamp = rq->timestamp_last_tick; - idle_timestamp = rq->idle->sched_timestamp; - stats->total_idle = rq->idle->total_cpu; - stats->total_busy = rq->idle->total_delay; - stats->total_delay = rq->total_delay; - stats->total_sinbin = rq->total_sinbin; - stats->nr_switches = rq->nr_switches; - spin_unlock_irq(&rq->lock); - - /* - * Update idle/busy time to the current tick - */ - if (idle) - stats->total_idle += (stats->timestamp - idle_timestamp); - else - stats->total_busy += (stats->timestamp - idle_timestamp); -} - -EXPORT_SYMBOL(get_cpu_sched_stats); - /** * sys_sched_yield - yield the current processor to other threads. 
* + * this function yields the current CPU by moving the calling thread + * to the expired array. If there are no other threads running on this * CPU then this function will return. */ asmlinkage long sys_sched_yield(void) { - spinlock_t *rql = this_rq_lock(); + runqueue_t *rq = this_rq_lock(); + prio_array_t *array = current->array; + prio_array_t *target = rq->expired; + + schedstat_inc(rq, yld_cnt); + /* + * We implement yielding by moving the task into the expired + * queue. + * + * (special rule: RT tasks will just roundrobin in the active + * array.) + */ + if (rt_task(current)) + target = rq->active; + + if (current->array->nr_active == 1) { + schedstat_inc(rq, yld_act_empty); + if (!rq->expired->nr_active) + schedstat_inc(rq, yld_both_empty); + } else if (!rq->expired->nr_active) + schedstat_inc(rq, yld_exp_empty); - schedstat_inc(current->rq, yld_cnt); - /* If there's other tasks on this CPU make sure that at least - * one of them get some CPU before this task's next bite of the - * cherry. Dequeue before looking for the appropriate run - * queue so that we don't find our queue if we were the sole - * occupant of that queue. - */ - dequeue_task(current); - /* - * special rule: RT tasks will just roundrobin. 
- */ - if (likely(!rt_task(current))) { - int idx = find_next_bit(current->rq->bitmap, IDLE_PRIO, current->prio); - if (idx < IDLE_PRIO) - current->prio = idx; - } - enqueue_task(current); - if (current->rq->nr_running == 1) - schedstat_inc(current->rq, yld_both_empty); + dequeue_task(current, array); + enqueue_task(current, target); /* * Since we are going to call schedule() anyway, there's * no need to preempt or enable interrupts: */ - _raw_spin_unlock(rql); + _raw_spin_unlock(&rq->lock); preempt_enable_no_resched(); schedule(); @@ -4381,25 +3698,15 @@ void __devinit init_idle(task_t *idle, i runqueue_t *rq = cpu_rq(cpu); unsigned long flags; - idle->prio = IDLE_PRIO; - /* - * Initialize scheduling statistics counters as they may provide - * valuable about the CPU e.g. avg_cpu_time_per_cycle for the idle - * task will be an estimate of the average time the CPU is idle - */ - initialize_stats(idle); - initialize_bonuses(idle); + idle->sleep_avg = 0; + idle->interactive_credit = 0; + idle->array = NULL; + idle->prio = MAX_PRIO; idle->state = TASK_RUNNING; set_task_cpu(idle, cpu); spin_lock_irqsave(&rq->lock, flags); rq->curr = rq->idle = idle; - idle->sched_timestamp = adjusted_sched_clock(idle); - /* - * Putting the idle process onto a run queue simplifies the selection of - * the next task to run in schedule(). - */ - enqueue_task(idle); set_tsk_need_resched(idle); spin_unlock_irqrestore(&rq->lock, flags); @@ -4451,11 +3758,11 @@ int set_cpus_allowed(task_t *p, cpumask_ unsigned long flags; int ret = 0; migration_req_t req; - spinlock_t *rql; + runqueue_t *rq; perfctr_set_cpus_allowed(p, new_mask); - rql = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &flags); if (!cpus_intersects(new_mask, cpu_online_map)) { ret = -EINVAL; goto out; @@ -4468,14 +3775,14 @@ int set_cpus_allowed(task_t *p, cpumask_ if (migrate_task(p, any_online_cpu(new_mask), &req)) { /* Need help from migration thread: drop lock and wait. 
*/ - task_rq_unlock(rql, &flags); - wake_up_process(p->rq->migration_thread); + task_rq_unlock(rq, &flags); + wake_up_process(rq->migration_thread); wait_for_completion(&req.done); tlb_migrate_finish(p->mm); return 0; } out: - task_rq_unlock(rql, &flags); + task_rq_unlock(rq, &flags); return ret; } @@ -4508,25 +3815,21 @@ static void __migrate_task(struct task_s if (!cpu_isset(dest_cpu, p->cpus_allowed)) goto out; - if (task_queued(p)) { - /* - * Don't do set_task_cpu() until AFTER we dequeue the task, - * since dequeue_task() relies on p->rq always being accurate. - */ - deactivate_task(p); - delta_delay_stats(p, adjusted_sched_clock(p)); - set_task_cpu(p, dest_cpu); + set_task_cpu(p, dest_cpu); + if (p->array) { /* - * activate_task() will set the timestamp correctly so there's - * no need to adjust it here + * Sync timestamp with rq_dest's before activating. + * The same thing could be achieved by doing this step + * afterwards, and pretending it was a local activate. + * This way is cleaner and logically correct. */ - activate_task(p); - preempt_curr_if_warranted(p); - } else { - delta_sleep_stats(p, adjusted_sched_clock(p)); - set_task_cpu(p, dest_cpu); + p->timestamp = p->timestamp - rq_src->timestamp_last_tick + + rq_dest->timestamp_last_tick; + deactivate_task(p, rq_src); + activate_task(p, rq_dest, 0); + if (TASK_PREEMPTS_CURR(p, rq_dest)) + resched_task(rq_dest->curr); } - adjust_sched_timestamp(p, rq_src); out: double_rq_unlock(rq_src, rq_dest); @@ -4676,10 +3979,9 @@ void sched_idle_next(void) */ spin_lock_irqsave(&rq->lock, flags); - dequeue_task(p); __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); /* Add idle task to _front_ of it's priority queue */ - __activate_task_head(p); + __activate_idle_task(p, rq); spin_unlock_irqrestore(&rq->lock, flags); } @@ -4711,13 +4013,17 @@ static void migrate_dead(unsigned int de /* release_task() removes task from tasklist, so we won't find dead tasks. 
*/ static void migrate_dead_tasks(unsigned int dead_cpu) { - unsigned i; + unsigned arr, i; struct runqueue *rq = cpu_rq(dead_cpu); - for (i = 0; i < IDLE_PRIO; i++) { - struct list_head *list = &rq->queues[i].queue; - while (!list_empty(list)) - migrate_dead(dead_cpu, list_entry(list->next, task_t, run_list)); + for (arr = 0; arr < 2; arr++) { + for (i = 0; i < MAX_PRIO; i++) { + struct list_head *list = &rq->arrays[arr].queue[i]; + while (!list_empty(list)) + migrate_dead(dead_cpu, + list_entry(list->next, task_t, + run_list)); + } } } #endif /* CONFIG_HOTPLUG_CPU */ @@ -4731,10 +4037,7 @@ static int migration_call(struct notifie { int cpu = (long)hcpu; struct task_struct *p; -#ifdef CONFIG_HOTPLUG_CPU struct runqueue *rq; -#endif - spinlock_t *rql; unsigned long flags; switch (action) { @@ -4745,9 +4048,9 @@ static int migration_call(struct notifie p->flags |= PF_NOFREEZE; kthread_bind(p, cpu); /* Must be high prio: stop_machine expects to yield to it. */ - rql = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &flags); __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); - task_rq_unlock(rql, &flags); + task_rq_unlock(rq, &flags); cpu_rq(cpu)->migration_thread = p; break; case CPU_ONLINE: @@ -4766,14 +4069,13 @@ static int migration_call(struct notifie rq = cpu_rq(cpu); kthread_stop(rq->migration_thread); rq->migration_thread = NULL; - /* Idle task back to normal in IDLE_PRIO slot */ - rql = task_rq_lock(rq->idle, &flags); - deactivate_task(rq->idle); - rq->idle->static_prio = IDLE_PRIO; + /* Idle task back to normal (off runqueue, low prio) */ + rq = task_rq_lock(rq->idle, &flags); + deactivate_task(rq->idle, rq); + rq->idle->static_prio = MAX_PRIO; __setscheduler(rq->idle, SCHED_NORMAL, 0); - enqueue_task(rq->idle); migrate_dead_tasks(cpu); - task_rq_unlock(rql, &flags); + task_rq_unlock(rq, &flags); BUG_ON(rq->nr_running != 0); /* No need to migrate the tasks: it was best-effort if @@ -5254,11 +4556,16 @@ int in_sched_functions(unsigned long add void __init 
sched_init(void) { runqueue_t *rq; - int i, k; + int i, j, k; for (i = 0; i < NR_CPUS; i++) { + prio_array_t *array; + rq = cpu_rq(i); spin_lock_init(&rq->lock); + rq->active = rq->arrays; + rq->expired = rq->arrays + 1; + rq->best_expired_prio = MAX_PRIO; #ifdef CONFIG_SMP rq->sd = &sched_domain_dummy; @@ -5270,24 +4577,16 @@ void __init sched_init(void) #endif atomic_set(&rq->nr_iowait, 0); - for (k = 0; k <= IDLE_PRIO; k++) { - rq->queues[k].prio = k; - INIT_LIST_HEAD(&rq->queues[k].queue); + for (j = 0; j < 2; j++) { + array = rq->arrays + j; + for (k = 0; k < MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); } - bitmap_zero(rq->bitmap, NUM_PRIO_SLOTS); - /* delimiter for bitsearch */ - __set_bit(IDLE_PRIO, rq->bitmap); - rq->timestamp_last_tick = 0; - rq->next_prom_due = ULONG_MAX; - rq->pcount = 0; - rq->total_delay = 0; - rq->eb_yardstick = 0; - rq->eb_ticks_to_decay = time_slice_ticks; - rq->avg_nr_running = 0; - rq->total_sinbin = 0; } - current->rq = this_rq(); - current->sched_timestamp = sched_clock(); /* * The boot idle thread does lazy MMU switching as well: @@ -5325,256 +4624,3 @@ void __might_sleep(char *file, int line) } EXPORT_SYMBOL(__might_sleep); #endif - -#if defined(CONFIG_SYSCTL) -/* - * CPU scheduler control via /proc/sys/cpusched/xxx - */ -enum -{ - CPU_SCHED_END_OF_LIST=0, - CPU_SCHED_TIME_SLICE=1, - CPU_SCHED_SCHED_RR_TIME_SLICE, - CPU_SCHED_BASE_PROMOTION_INTERVAL, - CPU_SCHED_MAX_IA_BONUS, - CPU_SCHED_MAX_TPT_BONUS, - CPU_SCHED_IA_THRESHOLD, - CPU_SCHED_CPU_HOG_THRESHOLD, - CPU_SCHED_LOG_AT_EXIT, - CPU_SCHED_MODE, - CPU_SCHED_INITIAL_IA_BONUS, - CPU_SCHED_UNPRIV_RT_THRESHOLD, - CPU_SCHED_BGND_TIME_SLICE_MULTIPLIER -}; - -static const unsigned int zero = 0; -static const unsigned int one = 1; -#define min_milli_value zero -static const unsigned int max_milli_value = 1000; -#define min_max_ia_bonus zero -static const unsigned int 
max_max_ia_bonus = MAX_MAX_IA_BONUS; -#define min_max_tpt_bonus zero -static const unsigned int max_max_tpt_bonus = MAX_MAX_TPT_BONUS; -static unsigned int time_slice_msecs = DEFAULT_TIME_SLICE_MSECS; -static unsigned int sched_rr_time_slice_msecs = DEFAULT_TIME_SLICE_MSECS; -#define min_time_slice_msecs one -static const unsigned int max_time_slice_msecs = MAX_TIME_SLICE_MSECS; -static unsigned int base_prom_interval_msecs = BASE_PROM_INTERVAL_MSECS; -#define min_base_prom_interval_msecs one -static const unsigned int max_base_prom_interval_msecs = INT_MAX; -#define min_sched_bgnd_time_slice_multiplier one -static const unsigned int max_sched_bgnd_time_slice_multiplier = 100; - -static int proc_time_slice_msecs(ctl_table *ctp, int write, struct file *fp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int res = proc_dointvec_minmax(ctp, write, fp, buffer, lenp, ppos); - - if ((res == 0) && write) - time_slice_ticks = MSECS_TO_JIFFIES_MIN_1(time_slice_msecs); - - return res; -} - -static int proc_sched_rr_time_slice_msecs(ctl_table *ctp, int write, struct file *fp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int res = proc_dointvec_minmax(ctp, write, fp, buffer, lenp, ppos); - - if ((res == 0) && write) - sched_rr_time_slice_ticks = MSECS_TO_JIFFIES_MIN_1(sched_rr_time_slice_msecs); - - return res; -} - -static int proc_base_prom_interval_msecs(ctl_table *ctp, int write, struct file *fp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int res = proc_dointvec_minmax(ctp, write, fp, buffer, lenp, ppos); - - if ((res == 0) && write) - base_prom_interval_ticks = MSECS_TO_JIFFIES_MIN_1(base_prom_interval_msecs); - - return res; -} - -static int proc_cpu_hog_threshold(ctl_table *ctp, int write, struct file *fp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int res = proc_dointvec_minmax(ctp, write, fp, buffer, lenp, ppos); - - if ((res == 0) && write) - cpu_hog_threshold = calc_proportion(cpu_hog_threshold_ppt, 1000); - - return res; 
-} - -static int proc_ia_threshold(ctl_table *ctp, int write, struct file *fp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int res = proc_dointvec_minmax(ctp, write, fp, buffer, lenp, ppos); - - if ((res == 0) && write) - ia_threshold = calc_proportion(ia_threshold_ppt, 1000); - - return res; -} - -static int proc_unpriv_rt_threshold(ctl_table *ctp, int write, struct file *fp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int res = proc_dointvec_minmax(ctp, write, fp, buffer, lenp, ppos); - - if ((res == 0) && write) - unpriv_rt_threshold = calc_proportion(unpriv_rt_threshold_ppt, 1000); - - return res; -} - -#define SCHED_MODE_BUFFER_LEN 16 -static char current_sched_mode[SCHED_MODE_BUFFER_LEN] = ""; -static int proc_sched_mode(ctl_table *ctp, int write, struct file *fp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int res; - - strcpy(current_sched_mode, sched_mode_names[sched_mode]); - res = proc_dostring(ctp, write, fp, buffer, lenp, ppos); - - if ((res == 0) && write) { - int i; - - for (i = 0; sched_mode_names[i] != NULL; i++) - if (strcmp(current_sched_mode, sched_mode_names[i]) == 0) - break; - if (sched_mode_names[i] == NULL) - res = -EINVAL; - else /* set the scheduling mode */ - sched_mode = i; - } - - return res; -} - -ctl_table cpu_sched_table[] = { - { - .ctl_name = CPU_SCHED_TIME_SLICE, - .procname = "time_slice", - .data = &time_slice_msecs, - .maxlen = sizeof (unsigned int), - .mode = 0644, - .proc_handler = &proc_time_slice_msecs, - .extra1 = (void *)&min_time_slice_msecs, - .extra2 = (void *)&max_time_slice_msecs - }, - { - .ctl_name = CPU_SCHED_SCHED_RR_TIME_SLICE, - .procname = "sched_rr_time_slice", - .data = &sched_rr_time_slice_msecs, - .maxlen = sizeof (unsigned int), - .mode = 0644, - .proc_handler = &proc_sched_rr_time_slice_msecs, - .extra1 = (void *)&min_time_slice_msecs, - .extra2 = (void *)&max_time_slice_msecs - }, - { - .ctl_name = CPU_SCHED_BASE_PROMOTION_INTERVAL, - .procname = 
"base_promotion_interval", - .data = &base_prom_interval_msecs, - .maxlen = sizeof (unsigned int), - .mode = 0644, - .proc_handler = &proc_base_prom_interval_msecs, - .extra1 = (void *)&min_base_prom_interval_msecs, - .extra2 = (void *)&max_base_prom_interval_msecs - }, - { - .ctl_name = CPU_SCHED_MAX_IA_BONUS, - .procname = "max_ia_bonus", - .data = &max_ia_bonus, - .maxlen = sizeof (unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .extra1 = (void *)&min_max_ia_bonus, - .extra2 = (void *)&max_max_ia_bonus - }, - { - .ctl_name = CPU_SCHED_INITIAL_IA_BONUS, - .procname = "initial_ia_bonus", - .data = &initial_ia_bonus, - .maxlen = sizeof (unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .extra1 = (void *)&min_max_ia_bonus, - .extra2 = (void *)&max_max_ia_bonus - }, - { - .ctl_name = CPU_SCHED_MAX_TPT_BONUS, - .procname = "max_tpt_bonus", - .data = &max_tpt_bonus, - .maxlen = sizeof (unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .extra1 = (void *)&min_max_tpt_bonus, - .extra2 = (void *)&max_max_tpt_bonus - }, - { - .ctl_name = CPU_SCHED_IA_THRESHOLD, - .procname = "ia_threshold", - .data = &ia_threshold_ppt, - .maxlen = sizeof (unsigned int), - .mode = 0644, - .proc_handler = &proc_ia_threshold, - .extra1 = (void *)&min_milli_value, - .extra2 = (void *)&max_milli_value - }, - { - .ctl_name = CPU_SCHED_CPU_HOG_THRESHOLD, - .procname = "cpu_hog_threshold", - .data = &cpu_hog_threshold_ppt, - .maxlen = sizeof (unsigned int), - .mode = 0644, - .proc_handler = &proc_cpu_hog_threshold, - .extra1 = (void *)&min_milli_value, - .extra2 = (void *)&max_milli_value - }, - { - .ctl_name = CPU_SCHED_UNPRIV_RT_THRESHOLD, - .procname = "unpriv_rt_threshold", - .data = &unpriv_rt_threshold_ppt, - .maxlen = sizeof (unsigned int), - .mode = 0644, - .proc_handler = &proc_unpriv_rt_threshold, - .extra1 = (void *)&min_milli_value, - .extra2 = (void *)&max_milli_value - }, - { - .ctl_name = CPU_SCHED_LOG_AT_EXIT, 
- .procname = "log_at_exit", - .data = &log_at_exit, - .maxlen = sizeof (unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .extra1 = (void *)&zero, - .extra2 = (void *)&one - }, - { - .ctl_name = CPU_SCHED_MODE, - .procname = "mode", - .data = &current_sched_mode, - .maxlen = SCHED_MODE_BUFFER_LEN, - .mode = 0644, - .proc_handler = &proc_sched_mode, - }, - { - .ctl_name = CPU_SCHED_BGND_TIME_SLICE_MULTIPLIER, - .procname = "bgnd_time_slice_multiplier", - .data = &bgnd_time_slice_multiplier, - .maxlen = sizeof (unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .extra1 = (void *)&min_sched_bgnd_time_slice_multiplier, - .extra2 = (void *)&max_sched_bgnd_time_slice_multiplier - }, - { .ctl_name = CPU_SCHED_END_OF_LIST } -}; -#endif Index: linux-2.6.9-rc2-mm2/kernel/sysctl.c =================================================================== --- linux-2.6.9-rc2-mm2.orig/kernel/sysctl.c 2004-09-23 09:59:23.611797968 +1000 +++ linux-2.6.9-rc2-mm2/kernel/sysctl.c 2004-09-23 10:00:41.397972672 +1000 @@ -149,10 +149,6 @@ extern ctl_table pty_table[]; #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT int sysctl_legacy_va_layout; #endif -/* - * CPU scheduler control variables (lives in sched.c) - */ -extern ctl_table cpu_sched_table[]; /* /proc declarations: */ @@ -628,12 +624,6 @@ static ctl_table kern_table[] = { .proc_handler = &proc_unknown_nmi_panic, }, #endif - { - .ctl_name = KERN_CPU_SCHED, - .procname = "cpusched", - .mode = 0555, - .child = cpu_sched_table, - }, { .ctl_name = 0 } };