Index: linux-2.6.9-rc2-mm2/fs/proc/array.c
===================================================================
--- linux-2.6.9-rc2-mm2.orig/fs/proc/array.c	2004-09-23 09:59:23.268850104 +1000
+++ linux-2.6.9-rc2-mm2/fs/proc/array.c	2004-09-23 10:00:41.383974800 +1000
@@ -162,6 +162,7 @@ static inline char * task_state(struct t
 	read_lock(&tasklist_lock);
 	buffer += sprintf(buffer,
 		"State:\t%s\n"
+		"SleepAVG:\t%lu%%\n"
 		"Tgid:\t%d\n"
 		"Pid:\t%d\n"
 		"PPid:\t%d\n"
@@ -169,6 +170,7 @@ static inline char * task_state(struct t
 		"Uid:\t%d\t%d\t%d\t%d\n"
 		"Gid:\t%d\t%d\t%d\t%d\n",
 		get_task_state(p),
+		(p->sleep_avg/1024)*100/(1020000000/1024),
 	       	p->tgid,
 		p->pid, p->pid ? p->real_parent->pid : 0,
 		p->pid && p->ptrace ? p->parent->pid : 0,
@@ -468,25 +470,3 @@ int proc_pid_statm(struct task_struct *t
 	return sprintf(buffer,"%d %d %d %d %d %d %d\n",
 		       size, resident, shared, text, lib, data, 0);
 }
-
-int task_cpu_sched_stats(struct task_struct *p, char *buffer)
-{
-	struct task_sched_stats stats;
-	unsigned long nvcsw, nivcsw; /* context switch counts */
-
-	read_lock(&tasklist_lock);
-	get_task_sched_stats(p, &stats);
-	nvcsw = p->nvcsw;
-	nivcsw = p-> nivcsw;
-	read_unlock(&tasklist_lock);
-	return sprintf(buffer,
-		"%llu %llu %llu %llu %llu %llu %lu %lu @ %llu\n",
-		stats.total_sleep,
-		stats.total_cpu,
-		stats.total_delay,
-		stats.total_sinbin,
-		stats.cycle_count,
-		stats.intr_wake_ups,
-		nvcsw, nivcsw,
-		stats.timestamp);
-}
Index: linux-2.6.9-rc2-mm2/fs/proc/base.c
===================================================================
--- linux-2.6.9-rc2-mm2.orig/fs/proc/base.c	2004-09-23 09:59:23.269849952 +1000
+++ linux-2.6.9-rc2-mm2/fs/proc/base.c	2004-09-23 10:01:00.311097440 +1000
@@ -96,10 +96,6 @@ enum pid_directory_inos {
 #ifdef CONFIG_CPUSETS
 	PROC_TID_CPUSET,
 #endif
-	PROC_TID_CPU_STATS,
-	PROC_TID_CPU_RATE_CAP,
-	PROC_TID_CPU_RATE_HARD_CAP,
-	PROC_TID_CPU_SHARES,
 #ifdef CONFIG_SECURITY
 	PROC_TID_ATTR,
 	PROC_TID_ATTR_CURRENT,
@@ -174,10 +170,6 @@ static struct pid_entry tid_base_stuff[]
 #ifdef CONFIG_CPUSETS
 	E(PROC_TID_CPUSET,     "cpuset",  S_IFREG|S_IRUGO),
 #endif
-	E(PROC_TID_CPU_STATS,  "cpustats",   S_IFREG|S_IRUGO),
-	E(PROC_TID_CPU_RATE_CAP,  "cpu_rate_cap",   S_IFREG|S_IRUGO|S_IWUSR),
-	E(PROC_TID_CPU_RATE_HARD_CAP,  "cpu_rate_hard_cap",   S_IFREG|S_IRUGO|S_IWUSR),
-	E(PROC_TID_CPU_SHARES,  "cpu_shares",   S_IFREG|S_IRUGO|S_IWUSR),
 	{0,0,NULL,0}
 };
 
@@ -214,7 +206,6 @@ int proc_tid_stat(struct task_struct*,ch
 int proc_tgid_stat(struct task_struct*,char*);
 int proc_pid_status(struct task_struct*,char*);
 int proc_pid_statm(struct task_struct*,char*);
-extern int task_cpu_sched_stats(struct task_struct *p, char *buffer);
 
 static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
 {
@@ -586,142 +577,6 @@ static struct file_operations proc_info_
 	.read		= proc_info_read,
 };
 
-static ssize_t cpu_rate_cap_read(struct file * file, char * buf,
-			size_t count, loff_t *ppos)
-{
-	struct task_struct *task = proc_task(file->f_dentry->d_inode);
-	char buffer[64];
-	size_t len;
-	unsigned long long hcppt = proportion_to_ppt(task->cpu_rate_cap);
-
-	if (*ppos)
-		return 0;
-	*ppos = len = sprintf(buffer, "%llu\n", hcppt);
-	if (copy_to_user(buf, buffer, len))
-		return -EFAULT;
-
-	return len;
-}
-
-static ssize_t cpu_rate_cap_write(struct file * file, const char * buf,
-			 size_t count, loff_t *ppos)
-{
-	struct task_struct *task = proc_task(file->f_dentry->d_inode);
-	char buffer[128] = "";
-	char *endptr = NULL;
-	unsigned long long hcppt;
-	int res;
-
-
-	if ((count > 63) || *ppos)
-		return -EFBIG;
-	if (copy_from_user(buffer, buf, count))
-		return -EFAULT;
-	hcppt = simple_strtoul(buffer, &endptr, 0);
-	if ((endptr == buffer) || (hcppt == ULONG_MAX))
-		return -EINVAL;
-
-	if ((res = set_cpu_rate_cap(task, ppt_to_proportion(hcppt))) != 0)
-		return res;
-
-	return count;
-}
-
-static struct file_operations proc_cpu_rate_cap_operations = {
-	read:		cpu_rate_cap_read,
-	write:		cpu_rate_cap_write,
-};
-
-static ssize_t cpu_rate_hard_cap_read(struct file * file, char * buf,
-			size_t count, loff_t *ppos)
-{
-	struct task_struct *task = proc_task(file->f_dentry->d_inode);
-	char buffer[64];
-	size_t len;
-	unsigned long long hcppt = proportion_to_ppt(task->cpu_rate_hard_cap);
-
-	if (*ppos)
-		return 0;
-	*ppos = len = sprintf(buffer, "%llu\n", hcppt);
-	if (copy_to_user(buf, buffer, len))
-		return -EFAULT;
-
-	return len;
-}
-
-static ssize_t cpu_rate_hard_cap_write(struct file * file, const char * buf,
-			 size_t count, loff_t *ppos)
-{
-	struct task_struct *task = proc_task(file->f_dentry->d_inode);
-	char buffer[128] = "";
-	char *endptr = NULL;
-	unsigned long long hcppt;
-	int res;
-
-
-	if ((count > 63) || *ppos)
-		return -EFBIG;
-	if (copy_from_user(buffer, buf, count))
-		return -EFAULT;
-	hcppt = simple_strtoul(buffer, &endptr, 0);
-	if ((endptr == buffer) || (hcppt == ULONG_MAX))
-		return -EINVAL;
-
-	if ((res = set_cpu_rate_hard_cap(task, ppt_to_proportion(hcppt))) != 0)
-		return res;
-
-	return count;
-}
-
-static struct file_operations proc_cpu_rate_hard_cap_operations = {
-	read:		cpu_rate_hard_cap_read,
-	write:		cpu_rate_hard_cap_write,
-};
-
-static ssize_t cpu_shares_read(struct file * file, char * buf,
-			size_t count, loff_t *ppos)
-{
-	struct task_struct *task = proc_task(file->f_dentry->d_inode);
-	char buffer[64];
-	size_t len;
-
-	if (*ppos)
-		return 0;
-	*ppos = len = sprintf(buffer, "%u\n", task->eb_shares);
-	if (copy_to_user(buf, buffer, len))
-		return -EFAULT;
-
-	return len;
-}
-
-static ssize_t cpu_shares_write(struct file * file, const char * buf,
-			 size_t count, loff_t *ppos)
-{
-	struct task_struct *task = proc_task(file->f_dentry->d_inode);
-	char buffer[64] = "";
-	char *endptr = NULL;
-	unsigned long shares;
-	int res;
-
-	if ((count > 63) || *ppos)
-		return -EFBIG;
-	if (copy_from_user(buffer, buf, count))
-		return -EFAULT;
-	shares = simple_strtoul(buffer, &endptr, 0);
-	if ((endptr == buffer) || (shares == ULONG_MAX))
-		return -EINVAL;
-
-	if ((res = set_cpu_shares(task, shares)) != 0)
-		return res;
-
-	return count;
-}
-
-static struct file_operations proc_cpu_shares_operations = {
-	read:		cpu_shares_read,
-	write:		cpu_shares_write,
-};
-
 static int mem_open(struct inode* inode, struct file* file)
 {
 	file->private_data = (void*)((long)current->self_exec_id);
@@ -1540,19 +1395,6 @@ static struct dentry *proc_pident_lookup
 			ei->op.proc_read = proc_pid_schedstat;
 			break;
 #endif
-		case PROC_TID_CPU_STATS:
-			inode->i_fop = &proc_info_file_operations;
-			ei->op.proc_read = task_cpu_sched_stats;
-			break;
-		case PROC_TID_CPU_RATE_CAP:
-			inode->i_fop = &proc_cpu_rate_cap_operations;
-			break;
-		case PROC_TID_CPU_RATE_HARD_CAP:
-			inode->i_fop = &proc_cpu_rate_hard_cap_operations;
-			break;
-		case PROC_TID_CPU_SHARES:
-			inode->i_fop = &proc_cpu_shares_operations;
-			break;
 #ifdef CONFIG_CPUSETS
 		case PROC_TID_CPUSET:
 		case PROC_TGID_CPUSET:
Index: linux-2.6.9-rc2-mm2/fs/proc/proc_misc.c
===================================================================
--- linux-2.6.9-rc2-mm2.orig/fs/proc/proc_misc.c	2004-09-23 09:59:23.270849800 +1000
+++ linux-2.6.9-rc2-mm2/fs/proc/proc_misc.c	2004-09-23 10:00:41.387974192 +1000
@@ -271,40 +271,6 @@ static struct file_operations proc_cpuin
 	.release	= seq_release,
 };
 
-static int cpustats_read_proc(char *page, char **start, off_t off,
-				 int count, int *eof, void *data)
-{
-	int i;
-	int len = 0;
-	struct cpu_sched_stats total = {0, };
-
-	for_each_online_cpu(i) {
-		struct cpu_sched_stats stats;
-
-		get_cpu_sched_stats(i, &stats);
-		len += sprintf(page + len, "cpu%02d %llu %llu %llu %llu %llu @ %llu\n", i,
-		stats.total_idle,
-		stats.total_busy,
-		stats.total_delay,
-		stats.total_sinbin,
-		stats.nr_switches,
-		stats.timestamp);
-		total.total_idle += stats.total_idle;
-		total.total_busy += stats.total_busy;
-		total.total_delay += stats.total_delay;
-		total.total_sinbin += stats.total_sinbin;
-		total.nr_switches += stats.nr_switches;
-	}
-	len += sprintf(page + len, "total %llu %llu %llu %llu %llu\n",
-		total.total_idle,
-		total.total_busy,
-		total.total_delay,
-		total.total_sinbin,
-		total.nr_switches);
-
-	return proc_calc_metrics(page, start, off, count, eof, len);
-}
-
 extern struct seq_operations vmstat_op;
 static int vmstat_open(struct inode *inode, struct file *file)
 {
@@ -660,7 +626,6 @@ void __init proc_misc_init(void)
 		{"cmdline",	cmdline_read_proc},
 		{"locks",	locks_read_proc},
 		{"execdomains",	execdomains_read_proc},
-		{"cpustats",	cpustats_read_proc},
 		{NULL,}
 	};
 	for (p = simple_ones; p->name; p++)
Index: linux-2.6.9-rc2-mm2/include/linux/init_task.h
===================================================================
--- linux-2.6.9-rc2-mm2.orig/include/linux/init_task.h	2004-09-23 09:59:23.544808152 +1000
+++ linux-2.6.9-rc2-mm2/include/linux/init_task.h	2004-09-23 10:00:41.388974040 +1000
@@ -68,19 +68,11 @@ extern struct group_info init_groups;
 {									\
 	.state		= 0,						\
 	.thread_info	= &init_thread_info,				\
-	.rq		= NULL,						\
 	.usage		= ATOMIC_INIT(2),				\
 	.flags		= 0,						\
 	.lock_depth	= -1,						\
 	.prio		= MAX_PRIO-20,					\
 	.static_prio	= MAX_PRIO-20,					\
-	.pre_bonus_priority	= MAX_PRIO-20,					\
-	.eb_shares	= DEFAULT_EB_SHARES,				\
-	.cpu_rate_cap = PROPORTION_ONE,				\
-	.cpu_rate_hard_cap = PROPORTION_ONE,				\
-	.sinbin_timer	= {						\
-		.function = sinbin_release_fn				\
-	},								\
 	.policy		= SCHED_NORMAL,					\
 	.cpus_allowed	= CPU_MASK_ALL,					\
 	.mm		= NULL,						\
Index: linux-2.6.9-rc2-mm2/include/linux/sched.h
===================================================================
--- linux-2.6.9-rc2-mm2.orig/include/linux/sched.h	2004-09-23 09:59:23.562805416 +1000
+++ linux-2.6.9-rc2-mm2/include/linux/sched.h	2004-09-23 10:01:40.272022456 +1000
@@ -360,7 +360,7 @@ extern struct user_struct *find_user(uid
 extern struct user_struct root_user;
 #define INIT_USER (&root_user)
 
-typedef struct runqueue runqueue_t;
+typedef struct prio_array prio_array_t;
 struct backing_dev_info;
 struct reclaim_state;
 
@@ -590,30 +590,6 @@ int set_current_groups(struct group_info
 struct audit_context;		/* See audit.c */
 struct mempolicy;
 
-/*
- * For entitlemnet based scheduling a task's shares will be determined from
- * their "nice"ness
- */
-#define EB_SHARES_PER_NICE 5
-#define DEFAULT_EB_SHARES (20 * EB_SHARES_PER_NICE)
-#define MAX_EB_SHARES (DEFAULT_EB_SHARES * DEFAULT_EB_SHARES)
-/*
- * CPU usage rate is estimated as a proportion of a CPU using fixed denominator
- * rational numbers. The denominator must be less than or equal to 2^32
- */
-#define PROPORTION_OFFSET 24
-#define PROPORTION_ONE (1ULL << PROPORTION_OFFSET)
-static inline unsigned long long proportion_to_ppt(unsigned long long proportion)
-{
-	return (proportion * 1000) >> PROPORTION_OFFSET;
-}
-unsigned long long ppt_to_proportion(unsigned long long ppt);
-int set_cpu_rate_cap(struct task_struct *p, unsigned long long new_cap);
-int set_cpu_rate_hard_cap(struct task_struct *p, unsigned long long new_cap);
-int set_cpu_shares(struct task_struct *p, unsigned int new_shares);
-
-void sinbin_release_fn(unsigned long arg);
-
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	struct thread_info *thread_info;
@@ -625,28 +601,16 @@ struct task_struct {
 
 	int prio, static_prio;
 	struct list_head run_list;
-	runqueue_t *rq;
+	prio_array_t *array;
 
+	unsigned long sleep_avg;
+	long interactive_credit;
 	unsigned long long timestamp;
-
-	unsigned long long sched_timestamp;
-	unsigned long long avg_sleep_per_cycle;
-	unsigned long long avg_delay_per_cycle;
-	unsigned long long avg_cpu_per_cycle;
-	unsigned long interactive_bonus, throughput_bonus;
-	unsigned long long cycle_count, total_sleep, total_cpu, total_delay;
-	unsigned long long sleepiness, cpu_usage_rate;
-	unsigned int pre_bonus_priority;
-	unsigned int eb_shares;
-	unsigned long long intr_wake_ups;
-	unsigned long long cpu_rate_cap;
-	unsigned long long cpu_rate_hard_cap;
-	unsigned long long total_sinbin;
-	struct timer_list sinbin_timer;
+	int activated;
 
 	unsigned long policy;
 	cpumask_t cpus_allowed;
-	unsigned int time_slice;
+	unsigned int time_slice, first_time_slice;
 
 #ifdef CONFIG_SCHEDSTATS
 	struct sched_info sched_info;
@@ -826,50 +790,8 @@ do { if (atomic_dec_and_test(&(tsk)->usa
 #define PF_SWAPOFF	0x00080000	/* I am in swapoff */
 #define PF_LESS_THROTTLE 0x00100000	/* Throttle me less: I clean memory */
 #define PF_SYNCWRITE	0x00200000	/* I am doing a sync write */
-#define PF_UISLEEP	0x00400000	/* Uninterruptible sleep */
-#define PF_SINBINNED	0x00800000	/* I am sinbinned */
-#define PF_UNPRIV_RT	0x01000000	/* I wanted to be RT but had insufficient privilege*/
 #define PF_BORROWED_MM	0x02000000	/* I am a kthread doing use_mm */
 
-/*
- * Scheduling statistics for a task/thread
- */
-struct task_sched_stats {
-	unsigned long long timestamp;
-	unsigned long long cycle_count;
-	unsigned long long total_sleep;
-	unsigned long long total_cpu;
-	unsigned long long total_delay;
-	unsigned long long total_sinbin;
-	unsigned long long intr_wake_ups;
-};
-
-/*
- * Get "up to date" scheduling statistics for the given task
- * This function should be used if reliable scheduling statistitcs are required
- * outside the scheduler itself as the relevant fields in the task structure
- * are not "up to date" NB the possible difference between those in the task
- * structure and the correct values could be quite large for sleeping tasks.
- */
-extern void get_task_sched_stats(const struct task_struct *tsk, struct task_sched_stats *stats);
-
-/*
- * Scheduling statistics for a CPU
- */
-struct cpu_sched_stats {
-	unsigned long long timestamp;
-	unsigned long long total_idle;
-	unsigned long long total_busy;
-	unsigned long long total_delay;
-	unsigned long long total_sinbin;
-	unsigned long long nr_switches;
-};
-
-/*
- * Get scheduling statistics for the nominated CPU
- */
-extern void get_cpu_sched_stats(unsigned int cpu, struct cpu_sched_stats *stats);
-
 #ifdef CONFIG_SMP
 extern int set_cpus_allowed(task_t *p, cpumask_t new_mask);
 #else
@@ -1252,7 +1174,10 @@ static inline unsigned int task_cpu(cons
 	return p->thread_info->cpu;
 }
 
-void set_task_cpu(struct task_struct *p, unsigned int cpu);
+static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+	p->thread_info->cpu = cpu;	/* same field task_cpu() reads; nothing else is updated here */
+}
 
 #else
 
Index: linux-2.6.9-rc2-mm2/include/linux/sysctl.h
===================================================================
--- linux-2.6.9-rc2-mm2.orig/include/linux/sysctl.h	2004-09-23 09:59:23.566804808 +1000
+++ linux-2.6.9-rc2-mm2/include/linux/sysctl.h	2004-09-23 10:00:41.389973888 +1000
@@ -134,7 +134,6 @@ enum
 	KERN_SPARC_SCONS_PWROFF=64, /* int: serial console power-off halt */
 	KERN_HZ_TIMER=65,	/* int: hz timer on or off */
 	KERN_UNKNOWN_NMI_PANIC=66, /* int: unknown nmi panic flag */
-	KERN_CPU_SCHED=67,	/* CPU scheduler stuff */
 };
 
 
Index: linux-2.6.9-rc2-mm2/kernel/sched.c
===================================================================
--- linux-2.6.9-rc2-mm2.orig/kernel/sched.c	2004-09-23 09:59:23.607798576 +1000
+++ linux-2.6.9-rc2-mm2/kernel/sched.c	2004-09-23 10:00:41.396972824 +1000
@@ -16,12 +16,6 @@
  *		by Davide Libenzi, preemptible kernel bits by Robert Love.
  *  2003-09-03	Interactivity tuning by Con Kolivas.
  *  2004-04-02	Scheduler domains code by Nick Piggin
- *  2004-06-03	Single priority array, simplified interactive bonus
- *		mechanism, throughput bonus mechanism, hard and soft
- * 		caps and entitlement based mode by Peter Williams
- *		(Courtesy of Aurema Pty Ltd, www.aurema.com)
- *  2004-08-19 Unprivileged RT mode tasks (based on Con Kolivas's
- * 		SCHED_FIFO scheduler class) by Peter Williams
  */
 
 #include <linux/mm.h>
@@ -52,25 +46,14 @@
 #include <linux/seq_file.h>
 #include <linux/syscalls.h>
 #include <linux/times.h>
-#include <linux/sysctl.h>
-
 #include <asm/tlb.h>
 
 #include <asm/unistd.h>
 
-enum sched_mode_enum {
-	SCHED_MODE_PRIORITY_BASED,
-	SCHED_MODE_ENTITLEMENT_BASED
-};
-
-static enum sched_mode_enum sched_mode = SCHED_MODE_PRIORITY_BASED;
-
-#ifdef CONFIG_SYSCTL
-static const char *sched_mode_names[] = {
-	"pb",		/* SCHED_MODE_PRIORITY_BASED */
-	"eb",		/* SCHED_MODE_ENTITLEMENT_BASED */
-	NULL		/* end of list marker */
-};
+#ifdef CONFIG_NUMA
+#define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
+#else
+#define cpu_to_node_mask(cpu) (cpu_online_map)
 #endif
 
 /*
@@ -92,267 +75,128 @@ static const char *sched_mode_names[] = 
 #define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
 
 /*
- * These are the 'tuning knobs' of the scheduler:
- * Making IDLE_PRIO bigger than 159 would require modification of bitmaps
- */
-#define IDLE_PRIO 159
-#define BGND_PRIO (IDLE_PRIO - 1)
-#define MIN_NORMAL_PRIO (MAX_RT_PRIO + 1)
-#define MAX_TOTAL_BONUS (BGND_PRIO - MAX_PRIO - 1)
-#define MAX_MAX_IA_BONUS ((MAX_TOTAL_BONUS + 1) / 2)
-#define MAX_MAX_TPT_BONUS (MAX_TOTAL_BONUS - MAX_MAX_IA_BONUS)
-#define DEFAULT_MAX_IA_BONUS ((MAX_MAX_IA_BONUS < 7) ? MAX_MAX_IA_BONUS : 7)
-#define DEFAULT_MAX_TPT_BONUS ((DEFAULT_MAX_IA_BONUS - 2) ? : 1)
-static unsigned int max_ia_bonus = DEFAULT_MAX_IA_BONUS;
-static unsigned int initial_ia_bonus = 1;
-static unsigned int max_tpt_bonus = DEFAULT_MAX_TPT_BONUS;
-
-/*
- * Define some mini Kalman filter for estimating various averages, etc.
- * To make it more efficient the denominator of the fixed point rational
- * numbers used to store the averages and the response half life will
- * be chosen so that the fixed point rational number reperesentation
- * of (1 - alpha) * i (where i is an integer) will be i.
- * Some of this is defined in linux/sched.h
- */
-
-/*
- * Fixed denominator rational numbers for use by the CPU scheduler
+ * Some helpers for converting nanosecond timing to jiffy resolution
  */
-#define SCHED_AVG_OFFSET 4
-/*
- * Get the rounded integer value of a scheduling statistic average field
- * i.e. those fields whose names begin with avg_
- */
-#define SCHED_AVG_RND(x) \
-	(((x) + (1 << (SCHED_AVG_OFFSET - 1))) >> (SCHED_AVG_OFFSET))
-#define SCHED_AVG_ALPHA ((1 << SCHED_AVG_OFFSET) - 1)
-#define SCHED_AVG_ONE (1UL << SCHED_AVG_OFFSET)
-#define SCHED_AVG_MUL(a, b) (((a) * (b)) >> SCHED_AVG_OFFSET)
-#define SCHED_AVG_REAL(a) ((a) << SCHED_AVG_OFFSET)
-
-/*
- * Convert nice to shares
- * Proportional symmetry is aimed for: i.e.
- * (nice_to_shares(0) / nice_to_shares(19)) == (nice_to_shares(-20) / nice_to_shares(0))
- * Make sure that this function is robust for variations of EB_SHARES_PER_NICE
- */
-static inline unsigned int nice_to_shares(int nice)
-{
-	unsigned int result = DEFAULT_EB_SHARES;
-
-	if (nice > 0)
-		result -= (nice * (20 * EB_SHARES_PER_NICE - 1)) / 19;
-	else if (nice < 0)
-		result += (nice * nice * ((20 * EB_SHARES_PER_NICE - 1) * EB_SHARES_PER_NICE)) / 20;
-
-	return result;
-}
-
-static inline int shares_to_nice(unsigned int shares)
-{
-	int result = 0;
-
-	if (shares > DEFAULT_EB_SHARES)
-		result = -int_sqrt((20 * (shares - DEFAULT_EB_SHARES)) /
-			(EB_SHARES_PER_NICE * (20 * EB_SHARES_PER_NICE - 1)));
-	else if (shares < DEFAULT_EB_SHARES)
-		result = (19 * (DEFAULT_EB_SHARES - shares)) /
-			 (20 * EB_SHARES_PER_NICE - 1);
-
-	return result;
-}
+#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
+#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
 
-#define SCHED_IA_BONUS_OFFSET 8
-#define SCHED_IA_BONUS_ALPHA ((1 << SCHED_IA_BONUS_OFFSET) - 1)
-#define SCHED_IA_BONUS_MUL(a, b) (((a) * (b)) >> SCHED_IA_BONUS_OFFSET)
 /*
- * Get the rounded integer value of the interactive bonus
- */
-#define SCHED_IA_BONUS_RND(x) \
-	(((x) + (1 << (SCHED_IA_BONUS_OFFSET - 1))) >> (SCHED_IA_BONUS_OFFSET))
-
-static inline void apply_sched_avg_decay(unsigned long long *valp)
-{
-	*valp *= SCHED_AVG_ALPHA;
-	*valp >>= SCHED_AVG_OFFSET;
-}
-
-static inline unsigned long long sched_div_64(unsigned long long a, unsigned long long b)
-{
-#if BITS_PER_LONG < 64
-	/*
-	 * Assume that there's no 64 bit divide available
-	 */
-	if (a < b)
-		return 0;
-	/*
-	 * Scale down until b less than 32 bits so that we can do
-	 * a divide using do_div()
-	 */
-	while (b > ULONG_MAX) { a >>= 1; b >>= 1; }
-
-	(void)do_div(a, (unsigned long)b);
+ * These are the 'tuning knobs' of the scheduler:
+ *
+ * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
+ * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
+ * Timeslices get refilled after they expire.
+ */
+#define MIN_TIMESLICE		max(5 * HZ / 1000, 1)
+#define DEF_TIMESLICE		(100 * HZ / 1000)
+#define ON_RUNQUEUE_WEIGHT	 30
+#define CHILD_PENALTY		 95
+#define PARENT_PENALTY		100
+#define EXIT_WEIGHT		  3
+#define PRIO_BONUS_RATIO	 25
+#define MAX_BONUS		(MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
+#define INTERACTIVE_DELTA	  2
+#define MAX_SLEEP_AVG		(DEF_TIMESLICE * MAX_BONUS)
+#define STARVATION_LIMIT	(MAX_SLEEP_AVG)
+#define NS_MAX_SLEEP_AVG	(JIFFIES_TO_NS(MAX_SLEEP_AVG))
+#define CREDIT_LIMIT		100
+
+/*
+ * If a task is 'interactive' then we reinsert it in the active
+ * array after it has expired its current timeslice. (it will not
+ * continue to run immediately, it will still roundrobin with
+ * other interactive tasks.)
+ *
+ * This part scales the interactivity limit depending on niceness.
+ *
+ * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
+ * Here are a few examples of different nice levels:
+ *
+ *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
+ *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
+ *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
+ *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
+ *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
+ *
+ * (the X axis represents the possible -5 ... 0 ... +5 dynamic
+ *  priority range a task can explore, a value of '1' means the
+ *  task is rated interactive.)
+ *
+ * Ie. nice +19 tasks can never get 'interactive' enough to be
+ * reinserted into the active array. And only heavily CPU-hog nice -20
+ * tasks will be expired. Default nice 0 tasks are somewhere between,
+ * it takes some effort for them to get interactive, but it's not
+ * too hard.
+ */
+
+#define CURRENT_BONUS(p) \
+	(NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
+		MAX_SLEEP_AVG)
 
-	return a;
+#ifdef CONFIG_SMP
+#define TIMESLICE_GRANULARITY(p)	(MIN_TIMESLICE * \
+		(1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
+			num_online_cpus())
 #else
-	return a / b;
+#define TIMESLICE_GRANULARITY(p)	(MIN_TIMESLICE * \
+		(1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
 #endif
-}
 
-#define PROPORTION_OFFSET 24
-#if PROPORTION_OFFSET > 32
-#error "PROPORTION_OFFSET must be less than or equal to 32"
-#endif
-#define PROPORTION_OVERFLOW ((1ULL << (64 - PROPORTION_OFFSET)) - 1)
-#define PROP_FM_PPT(a) (((unsigned long long)(a) * PROPORTION_ONE) / 1000)
-unsigned long long ppt_to_proportion(unsigned long long ppt)
-{
-	return sched_div_64(ppt * PROPORTION_ONE, 1000);
-}
-/*
- * Convert a / b to a proportion in the range 0 to PROPORTION_ONE
- * Requires a <= b or may get a divide by zero exception
- */
-static inline unsigned long long calc_proportion(unsigned long long a, unsigned long long b)
-{
-	if (unlikely(a == b))
-		return PROPORTION_ONE;
+#define SCALE(v1,v1_max,v2_max) \
+	((v1) * (v2_max) / (v1_max))	/* v1 * v2_max / v1_max; outer parens keep it safe inside larger expressions */
 
-	while (a > PROPORTION_OVERFLOW) { a >>= 1; b >>= 1; }
+#define DELTA(p) \
+	(SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)
 
-	return sched_div_64(a << PROPORTION_OFFSET, b);
-}
+#define TASK_INTERACTIVE(p) \
+	((p)->prio <= (p)->static_prio - DELTA(p))
 
-/*
- * Map the given proportion to an unsigned long long in the specified range
- * Requires range < PROPORTION_ONE to avoid overflow
- */
-static inline unsigned long long map_proportion(unsigned long long prop, unsigned long long range)
-{
-	return (prop * range) >> PROPORTION_OFFSET;
-}
-
-static inline unsigned long long map_proportion_rnd(unsigned long long prop, unsigned long long range)
-{
-	return map_proportion((prop >> 1), (range * 2 + 1));
-}
-
-/*
- * Find the square root of a proportion
- * Require: x <= PROPORTION_ONE
- */
-static unsigned long long proportion_sqrt(unsigned long long x)
-{
-	unsigned long long res, b;
-	int bshift;
+#define INTERACTIVE_SLEEP(p) \
+	(JIFFIES_TO_NS(MAX_SLEEP_AVG * \
+		(MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
 
-	/*
-	 * Take shortcut AND prevent overflow
-	 */
-	if (x == PROPORTION_ONE)
-		return PROPORTION_ONE;
+#define HIGH_CREDIT(p) \
+	((p)->interactive_credit > CREDIT_LIMIT)
 
-	res = 0;
-	b = (1UL << (PROPORTION_OFFSET - 1));
-	bshift = PROPORTION_OFFSET - 1;
-	x <<= PROPORTION_OFFSET;
+#define LOW_CREDIT(p) \
+	((p)->interactive_credit < -CREDIT_LIMIT)
 
-	for (; x && b; b >>= 1, bshift--) {
-		unsigned long long temp = (((res << 1) + b) << bshift);
-
-		if (x >= temp) {
-			res += b;
-                        x -= temp;
-		}
-        }
-
-	return res;
-}
-
-/*
- * Tasks that have a CPU usage rate greater than this threshold (in parts per
- * thousand) are considered to be CPU bound and start to lose interactive bonus
- * points
- */
-#define DEFAULT_CPU_HOG_THRESHOLD 900
-static unsigned int cpu_hog_threshold_ppt = DEFAULT_CPU_HOG_THRESHOLD;
-static unsigned long long cpu_hog_threshold = PROP_FM_PPT(DEFAULT_CPU_HOG_THRESHOLD);
-
-/*
- * Tasks that would sleep for more than 900 parts per thousand of the time if
- * they had the CPU to themselves are considered to be interactive provided
- * that their average sleep duration per scheduling cycle isn't too long
- */
-#define DEFAULT_IA_THRESHOLD 900
-static unsigned int ia_threshold_ppt = DEFAULT_IA_THRESHOLD;
-static unsigned long long ia_threshold = PROP_FM_PPT(DEFAULT_IA_THRESHOLD);
-#define LOWER_MAX_IA_SLEEP SCHED_AVG_REAL(15 * 60LL * NSEC_PER_SEC)
-#define UPPER_MAX_IA_SLEEP SCHED_AVG_REAL(2 * 60 * 60LL * NSEC_PER_SEC)
+#define TASK_PREEMPTS_CURR(p, rq) \
+	((p)->prio < (rq)->curr->prio)
 
 /*
- * UNPRIV_RT tasks that have a CPU usage rate less than this threshold
- * (in parts per thousand) are treated as psuedo RT tasks
- */
-#define DEFAULT_UNPRIV_RT_THRESHOLD 10
-static unsigned int unpriv_rt_threshold_ppt = DEFAULT_UNPRIV_RT_THRESHOLD;
-static unsigned long long unpriv_rt_threshold = PROP_FM_PPT(DEFAULT_UNPRIV_RT_THRESHOLD);
-
-/*
- * What "base time slice" for nice 0 and  "average time slice" evaluated to
+ * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
+ * to time slice values: [800ms ... 100ms ... 5ms]
+ *
+ * The higher a thread's priority, the bigger timeslices
+ * it gets during one round of execution. But even the lowest
+ * priority thread gets MIN_TIMESLICE worth of execution time.
  */
-#define MSECS_TO_JIFFIES(x) (((x) * (HZ * 2 + 1)) / 2000)
-#define MSECS_TO_JIFFIES_MIN_1(x) (MSECS_TO_JIFFIES(x) ? MSECS_TO_JIFFIES(x) : 1)
-#define DEFAULT_TIME_SLICE_MSECS 100
-#define MAX_TIME_SLICE_MSECS 1000
-#define DEFAULT_TIME_SLICE_TICKS MSECS_TO_JIFFIES_MIN_1(DEFAULT_TIME_SLICE_MSECS)
 
-static unsigned int time_slice_ticks = DEFAULT_TIME_SLICE_TICKS;
-static unsigned int sched_rr_time_slice_ticks = DEFAULT_TIME_SLICE_TICKS;
-static unsigned int bgnd_time_slice_multiplier = 1;
+#define SCALE_PRIO(x, prio) \
+	max((x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO/2), MIN_TIMESLICE) /* args parenthesized; result never below MIN_TIMESLICE */
 
-static inline int is_bgnd_task(const task_t *p)
+static unsigned int task_timeslice(task_t *p)
 {
-	return p->cpu_rate_cap == 0;
-}
-
-static inline unsigned int task_timeslice(const task_t *p)
-{
-	if (unlikely(p->policy == SCHED_RR))
-		return sched_rr_time_slice_ticks;
-
-	if (unlikely(is_bgnd_task(p) && !(p->flags & PF_UISLEEP)))
-		return time_slice_ticks * bgnd_time_slice_multiplier;
-
-	return time_slice_ticks;
+	if (p->static_prio < NICE_TO_PRIO(0))
+		return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
+	else
+		return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
 }
-
-#define task_hot(p, sd) ((p)->rq->timestamp_last_tick - (p)->timestamp < (sd)->cache_hot_time)
+#define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time)
 
 /*
  * These are the runqueue data structures:
  */
-#define NUM_PRIO_SLOTS (IDLE_PRIO + 1)
 
-/*
- * Is the run queue idle?
- */
-#define RUNQUEUE_IDLE(rq) ((rq)->curr == (rq)->idle)
+#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
 
-/*
- * Control values for niceness
- */
-#define PROSPECTIVE_BASE_PROM_INTERVAL_MSECS ((DEFAULT_TIME_SLICE_MSECS * 110) / 100)
-#if (PROSPECTIVE_BASE_PROM_INTERVAL_MSECS > 0)
-#define BASE_PROM_INTERVAL_MSECS PROSPECTIVE_BASE_PROM_INTERVAL_MSECS
-#else
-#define BASE_PROM_INTERVAL_MSECS DEFAULT_TIME_SLICE_MSECS
-#endif
-static unsigned int base_prom_interval_ticks = MSECS_TO_JIFFIES_MIN_1(BASE_PROM_INTERVAL_MSECS);
+typedef struct runqueue runqueue_t;
 
-struct prio_slot {
-	unsigned int prio;
-	struct list_head queue;
+struct prio_array {
+	unsigned int nr_active;			/* presumably the number of tasks queued in this array -- confirm against users */
+	unsigned long bitmap[BITMAP_SIZE];	/* BITMAP_SIZE longs, i.e. room for at least MAX_PRIO+1 bits */
+	struct list_head queue[MAX_PRIO];	/* one list head per priority index 0 .. MAX_PRIO-1 */
 };
 
 /*
@@ -373,23 +217,15 @@ struct runqueue {
 #ifdef CONFIG_SMP
 	unsigned long cpu_load;
 #endif
-	unsigned long avg_nr_running;
 	unsigned long long nr_switches;
-	unsigned long nr_uninterruptible;
+	unsigned long expired_timestamp, nr_uninterruptible;
 	unsigned long long timestamp_last_tick;
-	unsigned long long total_delay;
-	unsigned long long total_sinbin;
 	task_t *curr, *idle;
 	struct mm_struct *prev_mm;
-	DECLARE_BITMAP(bitmap, NUM_PRIO_SLOTS);
-	struct prio_slot queues[NUM_PRIO_SLOTS];
-	unsigned long next_prom_due;
-	unsigned long pcount;
+	prio_array_t *active, *expired, arrays[2];
+	int best_expired_prio;
 	atomic_t nr_iowait;
 
-	unsigned long long eb_yardstick;
-	unsigned long long eb_ticks_to_decay;
-
 #ifdef CONFIG_SMP
 	struct sched_domain *sd;
 
@@ -451,133 +287,41 @@ static DEFINE_PER_CPU(struct runqueue, r
 
 #define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
 #define this_rq()		(&__get_cpu_var(runqueues))
+#define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
-#define is_idle_task(p) ((p) == (p)->rq->idle)
-
-#ifdef CONFIG_SMP
-void set_task_cpu(struct task_struct *p, unsigned int cpu)
-{
-	BUG_ON(!list_empty(&p->run_list));
-
-	p->thread_info->cpu = cpu;
-	p->rq = cpu_rq(cpu);
-}
-
-/*
- * "p"'s runqueue and "oldrq" are locked when this is called
- */
-static inline void adjust_timestamp(task_t *p, const runqueue_t *oldrq)
-{
-	p->timestamp += (p->rq->timestamp_last_tick - oldrq->timestamp_last_tick);
-}
-
-/*
- * adjust_sched_timestamp() is always called with p's runqueue locked but sometimes
- * "oldrq" isn't locked and isn't "this_rq()" (e.g. in try_to_wake_up())
- * leading to possible (very rare) problems on systems where 64 bit reads are
- * not atomic.
- *
- * We'll handle this problem by reading their "timestamp_last_tick"s until we
- * get two the same.
- */
-static inline void adjust_sched_timestamp(task_t *p, const runqueue_t *oldrq)
-{
-	unsigned long long oldrq_tlt = oldrq->timestamp_last_tick;
-
-	if (oldrq != this_rq())
-		while (unlikely(oldrq_tlt != oldrq->timestamp_last_tick))
-			oldrq_tlt = oldrq->timestamp_last_tick;
-
-	p->sched_timestamp += p->rq->timestamp_last_tick - oldrq_tlt;
-}
-
-/*
- * for use when the task may be on another CPU (to compensate for drift)
- *
- * This is only ever called when "p"'s runqueue is locked.
- * Even though "this_rq()" may not be locked this should be safe as
- * "timestamp_last_tick" is only ever changed by tasks running on the same CPU
- * and so it won't be being changed while we read it.
- */
-static inline unsigned long long adjusted_sched_clock(const task_t *p)
-{
-	runqueue_t *trq = this_rq();
-
-	return sched_clock() + (p->rq->timestamp_last_tick - trq->timestamp_last_tick);
-}
-
-#else
-#define adjust_timestamp(p, oldrq)
-#define adjust_sched_timestamp(p, oldrq)
-#define adjusted_sched_clock(p) sched_clock()
-#endif
-
 /*
  * Default context-switch locking:
  */
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(rq, next)	do { } while (0)
 # define finish_arch_switch(rq, next)	spin_unlock_irq(&(rq)->lock)
-# define task_is_running(p)		((p)->rq->curr == (p))
-#else
-# define task_is_running(p) task_running((p)->rq, p)
+# define task_running(rq, p)		((rq)->curr == (p))
 #endif
-#define task_is_exiting(p) (unlikely(((p)->flags & PF_EXITING) != 0))
-#define task_is_sinbinned(p) (unlikely(((p)->flags & PF_SINBINNED) != 0))
-#define task_is_unpriv_rt(p) (unlikely(((p)->flags & PF_UNPRIV_RT) != 0))
-
-static inline void restart_promotions(struct runqueue *rq)
-{
-	rq->next_prom_due = jiffies + base_prom_interval_ticks;
-	rq->pcount = 1;
-}
-
-/* make it (relatively) easy to switch to using a timer */
-static inline void stop_promotions(struct runqueue *rq)
-{
-}
-
-static inline void decay_eb_yardstick(runqueue_t *rq)
-{
-	static const unsigned long long decay_per_interval = PROP_FM_PPT(990);
-	unsigned long long pny; /* potential new yardstick */
-
-	rq->eb_yardstick = map_proportion(decay_per_interval, rq->eb_yardstick);
-	rq->eb_ticks_to_decay = time_slice_ticks;
-	if (unlikely(rt_task(rq->curr) || is_bgnd_task(rq->curr)))
-		return;
-	if (rq->curr->cpu_usage_rate < rq->curr->cpu_rate_cap)
-		pny = sched_div_64(rq->curr->cpu_usage_rate, rq->curr->eb_shares);
-	else
-		pny = sched_div_64(rq->curr->cpu_rate_cap, rq->curr->eb_shares);
-	if (pny > rq->eb_yardstick)
-		rq->eb_yardstick = pny;
-}
 
 /*
  * task_rq_lock - lock the runqueue a given task resides on and disable
  * interrupts.  Note the ordering: we can safely lookup the task_rq without
  * explicitly disabling preemption.
  */
-static spinlock_t *task_rq_lock(const task_t *p, unsigned long *flags)
+static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
 {
-	spinlock_t *rql;
+	struct runqueue *rq;
 
 repeat_lock_task:
 	local_irq_save(*flags);
-	rql = &p->rq->lock;
-	spin_lock(rql);
-	if (unlikely(rql != &p->rq->lock)) {
-		spin_unlock_irqrestore(rql, *flags);
+	rq = task_rq(p);
+	spin_lock(&rq->lock);
+	if (unlikely(rq != task_rq(p))) {
+		spin_unlock_irqrestore(&rq->lock, *flags);
 		goto repeat_lock_task;
 	}
-	return rql;
+	return rq;
 }
 
-static inline void task_rq_unlock(spinlock_t *rql, unsigned long *flags)
+static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
 {
-	spin_unlock_irqrestore(rql, *flags);
+	spin_unlock_irqrestore(&rq->lock, *flags);
 }
 
 #ifdef CONFIG_SCHEDSTATS
@@ -682,7 +426,7 @@ struct file_operations proc_schedstat_op
 /*
  * rq_lock - lock a given runqueue and disable interrupts.
  */
-static spinlock_t *this_rq_lock(void)
+static runqueue_t *this_rq_lock(void)
 {
 	runqueue_t *rq;
 
@@ -690,7 +434,12 @@ static spinlock_t *this_rq_lock(void)
 	rq = this_rq();
 	spin_lock(&rq->lock);
 
-	return &rq->lock;
+	return rq;
+}
+
+static inline void rq_unlock(runqueue_t *rq)
+{
+	spin_unlock_irq(&rq->lock);
 }
 
 #ifdef CONFIG_SCHEDSTATS
@@ -722,6 +471,7 @@ static inline void sched_info_dequeued(t
 static inline void sched_info_arrive(task_t *t)
 {
 	unsigned long now = jiffies, diff = 0;
+	struct runqueue *rq = task_rq(t);
 
 	if (t->sched_info.last_queued)
 		diff = now - t->sched_info.last_queued;
@@ -730,11 +480,11 @@ static inline void sched_info_arrive(tas
 	t->sched_info.last_arrival = now;
 	t->sched_info.pcnt++;
 
-	if (!t->rq)
+	if (!rq)
 		return;
 
-	t->rq->rq_sched_info.run_delay += diff;
-	t->rq->rq_sched_info.pcnt++;
+	rq->rq_sched_info.run_delay += diff;
+	rq->rq_sched_info.pcnt++;
 }
 
 /*
@@ -764,12 +514,13 @@ static inline void sched_info_queued(tas
  */
 static inline void sched_info_depart(task_t *t)
 {
+	struct runqueue *rq = task_rq(t);
 	unsigned long diff = jiffies - t->sched_info.last_arrival;
 
 	t->sched_info.cpu_time += diff;
 
-	if (t->rq)
-		t->rq->rq_sched_info.cpu_time += diff;
+	if (rq)
+		rq->rq_sched_info.cpu_time += diff;
 }
 
 /*
@@ -779,7 +530,7 @@ static inline void sched_info_depart(tas
  */
 static inline void sched_info_switch(task_t *prev, task_t *next)
 {
-	struct runqueue *rq = prev->rq;
+	struct runqueue *rq = task_rq(prev);
 
 	/*
 	 * prev now departs the cpu.  It's not interesting to record
@@ -797,43 +548,24 @@ static inline void sched_info_switch(tas
 #define sched_info_switch(t, next)	do { } while (0)
 #endif /* CONFIG_SCHEDSTATS */
 
-static inline int task_preempts_curr(const struct task_struct *p)
-{
-	return (p->prio < p->rq->curr->prio) && !task_is_exiting(p->rq->curr);
-}
-
-static inline int task_queued(const task_t *task)
-{
-	return !list_empty(&task->run_list);
-}
-
 /*
- * Adding/removing a task to/from a runqueue:
+ * Adding/removing a task to/from a priority array:
  */
-static void dequeue_task(struct task_struct *p)
+static void dequeue_task(struct task_struct *p, prio_array_t *array)
 {
-	/*
-	 * If p is the last task in this priority slot then slotp will be
-	 * a pointer to the head of the list in the sunqueue structure
-	 * NB we can't use p->prio for bitmap as task may have been
-	 * promoted
-	 */
-	struct list_head *slotp = p->run_list.next;
-
-	/*
-	 * Initialize after removal from the list so that list_empty() works
-	 * as a means for testing whether the task is runnable
-	 */
-	list_del_init(&p->run_list);
-	if (list_empty(slotp))
-		__clear_bit(list_entry(slotp, struct prio_slot, queue)->prio, p->rq->bitmap);
+	array->nr_active--;
+	list_del(&p->run_list);
+	if (list_empty(array->queue + p->prio))
+		__clear_bit(p->prio, array->bitmap);
 }
 
-static void enqueue_task(struct task_struct *p)
+static void enqueue_task(struct task_struct *p, prio_array_t *array)
 {
 	sched_info_queued(p);
-	list_add_tail(&p->run_list, &p->rq->queues[p->prio].queue);
-	__set_bit(p->prio, p->rq->bitmap);
+	list_add_tail(&p->run_list, array->queue + p->prio);
+	__set_bit(p->prio, array->bitmap);
+	array->nr_active++;
+	p->array = array;
 }
 
 /*
@@ -841,447 +573,196 @@ static void enqueue_task(struct task_str
  * remote queue so we want these tasks to show up at the head of the
  * local queue:
  */
-static inline void enqueue_task_head(struct task_struct *p)
+static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
 {
-	list_add(&p->run_list, &p->rq->queues[p->prio].queue);
-	__set_bit(p->prio, p->rq->bitmap);
+	list_add(&p->run_list, array->queue + p->prio);
+	__set_bit(p->prio, array->bitmap);
+	array->nr_active++;
+	p->array = array;
 }
 
 /*
  * effective_prio - return the priority that is based on the static
  * priority but is modified by bonuses/penalties.
+ *
+ * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
+ * into the -5 ... 0 ... +5 bonus/penalty range.
+ *
+ * We use 25% of the full 0...39 priority range so that:
+ *
+ * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
+ * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
+ *
+ * Both properties are important to certain workloads.
  */
-static inline int effective_prio(const task_t *p)
+static int effective_prio(task_t *p)
 {
-	unsigned int bonus_factor = 0;
+	int bonus, prio;
 
 	if (rt_task(p))
 		return p->prio;
 
-	if (unlikely(is_bgnd_task(p) && !(p->flags & PF_UISLEEP)))
-		return BGND_PRIO;
-
-	if (task_is_unpriv_rt(p) && (p->cpu_usage_rate < unpriv_rt_threshold))
-		return MAX_RT_PRIO;
-
-	/*
-	 * kernel threads get maximum bonuses and tasks that are
-	 * over their cap get no bonuses
-	 */
-	if (p->mm == NULL)
-		bonus_factor = MAX_TOTAL_BONUS;
-	else if (p->cpu_usage_rate < p->cpu_rate_cap) {
-		bonus_factor = SCHED_IA_BONUS_RND(p->interactive_bonus);
-		bonus_factor += p->throughput_bonus;
-	}
+	bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
 
-	return p->pre_bonus_priority - bonus_factor;
+	prio = p->static_prio - bonus;
+	if (prio < MAX_RT_PRIO)
+		prio = MAX_RT_PRIO;
+	if (prio > MAX_PRIO-1)
+		prio = MAX_PRIO-1;
+	return prio;
 }
 
 /*
  * __activate_task - move a task to the runqueue.
  */
-static inline void __activate_task(task_t *p)
+static inline void __activate_task(task_t *p, runqueue_t *rq)
 {
-	enqueue_task(p);
-	p->rq->nr_running++;
-	if (p->rq->nr_running == 2)
-		restart_promotions(p->rq);
+	enqueue_task(p, rq->active);
+	rq->nr_running++;
 }
 
 /*
- * activate task on the _front_ of runqueue.
+ * __activate_idle_task - move idle task to the _front_ of runqueue.
  */
-static inline void __activate_task_head(task_t *p)
+static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 {
-	enqueue_task_head(p);
-	p->rq->nr_running++;
-	if (p->rq->nr_running == 2)
-		restart_promotions(p->rq);
+	enqueue_task_head(p, rq->active);
+	rq->nr_running++;
 }
 
-/*
- * Calculate CPU usage rate and sleepiness.
- * This never gets called on real time tasks
- */
-static void decay_avgs_and_calculate_rates(task_t *p)
-{
-	unsigned long long bl;
-
-	apply_sched_avg_decay(&p->avg_sleep_per_cycle);
-	apply_sched_avg_decay(&p->avg_delay_per_cycle);
-	apply_sched_avg_decay(&p->avg_cpu_per_cycle);
-	bl  = p->avg_sleep_per_cycle + p->avg_cpu_per_cycle;
-	/*
-	 * Take a shortcut and avoid possible divide by zero later
-	 */
-	if (unlikely(bl == 0)) {
-		p->sleepiness = PROPORTION_ONE;
-		p->cpu_usage_rate = 0;
-	} else {
-		p->sleepiness = calc_proportion(p->avg_sleep_per_cycle, bl);
-		bl += p->avg_delay_per_cycle;
-		p->cpu_usage_rate = calc_proportion(p->avg_cpu_per_cycle, bl);
-	}
-}
-
-/*
- * Calculate priority based priority (without bonuses).
- * This never gets called on real time tasks
- */
-static inline void calculate_pb_pre_bonus_priority(task_t *p)
-{
-	if (unlikely(p->cpu_usage_rate > p->cpu_rate_cap)) {
-		p->pre_bonus_priority = BGND_PRIO - 1;
-		if (p->cpu_rate_cap != 0) {
-			unsigned long long prop = PROPORTION_ONE;
-
-			prop -= calc_proportion(p->cpu_rate_cap, p->cpu_usage_rate);
-			p->pre_bonus_priority -= map_proportion(prop, MAX_PRIO - p->static_prio);
-		}
-	} else
-		p->pre_bonus_priority = p->static_prio + MAX_TOTAL_BONUS;
-}
-
-/*
- * Calculate entitlement based priority (without bonuses).
- * This never gets called on real time tasks
- */
-#define EB_PAR ((MAX_PRIO - MAX_RT_PRIO - 1) / 2)
-static void calculate_eb_pre_bonus_priority(task_t *p)
+static void recalc_task_prio(task_t *p, unsigned long long now)
 {
-	/*
-	 * Prevent possible divide by zero and take shortcut
-	 */
-	if (unlikely(p->cpu_rate_cap == 0)) {
-		p->pre_bonus_priority = BGND_PRIO - 1;
-	} else if (p->cpu_usage_rate > p->cpu_rate_cap) {
-		unsigned long long cap_per_share = sched_div_64(p->cpu_rate_cap, p->eb_shares);
-		unsigned long long prop = calc_proportion(p->cpu_rate_cap, p->cpu_usage_rate);
-
-		p->pre_bonus_priority = (BGND_PRIO - 1);
-		p->pre_bonus_priority -= map_proportion_rnd(prop, EB_PAR + 1);
-		if (cap_per_share > p->rq->eb_yardstick)
-			p->rq->eb_yardstick = cap_per_share;
-	} else {
-		unsigned long long usage_per_share = sched_div_64(p->cpu_usage_rate, p->eb_shares);
-
-		if (usage_per_share > p->rq->eb_yardstick) {
-			p->rq->eb_yardstick = usage_per_share;
-			p->pre_bonus_priority = MAX_RT_PRIO + MAX_TOTAL_BONUS + EB_PAR;
-		} else {
-			unsigned long long prop;
+	unsigned long long __sleep_time = now - p->timestamp;
+	unsigned long sleep_time;
 
-			prop = calc_proportion(usage_per_share, p->rq->eb_yardstick);
-			p->pre_bonus_priority = MAX_RT_PRIO + MAX_TOTAL_BONUS;
-			p->pre_bonus_priority += map_proportion_rnd(prop, EB_PAR);
-		}
-	}
-}
-
-static inline void calculate_pre_bonus_priority(task_t *p)
-{
-	if (sched_mode == SCHED_MODE_ENTITLEMENT_BASED)
-		calculate_eb_pre_bonus_priority(p);
+	if (__sleep_time > NS_MAX_SLEEP_AVG)
+		sleep_time = NS_MAX_SLEEP_AVG;
 	else
-		calculate_pb_pre_bonus_priority(p);
-}
-
-/*
- * Initialize the scheduling statistics counters
- */
-static inline void initialize_stats(task_t *p)
-{
-	p->avg_sleep_per_cycle = 0;
-	p->avg_delay_per_cycle = 0;
-	p->avg_cpu_per_cycle = 0;
-	p->total_sleep = 0;
-	p->total_delay = 0;
-	p->total_cpu = 0;
-	p->total_sinbin = 0;
-	p->cycle_count = 0;
-	p->intr_wake_ups = 0;
-	p->sched_timestamp = sched_clock();
-}
-
-/*
- * sched_clock() is not necessarily monotonic and this can lead to negative
- * values when very small time intervals are measured using successive calls
- * to sched_clock().  The "delay" statistic is the most vulnerable to this BUT
- * we'll take precautions for all interval measurements.  Where a time interval
- * would be negative we'll treat it as zero and NOT update the timestamp either
- * as this would lead to the next interval measured being to big.
- */
-static inline void delta_sleep_stats(task_t *p, unsigned long long now)
-{
-	unsigned long long delta;
-
-	/* sched_clock() is not guaranteed monotonic */
-	if (now <= p->sched_timestamp) {
-		p->sched_timestamp = now;
-		return;
-	}
-
-	delta = now - p->sched_timestamp;
-	p->sched_timestamp = now;
-	p->avg_sleep_per_cycle += delta;
-	p->total_sleep += delta;
-}
+		sleep_time = (unsigned long)__sleep_time;
 
-static inline void delta_cpu_stats(task_t *p, unsigned long long now)
-{
-	unsigned long long delta;
+	if (likely(sleep_time > 0)) {
+		/*
+		 * User tasks that sleep a long time are categorised as
+		 * idle and get just enough sleep_avg to count as interactive,
+		 * which keeps them active but prevents them from suddenly
+		 * becoming cpu hogs and starving other processes.
+		 */
+		if (p->mm && p->activated != -1 &&
+			sleep_time > INTERACTIVE_SLEEP(p)) {
+				p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
+						DEF_TIMESLICE);
+				if (!HIGH_CREDIT(p))
+					p->interactive_credit++;
+		} else {
+			/*
+			 * The lower the sleep avg a task has the more
+			 * rapidly it will rise with sleep time.
+			 */
+			sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
 
-	/* sched_clock() is not guaranteed monotonic */
-	if (now <= p->sched_timestamp) {
-		p->sched_timestamp = now;
-		return;
-	}
+			/*
+			 * Tasks with low interactive_credit are limited to
+			 * one timeslice worth of sleep avg bonus.
+			 */
+			if (LOW_CREDIT(p) &&
+			    sleep_time > JIFFIES_TO_NS(task_timeslice(p)))
+				sleep_time = JIFFIES_TO_NS(task_timeslice(p));
 
-	delta = now - p->sched_timestamp;
-	p->sched_timestamp = now;
-	p->avg_cpu_per_cycle += delta;
-	p->total_cpu += delta;
-}
+			/*
+			 * Non high_credit tasks waking from uninterruptible
+			 * sleep are limited in their sleep_avg rise as they
+			 * are likely to be cpu hogs waiting on I/O
+			 */
+			if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm) {
+				if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
+					sleep_time = 0;
+				else if (p->sleep_avg + sleep_time >=
+						INTERACTIVE_SLEEP(p)) {
+					p->sleep_avg = INTERACTIVE_SLEEP(p);
+					sleep_time = 0;
+				}
+			}
 
-static inline void delta_delay_stats(task_t *p, unsigned long long now)
-{
-	unsigned long long delta;
+			/*
+			 * This code gives a bonus to interactive tasks.
+			 *
+			 * The boost works by updating the 'average sleep time'
+			 * value here, based on ->timestamp. The more time a
+			 * task spends sleeping, the higher the average gets -
+			 * and the higher the priority boost gets as well.
+			 */
+			p->sleep_avg += sleep_time;
 
-	/* sched_clock() is not guaranteed monotonic */
-	if (now <= p->sched_timestamp) {
-		p->sched_timestamp = now;
-		return;
+			if (p->sleep_avg > NS_MAX_SLEEP_AVG) {
+				p->sleep_avg = NS_MAX_SLEEP_AVG;
+				if (!HIGH_CREDIT(p))
+					p->interactive_credit++;
+			}
+		}
 	}
 
-	delta = now - p->sched_timestamp;
-	p->sched_timestamp = now;
-	p->avg_delay_per_cycle += delta;
-	p->total_delay += delta;
-	p->rq->total_delay += delta;
-	if (task_is_sinbinned(p)) {
-		p->total_sinbin += delta;
-		p->rq->total_sinbin += delta;
-	}
+	p->prio = effective_prio(p);
 }
 
 /*
- * Update various statistics for the end of a
- * ((on_run_queue :-> on_cpu)* :-> sleep) cycle.
- * We can't just do this in activate_task() as every invocation of that
- * function is not the genuine end of a cycle.
+ * activate_task - move a task to the runqueue and do priority recalculation
+ *
+ * Update all the scheduling statistics stuff. (sleep average
+ * calculation, priority modifiers, etc.)
  */
-static void update_stats_for_cycle(task_t *p)
+static void activate_task(task_t *p, runqueue_t *rq, int local)
 {
-	unsigned long long now = adjusted_sched_clock(p);
-
-	delta_sleep_stats(p, now);
-	if (in_interrupt())
-		p->intr_wake_ups++;
-	p->cycle_count++;
-	if (!rt_task(p))
-		decay_avgs_and_calculate_rates(p);
-}
-
-static inline void decay_sched_ia_bonus(struct task_struct *p)
-{
-	p->interactive_bonus *= SCHED_IA_BONUS_ALPHA;
-	p->interactive_bonus >>= SCHED_IA_BONUS_OFFSET;
-}
+	unsigned long long now;
 
-/*
- * Check whether a task with an interactive bonus still qualifies and if not
- * decrease its bonus
- * This never gets called on real time tasks
- */
-static void reassess_cpu_boundness(task_t *p)
-{
-	if (max_ia_bonus == 0) {
-		p->interactive_bonus = 0;
-		return;
+	now = sched_clock();
+#ifdef CONFIG_SMP
+	if (!local) {
+		/* Compensate for drifting sched_clock */
+		runqueue_t *this_rq = this_rq();
+		now = (now - this_rq->timestamp_last_tick)
+			+ rq->timestamp_last_tick;
 	}
-	/*
-	 * No point going any further if there's no bonus to lose
-	 */
-	if (p->interactive_bonus == 0)
-		return;
+#endif
 
-	if (p->cpu_usage_rate > cpu_hog_threshold)
-		decay_sched_ia_bonus(p);
-}
+	recalc_task_prio(p, now);
 
-/*
- * Check whether a task qualifies for an interactive bonus and if it does
- * increase its bonus
- * This never gets called on real time tasks
- */
-static void reassess_interactiveness(task_t *p)
-{
-	if (max_ia_bonus == 0) {
-		p->interactive_bonus = 0;
-		return;
-	}
 	/*
-	 * No sleep means not interactive (in most cases), but
+	 * This checks to make sure it's not an uninterruptible task
+	 * that is now waking up.
 	 */
-	if (unlikely(p->avg_sleep_per_cycle > LOWER_MAX_IA_SLEEP)) {
+	if (!p->activated) {
 		/*
-		 * Really long sleeps mean it's probably not interactive
+		 * Tasks which were woken up by interrupts (i.e. hw events)
+		 * are most likely of an interactive nature. So we give them
+		 * the credit of extending their sleep time to include the
+		 * time they spend on the runqueue, waiting for execution
+		 * on a CPU, the first time around:
 		 */
-		if (unlikely(p->avg_sleep_per_cycle > UPPER_MAX_IA_SLEEP))
-			decay_sched_ia_bonus(p);
-		return;
-	}
-	if (p->sleepiness > ia_threshold) {
-		decay_sched_ia_bonus(p);
-		p->interactive_bonus += map_proportion_rnd(p->sleepiness, max_ia_bonus);
-	}
-}
-
-/*
- * Check whether a task qualifies for a throughput bonus and if it does
- * give it one
- * This never gets called on real time tasks
- */
-static void recalc_throughput_bonus(task_t *p)
-{
-	unsigned long long ratio;
-	unsigned long long expected_delay;
-	unsigned long long adjusted_delay;
-	unsigned long long load = p->rq->avg_nr_running;
-
-	p->throughput_bonus = 0;
-	if (max_tpt_bonus == 0)
-		return;
-
-	if (load <= SCHED_AVG_ONE)
-		expected_delay = 0;
-	else
-		expected_delay = SCHED_AVG_MUL(p->avg_cpu_per_cycle, (load - SCHED_AVG_ONE));
-
-	/*
-	 * No unexpected delay means no bonus, but
-	 * NB this test also avoids a possible divide by zero error if
-	 * cpu is also zero and negative bonuses
-	 */
-	if (p->avg_delay_per_cycle <= expected_delay)
-		return;
-
-	adjusted_delay  = p->avg_delay_per_cycle - expected_delay;
-	ratio = calc_proportion(adjusted_delay, adjusted_delay + p->avg_cpu_per_cycle);
-	ratio = proportion_sqrt(ratio);
-	p->throughput_bonus = map_proportion_rnd(ratio, max_tpt_bonus);
-}
-
-static void recalc_task_prio(task_t *p, unsigned long long now)
-{
-	/*
-	 * Throughput bonus is dependent on how busy the CPU is so do it here to
-	 * catch any CPU changes
-	 * Interactive bonus is updated in the wake up function.
-	 */
-	if (!rt_task(p)) {
-		recalc_throughput_bonus(p);
-		calculate_pre_bonus_priority(p);
+		if (in_interrupt())
+			p->activated = 2;
+		else {
+			/*
+			 * Normal first-time wakeups get a credit too for
+			 * on-runqueue time, but it will be weighted down:
+			 */
+			p->activated = 1;
+		}
 	}
-	p->prio = effective_prio(p);
-}
-
-/*
- * activate_task - move a task to the runqueue and do priority recalculation
- */
-static void activate_task(task_t *p)
-{
-	/* Compensate for drifting sched_clock */
-	unsigned long long now = adjusted_sched_clock(p);
-
-	recalc_task_prio(p, now);
 	p->timestamp = now;
-	p->time_slice = task_timeslice(p);
-	p->flags &= ~PF_UISLEEP;
 
-	__activate_task(p);
+	__activate_task(p, rq);
 }
 
 /*
  * deactivate_task - remove a task from the runqueue.
  */
-static void deactivate_task(struct task_struct *p)
+static void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
-	p->rq->nr_running--;
-	if (p->state == TASK_UNINTERRUPTIBLE) {
-		p->flags |= PF_UISLEEP;
-		p->rq->nr_uninterruptible++;
-	}
-	dequeue_task(p);
-	if (p->rq->nr_running == 1)
-		stop_promotions(p->rq);
-}
-
-/*
- * p->cpu_usage_rate must be greater than p->cpu_rate_hard_cap
- */
-static inline unsigned long required_sinbin_period(const task_t *p)
-{
-	unsigned long long acpc_jiffies, bl, tl;
-
-	if (p->cpu_rate_hard_cap == 0)
-		return ULONG_MAX;
-
-	acpc_jiffies = sched_div_64(SCHED_AVG_RND(p->avg_cpu_per_cycle) * HZ, 1000000000);
-	/*
-	 * we have to be careful about overflow and/or underflow
-	 */
-	bl = p->cpu_usage_rate * p->cpu_rate_hard_cap;
-	tl = acpc_jiffies * (p->cpu_usage_rate - p->cpu_rate_hard_cap);
-	while (tl > PROPORTION_OVERFLOW) {
-		tl >>= 1;
-		if (unlikely((bl >>= 1) == 0))
-			return ULONG_MAX;
-	}
-
-	return sched_div_64(tl << PROPORTION_OFFSET, bl);
-}
-
-static inline int task_needs_sinbinning(const struct task_struct *p)
-{
-	return (p->cpu_usage_rate > p->cpu_rate_hard_cap) &&
-		(p->state == TASK_RUNNING) && !rt_task(p) && !task_is_exiting(p);
-}
-
-static inline void put_task_in_sinbin(struct task_struct *p)
-{
-	unsigned long long durn = required_sinbin_period(p);
-
-	if (durn == 0)
-		return;
-	deactivate_task(p);
-	p->flags |= PF_SINBINNED;
-	p->sinbin_timer.expires = jiffies + durn;
-	add_timer(&p->sinbin_timer);
-}
-
-/*
- * Release a task from the sinbin
- */
-void sinbin_release_fn(unsigned long arg)
-{
-	unsigned long flags;
-	struct task_struct *p = (struct task_struct*)arg;
-	spinlock_t *rql = task_rq_lock(p, &flags);
-
-	/*
-	 * Sinbin time is included in delay time
-	 */
-	delta_delay_stats(p, adjusted_sched_clock(p));
-	p->flags &= ~PF_SINBINNED;
-	if (!rt_task(p)) {
-		calculate_pre_bonus_priority(p);
-		p->prio = effective_prio(p);
-	}
-	__activate_task(p);
-
-	task_rq_unlock(rql, &flags);
+	rq->nr_running--;
+	if (p->state == TASK_UNINTERRUPTIBLE)
+		rq->nr_uninterruptible++;
+	dequeue_task(p, p->array);
+	p->array = NULL;
 }
 
 /*
@@ -1296,7 +777,7 @@ static void resched_task(task_t *p)
 {
 	int need_resched, nrpolling;
 
-	BUG_ON(!spin_is_locked(&p->rq->lock));
+	BUG_ON(!spin_is_locked(&task_rq(p)->lock));
 
 	/* minimise the chance of sending an interrupt to poll_idle() */
 	nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
@@ -1313,19 +794,13 @@ static inline void resched_task(task_t *
 }
 #endif
 
-static inline void preempt_curr_if_warranted(struct task_struct *p)
-{
-	if (task_preempts_curr(p))
-		resched_task(p->rq->curr);
-}
-
 /**
  * task_curr - is this task currently executing on a CPU?
  * @p: the task in question.
  */
 inline int task_curr(const task_t *p)
 {
-	return task_is_running(p);
+	return cpu_curr(task_cpu(p)) == p;
 }
 
 #ifdef CONFIG_SMP
@@ -1354,18 +829,14 @@ typedef struct {
  */
 static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
 {
+	runqueue_t *rq = task_rq(p);
+
 	/*
 	 * If the task is not on a runqueue (and not running), then
 	 * it is sufficient to simply update the task's cpu field.
 	 */
-	if (!task_queued(p) && !task_is_running(p)) {
-		if (task_is_sinbinned(p))
-			delta_delay_stats(p, adjusted_sched_clock(p));
-		else
-			delta_sleep_stats(p, adjusted_sched_clock(p));
+	if (!p->array && !task_running(rq, p)) {
 		set_task_cpu(p, dest_cpu);
-		/* time stamp was set for old queue above so fix it */
-		p->sched_timestamp = adjusted_sched_clock(p);
 		return 0;
 	}
 
@@ -1373,7 +844,7 @@ static int migrate_task(task_t *p, int d
 	req->type = REQ_MOVE_TASK;
 	req->task = p;
 	req->dest_cpu = dest_cpu;
-	list_add(&req->list, &p->rq->migration_queue);
+	list_add(&req->list, &rq->migration_queue);
 	return 1;
 }
 
@@ -1389,22 +860,22 @@ static int migrate_task(task_t *p, int d
 void wait_task_inactive(task_t * p)
 {
 	unsigned long flags;
-	spinlock_t *rql;
+	runqueue_t *rq;
 	int preempted;
 
 repeat:
-	rql = task_rq_lock(p, &flags);
+	rq = task_rq_lock(p, &flags);
 	/* Must be off runqueue entirely, not preempted. */
-	if (unlikely(task_queued(p))) {
+	if (unlikely(p->array)) {
 		/* If it's preempted, we yield.  It could be a while. */
-		preempted = !task_is_running(p);
-		task_rq_unlock(rql, &flags);
+		preempted = !task_running(rq, p);
+		task_rq_unlock(rq, &flags);
 		cpu_relax();
 		if (preempted)
 			yield();
 		goto repeat;
 	}
-	task_rq_unlock(rql, &flags);
+	task_rq_unlock(rq, &flags);
 }
 
 /***
@@ -1511,34 +982,27 @@ static int try_to_wake_up(task_t * p, un
 	int cpu, this_cpu, success = 0;
 	unsigned long flags;
 	long old_state;
-	spinlock_t *rql;
-	runqueue_t *old_rq;
+	runqueue_t *rq;
 #ifdef CONFIG_SMP
 	unsigned long load, this_load;
 	struct sched_domain *sd;
 	int new_cpu;
 #endif
 
-	rql = task_rq_lock(p, &flags);
-	old_rq = p->rq;
-	schedstat_inc(p->rq, ttwu_cnt);
+	rq = task_rq_lock(p, &flags);
+	schedstat_inc(rq, ttwu_cnt);
 	old_state = p->state;
 	if (!(old_state & state))
 		goto out;
 
-	if (task_queued(p))
+	if (p->array)
 		goto out_running;
 
-	/*
-	 * This is the end of one scheduling cycle and the start of the next
-	 */
-	update_stats_for_cycle(p);
-
 	cpu = task_cpu(p);
 	this_cpu = smp_processor_id();
 
 #ifdef CONFIG_SMP
-	if (unlikely(task_is_running(p)))
+	if (unlikely(task_running(rq, p)))
 		goto out_activate;
 
 	new_cpu = cpu;
@@ -1575,7 +1039,7 @@ static int try_to_wake_up(task_t * p, un
 		imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
 
 		if ((sd->flags & SD_WAKE_AFFINE) &&
-				!task_hot(p, sd)) {
+				!task_hot(p, rq->timestamp_last_tick, sd)) {
 			/*
 			 * This domain has SD_WAKE_AFFINE and p is cache cold
 			 * in this domain.
@@ -1599,19 +1063,18 @@ static int try_to_wake_up(task_t * p, un
 
 	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
 out_set_cpu:
-	schedstat_inc(p->rq, ttwu_attempts);
+	schedstat_inc(rq, ttwu_attempts);
 	new_cpu = wake_idle(new_cpu, p);
 	if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) {
-		schedstat_inc(p->rq, ttwu_moved);
+		schedstat_inc(rq, ttwu_moved);
 		set_task_cpu(p, new_cpu);
-		task_rq_unlock(rql, &flags);
+		task_rq_unlock(rq, &flags);
 		/* might preempt at this point */
-		rql = task_rq_lock(p, &flags);
-		adjust_sched_timestamp(p, old_rq);
+		rq = task_rq_lock(p, &flags);
 		old_state = p->state;
 		if (!(old_state & state))
 			goto out;
-		if (task_queued(p))
+		if (p->array)
 			goto out_running;
 
 		this_cpu = smp_processor_id();
@@ -1620,17 +1083,16 @@ out_set_cpu:
 
 out_activate:
 #endif /* CONFIG_SMP */
-	if (old_state == TASK_UNINTERRUPTIBLE)
-		old_rq->nr_uninterruptible--;
+	if (old_state == TASK_UNINTERRUPTIBLE) {
+		rq->nr_uninterruptible--;
+		/*
+		 * Tasks on involuntary sleep don't earn
+		 * sleep_avg beyond just interactive state.
+		 */
+		p->activated = -1;
+	}
 
 	/*
-	 * Do this here rather than in activate_task() because activate() gets
-	 * called at times when thes calculations are unnecessary e.g. for a
-	 * change of CPU
-	 */
-	if (!rt_task(p))
-		reassess_interactiveness(p);
-	/*
 	 * Sync wakeups (i.e. those types of wakeups where the waker
 	 * has indicated that it will leave the CPU in short order)
 	 * don't trigger a preemption, if the woken up task will run on
@@ -1638,15 +1100,17 @@ out_activate:
 	 * the waker guarantees that the freshly woken up task is going
 	 * to be considered on this CPU.)
 	 */
-	activate_task(p);
-	if (!sync || cpu != this_cpu)
-		preempt_curr_if_warranted(p);
+	activate_task(p, rq, cpu == this_cpu);
+	if (!sync || cpu != this_cpu) {
+		if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
+	}
 	success = 1;
 
 out_running:
 	p->state = TASK_RUNNING;
 out:
-	task_rq_unlock(rql, &flags);
+	task_rq_unlock(rq, &flags);
 
 	return success;
 }
@@ -1665,21 +1129,11 @@ int fastcall wake_up_state(task_t *p, un
 }
 
 #ifdef CONFIG_SMP
-static int find_idlest_cpu(const struct task_struct *p, int this_cpu,
+static int find_idlest_cpu(struct task_struct *p, int this_cpu,
 			   struct sched_domain *sd);
 #endif
 
 /*
- * Initialize the scheduling bonuses
- */
-static inline void initialize_bonuses(task_t *p)
-{
-	p->interactive_bonus = (max_ia_bonus >= initial_ia_bonus) ?
-				initial_ia_bonus : max_ia_bonus;
-	p->throughput_bonus =  0;
-}
-
-/*
  * Perform scheduler related setup for a newly forked process p.
  * p is forked by current.
  */
@@ -1693,9 +1147,8 @@ void fastcall sched_fork(task_t *p)
 	 */
 	p->state = TASK_RUNNING;
 	INIT_LIST_HEAD(&p->run_list);
+	p->array = NULL;
 	spin_lock_init(&p->switch_lock);
-	init_timer(&p->sinbin_timer);
-	p->sinbin_timer.data = (unsigned long) p;
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
@@ -1709,15 +1162,32 @@ void fastcall sched_fork(task_t *p)
 	p->thread_info->preempt_count = 1;
 #endif
 	/*
-	 * Give the child a new timeslice
+	 * Share the timeslice between parent and child, thus the
+	 * total amount of pending timeslices in the system doesn't change,
+	 * resulting in more scheduling fairness.
 	 */
-	p->time_slice = task_timeslice(p);
-	p->timestamp = sched_clock();
+	local_irq_disable();
+	p->time_slice = (current->time_slice + 1) >> 1;
 	/*
-	 * Initialize the scheduling statistics and bonus counters
+	 * The remainder of the first timeslice might be recovered by
+	 * the parent if the child exits early enough.
 	 */
-	initialize_stats(p);
-	initialize_bonuses(p);
+	p->first_time_slice = 1;
+	current->time_slice >>= 1;
+	p->timestamp = sched_clock();
+	if (unlikely(!current->time_slice)) {
+		/*
+		 * This case is rare, it happens when the parent has only
+		 * a single jiffy left from its timeslice. Taking the
+		 * runqueue lock is not a problem.
+		 */
+		current->time_slice = 1;
+		preempt_disable();
+		scheduler_tick(0, 0);
+		local_irq_enable();
+		preempt_enable();
+	} else
+		local_irq_enable();
 }
 
 /*
@@ -1731,15 +1201,27 @@ void fastcall wake_up_new_task(task_t * 
 {
 	unsigned long flags;
 	int this_cpu, cpu;
-	spinlock_t *rql;
+	runqueue_t *rq, *this_rq;
 
-	rql = task_rq_lock(p, &flags);
+	rq = task_rq_lock(p, &flags);
 	cpu = task_cpu(p);
 	this_cpu = smp_processor_id();
 
 	BUG_ON(p->state != TASK_RUNNING);
 
-	schedstat_inc(p->rq, wunt_cnt);
+	schedstat_inc(rq, wunt_cnt);
+	/*
+	 * We decrease the sleep average of forking parents
+	 * and children as well, to keep max-interactive tasks
+	 * from forking tasks that are max-interactive. The parent
+	 * (current) is done further down, under its lock.
+	 */
+	p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
+		CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
+
+	p->interactive_credit = 0;
+
+	p->prio = effective_prio(p);
 
 	if (likely(cpu == this_cpu)) {
 		if (!(clone_flags & CLONE_VM)) {
@@ -1747,61 +1229,82 @@ void fastcall wake_up_new_task(task_t * 
 			 * The VM isn't cloned, so we're in a good position to
 			 * do child-runs-first in anticipation of an exec. This
 			 * usually avoids a lot of COW overhead.
-			 * Now that the idle task is back on the run queue
-			 * we need extra care to make sure that its one and
-			 * only fork() doesn't end up in the idle priority slot.
-			 * Just testing for empty run list is no longer adequate.
 			 */
-			if (unlikely(!task_queued(current) || RUNQUEUE_IDLE(current->rq))) {
-				p->prio = effective_prio(p);
-				__activate_task(p);
-			} else {
-				/*
-				 * Put the child on the same list(s) as (but
-				 *  ahead of) the parent
-				 */
+			if (unlikely(!current->array))
+				__activate_task(p, rq);
+			else {
 				p->prio = current->prio;
 				list_add_tail(&p->run_list, &current->run_list);
-				current->rq->nr_running++;
+				p->array = current->array;
+				p->array->nr_active++;
+				rq->nr_running++;
 			}
 			set_need_resched();
-		} else {
+		} else
 			/* Run child last */
-			p->prio = effective_prio(p);
-			__activate_task(p);
-		}
+			__activate_task(p, rq);
+		/*
+		 * We skip the following code due to cpu == this_cpu
+	 	 *
+		 *   task_rq_unlock(rq, &flags);
+		 *   this_rq = task_rq_lock(current, &flags);
+		 */
+		this_rq = rq;
 	} else {
+		this_rq = cpu_rq(this_cpu);
+
 		/*
 		 * Not the local CPU - must adjust timestamp. This should
 		 * get optimised away in the !CONFIG_SMP case.
 		 */
-		adjust_timestamp(p, this_rq());
-		adjust_sched_timestamp(p, this_rq());
-		p->prio = effective_prio(p);
-		__activate_task(p);
-		preempt_curr_if_warranted(p);
-		schedstat_inc(p->rq, wunt_moved);
+		p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
+					+ rq->timestamp_last_tick;
+		__activate_task(p, rq);
+		if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
+
+		schedstat_inc(rq, wunt_moved);
+		/*
+		 * Parent and child are on different CPUs, now get the
+		 * parent runqueue to update the parent's ->sleep_avg:
+		 */
+		task_rq_unlock(rq, &flags);
+		this_rq = task_rq_lock(current, &flags);
 	}
-	task_rq_unlock(rql, &flags);
+	current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
+		PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
+	task_rq_unlock(this_rq, &flags);
 }
 
-/**
- * (Optionally) log scheduler statistics at exit.
+/*
+ * Potentially available exiting-child timeslices are
+ * retrieved here - this way the parent does not get
+ * penalized for creating too many threads.
+ *
+ * (this cannot be used to 'generate' timeslices
+ * artificially, because any timeslice recovered here
+ * was given away by the parent in the first place.)
  */
-static int log_at_exit = 0;
 void fastcall sched_exit(task_t * p)
 {
-	struct task_sched_stats stats;
-
-	if (!log_at_exit)
-		return;
+	unsigned long flags;
+	runqueue_t *rq;
 
-	get_task_sched_stats(p, &stats);
-	printk("SCHED_EXIT[%d] (%s) %llu %llu %llu %llu %llu %llu %lu %lu\n",
-		p->pid, p->comm,
-		stats.total_sleep, stats.total_cpu, stats.total_delay,
-		stats.total_sinbin, stats.cycle_count, stats.intr_wake_ups,
-		p->nvcsw, p->nivcsw);
+	/*
+	 * If the child was a (relative-) CPU hog then decrease
+	 * the sleep_avg of the parent as well.
+	 */
+	rq = task_rq_lock(p->parent, &flags);
+	if (p->first_time_slice) {
+		p->parent->time_slice += p->time_slice;
+		if (unlikely(p->parent->time_slice > task_timeslice(p)))
+			p->parent->time_slice = task_timeslice(p);
+	}
+	if (p->sleep_avg < p->parent->sleep_avg)
+		p->parent->sleep_avg = p->parent->sleep_avg /
+		(EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
+		(EXIT_WEIGHT + 1);
+	task_rq_unlock(rq, &flags);
 }
 
 /**
@@ -1986,7 +1489,7 @@ static void double_lock_balance(runqueue
 /*
  * find_idlest_cpu - find the least busy runqueue.
  */
-static int find_idlest_cpu(const struct task_struct *p, int this_cpu,
+static int find_idlest_cpu(struct task_struct *p, int this_cpu,
 			   struct sched_domain *sd)
 {
 	unsigned long load, min_load, this_load;
@@ -2037,28 +1540,28 @@ static int find_idlest_cpu(const struct 
 static void sched_migrate_task(task_t *p, int dest_cpu)
 {
 	migration_req_t req;
-	spinlock_t *rql;
+	runqueue_t *rq;
 	unsigned long flags;
 
-	rql = task_rq_lock(p, &flags);
+	rq = task_rq_lock(p, &flags);
 	if (!cpu_isset(dest_cpu, p->cpus_allowed)
 	    || unlikely(cpu_is_offline(dest_cpu)))
 		goto out;
 
-	schedstat_inc(p->rq, smt_cnt);
+	schedstat_inc(rq, smt_cnt);
 	/* force the process onto the specified CPU */
 	if (migrate_task(p, dest_cpu, &req)) {
 		/* Need to wait for migration thread (might exit: take ref). */
-		struct task_struct *mt = p->rq->migration_thread;
+		struct task_struct *mt = rq->migration_thread;
 		get_task_struct(mt);
-		task_rq_unlock(rql, &flags);
+		task_rq_unlock(rq, &flags);
 		wake_up_process(mt);
 		put_task_struct(mt);
 		wait_for_completion(&req.done);
 		return;
 	}
 out:
-	task_rq_unlock(rql, &flags);
+	task_rq_unlock(rq, &flags);
 }
 
 /*
@@ -2101,26 +1604,29 @@ out:
  * Both runqueues must be locked.
  */
 static inline
-void pull_task(task_t *p,  int this_cpu)
+void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
+	       runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
 {
-	runqueue_t *src_rq = p->rq;
-
-	dequeue_task(p);
+	dequeue_task(p, src_array);
 	src_rq->nr_running--;
-	delta_delay_stats(p, adjusted_sched_clock(p));
 	set_task_cpu(p, this_cpu);
-	p->rq->nr_running++;
-	enqueue_task(p);
-	adjust_timestamp(p, src_rq);
-	adjust_sched_timestamp(p, src_rq);
-	preempt_curr_if_warranted(p);
+	this_rq->nr_running++;
+	enqueue_task(p, this_array);
+	p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
+				+ this_rq->timestamp_last_tick;
+	/*
+	 * Note that idle threads have a prio of MAX_PRIO, for this test
+	 * to be always true for them.
+	 */
+	if (TASK_PREEMPTS_CURR(p, this_rq))
+		resched_task(this_rq->curr);
 }
 
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
 static inline
-int can_migrate_task(const task_t *p, int this_cpu,
+int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
 		     struct sched_domain *sd, enum idle_type idle)
 {
 	/*
@@ -2129,7 +1635,7 @@ int can_migrate_task(const task_t *p, in
 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
 	 * 3) are cache-hot on their current CPU.
 	 */
-	if (task_is_running(p))
+	if (task_running(rq, p))
 		return 0;
 	if (!cpu_isset(this_cpu, p->cpus_allowed))
 		return 0;
@@ -2137,7 +1643,7 @@ int can_migrate_task(const task_t *p, in
 	/* Aggressive migration if we've failed balancing */
 	if (idle == NEWLY_IDLE ||
 			sd->nr_balance_failed < sd->cache_nice_tries) {
-		if (task_hot(p, sd))
+		if (task_hot(p, rq->timestamp_last_tick, sd))
 			return 0;
 	}
 
@@ -2155,6 +1661,7 @@ static int move_tasks(runqueue_t *this_r
 		      unsigned long max_nr_move, struct sched_domain *sd,
 		      enum idle_type idle)
 {
+	prio_array_t *array, *dst_array;
 	struct list_head *head, *curr;
 	int idx, pulled = 0;
 	task_t *tmp;
@@ -2162,24 +1669,45 @@ static int move_tasks(runqueue_t *this_r
 	if (max_nr_move <= 0 || busiest->nr_running <= 1)
 		goto out;
 
+	/*
+	 * We first consider expired tasks. Those will likely not be
+	 * executed in the near future, and they are most likely to
+	 * be cache-cold, thus switching CPUs has the least effect
+	 * on them.
+	 */
+	if (busiest->expired->nr_active) {
+		array = busiest->expired;
+		dst_array = this_rq->expired;
+	} else {
+		array = busiest->active;
+		dst_array = this_rq->active;
+	}
+
+new_array:
 	/* Start searching at priority 0: */
 	idx = 0;
 skip_bitmap:
 	if (!idx)
-		idx = sched_find_first_bit(busiest->bitmap);
+		idx = sched_find_first_bit(array->bitmap);
 	else
-		idx = find_next_bit(busiest->bitmap, IDLE_PRIO, idx);
-	if (idx >= IDLE_PRIO)
+		idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
+	if (idx >= MAX_PRIO) {
+		if (array == busiest->expired && busiest->active->nr_active) {
+			array = busiest->active;
+			dst_array = this_rq->active;
+			goto new_array;
+		}
 		goto out;
+	}
 
-	head = &busiest->queues[idx].queue;
+	head = array->queue + idx;
 	curr = head->prev;
 skip_queue:
 	tmp = list_entry(curr, task_t, run_list);
 
 	curr = curr->prev;
 
-	if (!can_migrate_task(tmp, this_cpu, sd, idle)) {
+	if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
 		if (curr != head)
 			goto skip_queue;
 		idx++;
@@ -2194,7 +1722,7 @@ skip_queue:
 	schedstat_inc(this_rq, pt_gained[idle]);
 	schedstat_inc(busiest, pt_lost[idle]);
 
-	pull_task(tmp, this_cpu);
+	pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
 	pulled++;
 
 	/* We only want to steal up to the prescribed number of tasks. */
@@ -2349,7 +1877,7 @@ out_balanced:
 /*
  * find_busiest_queue - find the busiest runqueue among the cpus in group.
  */
-static runqueue_t *find_busiest_queue(const struct sched_group *group)
+static runqueue_t *find_busiest_queue(struct sched_group *group)
 {
 	unsigned long load, max_load = 0;
 	runqueue_t *busiest = NULL;
@@ -2646,11 +2174,6 @@ static void rebalance_tick(int this_cpu,
 		}
 	}
 }
-
-static inline int needs_idle_balance(const runqueue_t *rq)
-{
-	return rq->nr_running == 0;
-}
 #else
 /*
  * on UP we do not need to balance between CPUs:
@@ -2661,10 +2184,6 @@ static inline void rebalance_tick(int cp
 static inline void idle_balance(int cpu, runqueue_t *rq)
 {
 }
-static inline int needs_idle_balance(const runqueue_t *rq)
-{
-	return 0;
-}
 #endif
 
 static inline int wake_priority_sleeper(runqueue_t *rq)
@@ -2685,54 +2204,27 @@ static inline int wake_priority_sleeper(
 	return ret;
 }
 
-/*
- * Are promotions due?
- */
-static inline int promotions_due(const runqueue_t *rq)
-{
-	return unlikely(time_after_eq(jiffies, rq->next_prom_due)) && (rq->nr_running > 1);
-}
-
-/*
- * Assume runqueue lock is NOT already held.
- * This is not executed when current task is SCHED_FIFO
- */
-static void do_promotions(runqueue_t *rq)
-{
-	int idx = MIN_NORMAL_PRIO;
-
-	spin_lock(&rq->lock);
-	rq->pcount++;
-	if (rq->nr_running < rq->pcount) {
-		rq->next_prom_due = jiffies + base_prom_interval_ticks;
-		goto out_unlock;
-	}
-	for (;;) {
-		int new_prio;
-		idx = find_next_bit(rq->bitmap, IDLE_PRIO, idx + 1);
-		/* don't promote background tasks */
-		if (idx > (BGND_PRIO - 1))
-			break;
-
-		new_prio = idx - 1;
-		__list_splice(&rq->queues[idx].queue, rq->queues[new_prio].queue.prev);
-		INIT_LIST_HEAD(&rq->queues[idx].queue);
-		__clear_bit(idx, rq->bitmap);
-		__set_bit(new_prio, rq->bitmap);
-	}
-	/* The only prio field that might need updating is the current task's */
-	if (likely((rq->curr->prio > MIN_NORMAL_PRIO) && (rq->curr->prio < BGND_PRIO)))
-		rq->curr->prio--;
-	restart_promotions(rq);
-out_unlock:
-	spin_unlock(&rq->lock);
-}
-
 DEFINE_PER_CPU(struct kernel_stat, kstat);
 
 EXPORT_PER_CPU_SYMBOL(kstat);
 
 /*
+ * We place interactive tasks back into the active array, if possible.
+ *
+ * To guarantee that this does not starve expired tasks we ignore the
+ * interactivity of a task if the first expired task had to wait more
+ * than a 'reasonable' amount of time. This deadline timeout is
+ * load-dependent, as the frequency of array switches decreases with
+ * increasing number of running tasks. We also ignore the interactivity
+ * if a better static_prio task has expired:
+ */
+#define EXPIRED_STARVING(rq) \
+	((STARVATION_LIMIT && ((rq)->expired_timestamp && \
+		(jiffies - (rq)->expired_timestamp >= \
+			STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
+			((rq)->curr->static_prio > (rq)->best_expired_prio))
+
+/*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
  *
@@ -2743,11 +2235,10 @@ void scheduler_tick(int user_ticks, int 
 {
 	int cpu = smp_processor_id();
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	runqueue_t *rq = this_rq();
 	task_t *p = current;
-	unsigned long decayed_avg_nr_running;
-	unsigned long long now;
 
-	now = p->rq->timestamp_last_tick = sched_clock();
+	rq->timestamp_last_tick = sched_clock();
 
 	if (rcu_pending(cpu))
 		rcu_check_callbacks(cpu, user_ticks);
@@ -2761,24 +2252,14 @@ void scheduler_tick(int user_ticks, int 
 		sys_ticks = 0;
 	}
 
-	/* this has to be done regardless of task type but hold lock for the
-	 * minimum possible time
-	 */
-	decayed_avg_nr_running = SCHED_AVG_MUL(p->rq->avg_nr_running, SCHED_AVG_ALPHA);
-	spin_lock(&p->rq->lock);
-	p->rq->avg_nr_running = decayed_avg_nr_running + p->rq->nr_running;
-	if ((sched_mode == SCHED_MODE_ENTITLEMENT_BASED) && (!--p->rq->eb_ticks_to_decay))
-		decay_eb_yardstick(p->rq);
-	spin_unlock(&p->rq->lock);
-
-	if (is_idle_task(p)) {
-		if (atomic_read(&p->rq->nr_iowait) > 0)
+	if (p == rq->idle) {
+		if (atomic_read(&rq->nr_iowait) > 0)
 			cpustat->iowait += sys_ticks;
 		else
 			cpustat->idle += sys_ticks;
-		if (wake_priority_sleeper(p->rq))
+		if (wake_priority_sleeper(rq))
 			goto out;
-		rebalance_tick(cpu, p->rq, SCHED_IDLE);
+		rebalance_tick(cpu, rq, SCHED_IDLE);
 		return;
 	}
 	if (TASK_NICE(p) > 0)
@@ -2787,38 +2268,82 @@ void scheduler_tick(int user_ticks, int 
 		cpustat->user += user_ticks;
 	cpustat->system += sys_ticks;
 
+	/* Task might have expired already, but not scheduled off yet */
+	if (p->array != rq->active) {
+		set_tsk_need_resched(p);
+		goto out;
+	}
+	spin_lock(&rq->lock);
 	/*
-	 * SCHED_FIFO tasks never run out of timeslice.
+	 * The task was running during this tick - update the
+	 * time slice counter. Note: we do not update a thread's
+	 * priority until it either goes to sleep or uses up its
+	 * timeslice. This makes it possible for interactive tasks
+	 * to use up their timeslices at their highest priority levels.
 	 */
-	if (unlikely(p->policy == SCHED_FIFO))
-		goto out;
-
-	spin_lock(&p->rq->lock);
+	if (rt_task(p)) {
+		/*
+		 * RR tasks need a special form of timeslice management.
+		 * FIFO tasks have no timeslices.
+		 */
+		if ((p->policy == SCHED_RR) && !--p->time_slice) {
+			p->time_slice = task_timeslice(p);
+			p->first_time_slice = 0;
+			set_tsk_need_resched(p);
+
+			/* put it at the end of the queue: */
+			dequeue_task(p, rq->active);
+			enqueue_task(p, rq->active);
+		}
+		goto out_unlock;
+	}
 	if (!--p->time_slice) {
-		dequeue_task(p);
+		dequeue_task(p, rq->active);
 		set_tsk_need_resched(p);
-		if (likely(p->policy != SCHED_RR)) {
-			delta_cpu_stats(p, now);
-			decay_avgs_and_calculate_rates(p);
-			recalc_throughput_bonus(p);
-			reassess_cpu_boundness(p);
-			/*
-			 * Arguably the interactive bonus should be updated here
-			 * as well.  But depends on whether we wish to encourage
-			 * interactive tasks to maintain a high bonus or CPU bound
-			 * tasks to lose some of there bonus?
-			 */
-			calculate_pre_bonus_priority(p);
+		p->prio = effective_prio(p);
+		p->time_slice = task_timeslice(p);
+		p->first_time_slice = 0;
+
+		if (!rq->expired_timestamp)
+			rq->expired_timestamp = jiffies;
+		if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
+			enqueue_task(p, rq->expired);
+			if (p->static_prio < rq->best_expired_prio)
+				rq->best_expired_prio = p->static_prio;
+		} else
+			enqueue_task(p, rq->active);
+	} else {
+		/*
+		 * Prevent a too long timeslice allowing a task to monopolize
+		 * the CPU. We do this by splitting up the timeslice into
+		 * smaller pieces.
+		 *
+		 * Note: this does not mean the task's timeslices expire or
+		 * get lost in any way, they just might be preempted by
+		 * another task of equal priority. (one with higher
+		 * priority would have preempted this task already.) We
+		 * requeue this task to the end of the list on this priority
+		 * level, which is in essence a round-robin of tasks with
+		 * equal priority.
+		 *
+		 * This only applies to tasks in the interactive
+		 * delta range with at least TIMESLICE_GRANULARITY to requeue.
+		 */
+		if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
+			p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
+			(p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
+			(p->array == rq->active)) {
+
+			dequeue_task(p, rq->active);
+			set_tsk_need_resched(p);
 			p->prio = effective_prio(p);
+			enqueue_task(p, rq->active);
 		}
-		p->time_slice = task_timeslice(p);
-		enqueue_task(p);
 	}
-	spin_unlock(&p->rq->lock);
+out_unlock:
+	spin_unlock(&rq->lock);
 out:
-	rebalance_tick(cpu, p->rq, NOT_IDLE);
-	if (unlikely(promotions_due(p->rq)))
-		do_promotions(p->rq);
+	rebalance_tick(cpu, rq, NOT_IDLE);
 }
 
 #ifdef CONFIG_SCHED_SMT
@@ -2871,7 +2396,8 @@ static inline int dependent_sleeper(int 
 {
 	struct sched_domain *sd = this_rq->sd;
 	cpumask_t sibling_map;
-	int ret = 0, i, idx;
+	prio_array_t *array;
+	int ret = 0, i;
 	task_t *p;
 
 	if (!(sd->flags & SD_SHARE_CPUPOWER))
@@ -2893,11 +2419,13 @@ static inline int dependent_sleeper(int 
 	 */
 	if (!this_rq->nr_running)
 		goto out_unlock;
+	array = this_rq->active;
+	if (!array->nr_active)
+		array = this_rq->expired;
+	BUG_ON(!array->nr_active);
 
-	idx = sched_find_first_bit(this_rq->bitmap);
-	p = list_entry(this_rq->queues[idx].queue.next, task_t, run_list);
-	/* update prio in case p has been promoted since it was queued */
-	p->prio = idx;
+	p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
+		task_t, run_list);
 
 	for_each_cpu_mask(i, sibling_map) {
 		runqueue_t *smt_rq = cpu_rq(i);
@@ -2932,16 +2460,6 @@ out_unlock:
 		spin_unlock(&cpu_rq(i)->lock);
 	return ret;
 }
-
-static inline int recheck_needs_idle_balance(const runqueue_t *rq)
-{
-	return rq->nr_running == 0;
-}
-
-static inline int dependent_idle(const runqueue_t *rq)
-{
-	return rq->nr_running == 0;
-}
 #else
 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 {
@@ -2951,16 +2469,6 @@ static inline int dependent_sleeper(int 
 {
 	return 0;
 }
-
-static inline int recheck_needs_idle_balance(const runqueue_t *rq)
-{
-	return 0;
-}
-
-static inline int dependent_idle(const runqueue_t *rq)
-{
-	return 0;
-}
 #endif
 
 /*
@@ -2971,7 +2479,10 @@ asmlinkage void __sched schedule(void)
 	long *switch_count;
 	task_t *prev, *next;
 	runqueue_t *rq;
+	prio_array_t *array;
+	struct list_head *queue;
 	unsigned long long now;
+	unsigned long run_time;
 	int cpu, idx;
 
 	/*
@@ -2991,7 +2502,7 @@ asmlinkage void __sched schedule(void)
 need_resched:
 	preempt_disable();
 	prev = current;
-	rq = prev->rq;
+	rq = this_rq();
 
 	/*
 	 * The idle thread is not allowed to schedule!
@@ -3005,6 +2516,18 @@ need_resched:
 	release_kernel_lock(prev);
 	schedstat_inc(rq, sched_cnt);
 	now = sched_clock();
+	if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
+		run_time = now - prev->timestamp;
+	else
+		run_time = NS_MAX_SLEEP_AVG;
+
+	/*
+	 * Tasks with interactive credits get charged less run_time
+	 * at high sleep_avg to delay them losing their interactive
+	 * status
+	 */
+	if (HIGH_CREDIT(prev))
+		run_time /= (CURRENT_BONUS(prev) ? : 1);
 
 	spin_lock_irq(&rq->lock);
 
@@ -3019,25 +2542,26 @@ need_resched:
 				unlikely(signal_pending(prev))))
 			prev->state = TASK_RUNNING;
 		else
-			deactivate_task(prev);
+			deactivate_task(prev, rq);
 	}
 
-	if (unlikely(task_needs_sinbinning(prev)))
-		put_task_in_sinbin(prev);
-
 	cpu = smp_processor_id();
-	if (unlikely(needs_idle_balance(rq))) {
+	if (unlikely(!rq->nr_running)) {
 go_idle:
 		idle_balance(cpu, rq);
-		/* This code should get optimised away when CONFIG_SCHED_SMT
-		 * is not defined
-		 */
-		if (dependent_idle(rq))
+		if (!rq->nr_running) {
+			next = rq->idle;
+			rq->expired_timestamp = 0;
 			wake_sleeping_dependent(cpu, rq);
+			/*
+			 * wake_sleeping_dependent() might have released
+			 * the runqueue, so break out if we got new
+			 * tasks meanwhile:
+			 */
+			if (!rq->nr_running)
+				goto switch_tasks;
+		}
 	} else {
-		/* This code should all get optimised away when CONFIG_SCHED_SMT
-		 * is not defined
-		 */
 		if (dependent_sleeper(cpu, rq)) {
 			schedstat_inc(rq, sched_goidle);
 			next = rq->idle;
@@ -3048,29 +2572,55 @@ go_idle:
 		 * lock, hence go into the idle loop if the rq went
 		 * empty meanwhile:
 		 */
-		if (unlikely(recheck_needs_idle_balance(rq)))
+		if (unlikely(!rq->nr_running))
 			goto go_idle;
 	}
 
-	schedstat_inc(rq, sched_noswitch);
-	idx = sched_find_first_bit(rq->bitmap);
-	next = list_entry(rq->queues[idx].queue.next, task_t, run_list);
-	/*
-	 * update prio just in case next has been promoted since it was queued
-	 */
-	next->prio = idx;
+	array = rq->active;
+	if (unlikely(!array->nr_active)) {
+		/*
+		 * Switch the active and expired arrays.
+		 */
+		schedstat_inc(rq, sched_switch);
+		rq->active = rq->expired;
+		rq->expired = array;
+		array = rq->active;
+		rq->expired_timestamp = 0;
+		rq->best_expired_prio = MAX_PRIO;
+	} else
+		schedstat_inc(rq, sched_noswitch);
 
+	idx = sched_find_first_bit(array->bitmap);
+	queue = array->queue + idx;
+	next = list_entry(queue->next, task_t, run_list);
+
+	if (!rt_task(next) && next->activated > 0) {
+		unsigned long long delta = now - next->timestamp;
+
+		if (next->activated == 1)
+			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
+
+		array = next->array;
+		dequeue_task(next, array);
+		recalc_task_prio(next, next->timestamp + delta);
+		enqueue_task(next, array);
+	}
+	next->activated = 0;
 switch_tasks:
 	prefetch(next);
 	clear_tsk_need_resched(prev);
 	rcu_qsctr_inc(task_cpu(prev));
 
-	delta_cpu_stats(prev, now);
+	prev->sleep_avg -= run_time;
+	if ((long)prev->sleep_avg <= 0) {
+		prev->sleep_avg = 0;
+		if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))
+			prev->interactive_credit--;
+	}
 	prev->timestamp = now;
 
 	sched_info_switch(prev, next);
 	if (likely(prev != next)) {
-		delta_delay_stats(next, now);
 		next->timestamp = now;
 		rq->nr_switches++;
 		rq->curr = next;
@@ -3333,7 +2883,9 @@ EXPORT_SYMBOL(sleep_on_timeout);
 void set_user_nice(task_t *p, long nice)
 {
 	unsigned long flags;
-	spinlock_t *rql;
+	prio_array_t *array;
+	runqueue_t *rq;
+	int old_prio, new_prio, delta;
 
 	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
 		return;
@@ -3341,32 +2893,38 @@ void set_user_nice(task_t *p, long nice)
 	 * We have to be careful, if called from sys_setpriority(),
 	 * the task might be in the middle of scheduling on another CPU.
 	 */
-	rql = task_rq_lock(p, &flags);
-
-	p->static_prio = NICE_TO_PRIO(nice);
-	p->eb_shares = nice_to_shares(nice);
+	rq = task_rq_lock(p, &flags);
 	/*
 	 * The RT priorities are set via setscheduler(), but we still
 	 * allow the 'normal' nice value to be set - but as expected
 	 * it wont have any effect on scheduling until the task is
 	 * not SCHED_NORMAL:
 	 */
-	if (!rt_task(p) && task_queued(p)) {
-		int delta = -p->prio;
+	if (rt_task(p)) {
+		p->static_prio = NICE_TO_PRIO(nice);
+		goto out_unlock;
+	}
+	array = p->array;
+	if (array)
+		dequeue_task(p, array);
+
+	old_prio = p->prio;
+	new_prio = NICE_TO_PRIO(nice);
+	delta = new_prio - old_prio;
+	p->static_prio = NICE_TO_PRIO(nice);
+	p->prio += delta;
 
-		dequeue_task(p);
-		calculate_pre_bonus_priority(p);
-		delta += p->prio = effective_prio(p);
-		enqueue_task(p);
+	if (array) {
+		enqueue_task(p, array);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
 		 */
-		if (delta < 0 || (delta > 0 && task_is_running(p)))
-			resched_task(p->rq->curr);
+		if (delta < 0 || (delta > 0 && task_running(rq, p)))
+			resched_task(rq->curr);
 	}
-
-	task_rq_unlock(rql, &flags);
+out_unlock:
+	task_rq_unlock(rq, &flags);
 }
 
 EXPORT_SYMBOL(set_user_nice);
@@ -3422,158 +2980,6 @@ asmlinkage long sys_nice(int increment)
 
 #endif
 
-/*
- * Require: 0 <= new_cap <= PROPORTION_ONE
- */
-int set_cpu_rate_cap(struct task_struct *p, unsigned long long new_cap)
-{
-	int is_allowed;
-	unsigned long flags;
-	spinlock_t *rql;
-	long long delta;
-
-	if (new_cap > PROPORTION_ONE)
-		return -EINVAL;
-	is_allowed = capable(CAP_SYS_NICE);
-	/*
-	 * We have to be careful, if called from /proc code,
-	 * the task might be in the middle of scheduling on another CPU.
-	 */
-	rql = task_rq_lock(p, &flags);
-	delta = new_cap - p->cpu_rate_cap;
-	if (!is_allowed) {
-		/*
-		 * Ordinary users can set/change caps on their own tasks provided
-		 * that the new setting is MORE constraining
-		 */
-		if (((current->euid != p->uid) && (current->uid != p->uid)) || (delta > 0)) {
-			task_rq_unlock(rql, &flags);
-			return -EPERM;
-		}
-	}
-	/*
-	 * The RT tasks don't have caps, but we still allow the caps to be
-	 * set - but as expected it wont have any effect on scheduling until the
-	 * task becomes SCHED_NORMAL:
-	 */
-	p->cpu_rate_cap = new_cap;
-	if (!rt_task(p) && task_queued(p)) {
-		int delta = -p->prio;
-
-		dequeue_task(p);
-		calculate_pre_bonus_priority(p);
-		delta += p->prio = effective_prio(p);
-		enqueue_task(p);
-		/*
-		 * If the task increased its priority or is running and
-		 * lowered its priority, then reschedule its CPU:
-		 */
-		if (delta < 0 || (delta > 0 && task_is_running(p)))
-			resched_task(p->rq->curr);
-	}
-	task_rq_unlock(rql, &flags);
-	return 0;
-}
-
-EXPORT_SYMBOL(set_cpu_rate_cap);
-
-/*
- * Require: 1 <= new_cap <= PROPORTION_ONE
- */
-int set_cpu_rate_hard_cap(struct task_struct *p, unsigned long long new_cap)
-{
-	int is_allowed;
-	unsigned long flags;
-	spinlock_t *rql;
-	long long delta;
-
-	if ((new_cap > PROPORTION_ONE) || (new_cap == 0)) /* zero hard caps are not allowed */
-		return -EINVAL;
-	is_allowed = capable(CAP_SYS_NICE);
-	/*
-	 * We have to be careful, if called from /proc code,
-	 * the task might be in the middle of scheduling on another CPU.
-	 */
-	rql = task_rq_lock(p, &flags);
-	delta = new_cap - p->cpu_rate_hard_cap;
-	if (!is_allowed) {
-		/*
-		 * Ordinary users can set/change caps on their own tasks provided
-		 * that the new setting is MORE constraining
-		 */
-		if (((current->euid != p->uid) && (current->uid != p->uid)) || (delta > 0)) {
-			task_rq_unlock(rql, &flags);
-			return -EPERM;
-		}
-	}
-	/*
-	 * The RT tasks don't have caps, but we still allow the caps to be
-	 * set - but as expected it wont have any effect on scheduling until the
-	 * task becomes SCHED_NORMAL:
-	 */
-	p->cpu_rate_hard_cap = new_cap;
-	/* (POSSIBLY) TODO: if it's sinbinned and the cap is relaxed then release
-	 *  it from the sinbin
-	 */
-	task_rq_unlock(rql, &flags);
-	return 0;
-}
-
-EXPORT_SYMBOL(set_cpu_rate_hard_cap);
-
-int set_cpu_shares(task_t *p, unsigned int new_shares)
-{
-	int is_allowed;
-	int result = 0;
-	unsigned long flags;
-	spinlock_t *rql;
-
-	if (p->eb_shares == new_shares)
-		return 0;
-
-	if ((new_shares < 1) || (new_shares > MAX_EB_SHARES))
-		return -EINVAL;
-
-	is_allowed = capable(CAP_SYS_NICE);
-	/*
-	 * We have to be careful, if called from sys_setpriority(),
-	 * the task might be in the middle of scheduling on another CPU.
-	 */
-	rql = task_rq_lock(p, &flags);
-	if (!is_allowed && (new_shares > p->eb_shares)) {
-		result = -EPERM;
-		goto out_unlock;
-	}
-	p->static_prio = NICE_TO_PRIO(shares_to_nice(new_shares));
-	p->eb_shares = new_shares;
-	/*
-	 * The RT priorities are set via setscheduler(), but we still
-	 * allow eb_shares value to be set - but as expected
-	 * it wont have any effect on scheduling until the task is
-	 * not SCHED_NORMAL:
-	 */
-	if (!rt_task(p) && task_queued(p)) {
-		int delta = -p->prio;
-
-		dequeue_task(p);
-		calculate_pre_bonus_priority(p);
-		delta += p->prio = effective_prio(p);
-		enqueue_task(p);
-		/*
-		 * If the task decreased its prio or is running and
-		 * increased its prio, then reschedule its CPU:
-		 */
-		if (delta < 0 || (delta > 0 && task_is_running(p)))
-			resched_task(p->rq->curr);
-	}
-out_unlock:
-	task_rq_unlock(rql, &flags);
-
-	return result;
-}
-
-EXPORT_SYMBOL(set_cpu_shares);
-
 /**
  * task_prio - return the priority value of a given task.
  * @p: the task in question.
@@ -3621,7 +3027,7 @@ static inline task_t *find_process_by_pi
 /* Actually do priority change: must hold rq lock. */
 static void __setscheduler(struct task_struct *p, int policy, int prio)
 {
-	BUG_ON(task_queued(p));
+	BUG_ON(p->array);
 	p->policy = policy;
 	p->rt_priority = prio;
 	if (policy != SCHED_NORMAL)
@@ -3638,9 +3044,9 @@ static int setscheduler(pid_t pid, int p
 	struct sched_param lp;
 	int retval = -EINVAL;
 	int oldprio;
-	int queued;
+	prio_array_t *array;
 	unsigned long flags;
-	spinlock_t *rql;
+	runqueue_t *rq;
 	task_t *p;
 
 	if (!param || pid < 0)
@@ -3665,7 +3071,7 @@ static int setscheduler(pid_t pid, int p
 	 * To be able to change p->policy safely, the apropriate
 	 * runqueue lock must be held.
 	 */
-	rql = task_rq_lock(p, &flags);
+	rq = task_rq_lock(p, &flags);
 
 	if (policy < 0)
 		policy = p->policy;
@@ -3689,42 +3095,38 @@ static int setscheduler(pid_t pid, int p
 
 	retval = -EPERM;
 	if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
-	    !capable(CAP_SYS_NICE)) {
-		if (current->euid == p->uid)
-			p->flags |= PF_UNPRIV_RT;
+	    !capable(CAP_SYS_NICE))
 		goto out_unlock;
-	}
 	if ((current->euid != p->euid) && (current->euid != p->uid) &&
 	    !capable(CAP_SYS_NICE))
 		goto out_unlock;
-	if (policy == SCHED_NORMAL)
-		p->flags &= ~PF_UNPRIV_RT;
 
 	retval = security_task_setscheduler(p, policy, &lp);
 	if (retval)
 		goto out_unlock;
 
-	if ((queued = task_queued(p)))
-		deactivate_task(p);
+	array = p->array;
+	if (array)
+		deactivate_task(p, task_rq(p));
 	retval = 0;
 	oldprio = p->prio;
 	__setscheduler(p, policy, lp.sched_priority);
-	if (queued) {
-		__activate_task(p);
+	if (array) {
+		__activate_task(p, task_rq(p));
 		/*
 		 * Reschedule if we are currently running on this runqueue and
 		 * our priority decreased, or if we are not currently running on
 		 * this runqueue and our priority is higher than the current's
 		 */
-		if (task_is_running(p)) {
+		if (task_running(rq, p)) {
 			if (p->prio > oldprio)
-				resched_task(p);
-		} else
-			preempt_curr_if_warranted(p);
+				resched_task(rq->curr);
+		} else if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
 	}
 
 out_unlock:
-	task_rq_unlock(rql, &flags);
+	task_rq_unlock(rq, &flags);
 out_unlock_tasklist:
 	read_unlock_irq(&tasklist_lock);
 
@@ -3954,130 +3356,45 @@ asmlinkage long sys_sched_getaffinity(pi
 	return sizeof(cpumask_t);
 }
 
-void get_task_sched_stats(const struct task_struct *tsk, struct task_sched_stats *stats)
-{
-	int on_runq = 0;
-	int on_cpu = 0;
-	int sinbinned = 0;
-	unsigned long long timestamp;
-	unsigned long flags;
-	spinlock_t *rql = task_rq_lock(tsk, &flags);
-
-	stats->timestamp = tsk->rq->timestamp_last_tick;
-	stats->cycle_count = tsk->cycle_count;
-	stats->total_sleep = tsk->total_sleep;
-	stats->total_cpu = tsk->total_cpu;
-	stats->total_delay = tsk->total_delay;
-	stats->total_sinbin = tsk->total_sinbin;
-	stats->intr_wake_ups = tsk->intr_wake_ups;
-	timestamp = tsk->sched_timestamp;
-	if ((on_runq = task_queued(tsk)))
-		on_cpu = task_is_running(tsk);
-	else
-		sinbinned = task_is_sinbinned(tsk);
-
-	task_rq_unlock(rql, &flags);
-
-	/*
-	 * Update values to the previous tick (only)
-	 */
-	if (stats->timestamp > timestamp) {
-		unsigned long long delta = stats->timestamp - timestamp;
-
-		if (on_cpu) {
-			stats->total_cpu += delta;
-		} else if (on_runq || sinbinned) {
-			stats->total_delay += delta;
-			if (sinbinned)
-				stats->total_sinbin += delta;
-		} else {
-			stats->total_sleep += delta;
-		}
-	}
-}
-
-EXPORT_SYMBOL(get_task_sched_stats);
-
-/*
- * Get scheduling statistics for the nominated CPU
- */
-void get_cpu_sched_stats(unsigned int cpu, struct cpu_sched_stats *stats)
-{
-	int idle;
-	unsigned long long idle_timestamp;
-	runqueue_t *rq = cpu_rq(cpu);
-
-	/*
-	 * No need to crash the whole machine if they've asked for stats for
-	 * a non existent CPU, just send back zero.
-	 */
-	if (rq == NULL) {
-		stats->timestamp = 0;
-		stats->total_idle = 0;
-		stats->total_busy = 0;
-		stats->total_delay = 0;
-		stats->total_sinbin = 0;
-		stats->nr_switches = 0;
-
-		return;
-	}
-	local_irq_disable();
-	spin_lock(&rq->lock);
-	idle = rq->curr == rq->idle;
-	stats->timestamp = rq->timestamp_last_tick;
-	idle_timestamp = rq->idle->sched_timestamp;
-	stats->total_idle = rq->idle->total_cpu;
-	stats->total_busy = rq->idle->total_delay;
-	stats->total_delay = rq->total_delay;
-	stats->total_sinbin = rq->total_sinbin;
-	stats->nr_switches = rq->nr_switches;
-	spin_unlock_irq(&rq->lock);
-
-	/*
-	 * Update idle/busy time to the current tick
-	 */
-	if (idle)
-		stats->total_idle += (stats->timestamp - idle_timestamp);
-	else
-		stats->total_busy += (stats->timestamp - idle_timestamp);
-}
-
-EXPORT_SYMBOL(get_cpu_sched_stats);
-
 /**
  * sys_sched_yield - yield the current processor to other threads.
  *
+ * This function yields the current CPU by moving the calling thread
+ * to the expired array. If there are no other threads running on this
  * CPU then this function will return.
  */
 asmlinkage long sys_sched_yield(void)
 {
-	spinlock_t *rql = this_rq_lock();
+	runqueue_t *rq = this_rq_lock();
+	prio_array_t *array = current->array;
+	prio_array_t *target = rq->expired;
+
+	schedstat_inc(rq, yld_cnt);
+	/*
+	 * We implement yielding by moving the task into the expired
+	 * queue.
+	 *
+	 * (special rule: RT tasks will just roundrobin in the active
+	 *  array.)
+	 */
+	if (rt_task(current))
+		target = rq->active;
+
+	if (current->array->nr_active == 1) {
+		schedstat_inc(rq, yld_act_empty);
+		if (!rq->expired->nr_active)
+			schedstat_inc(rq, yld_both_empty);
+	} else if (!rq->expired->nr_active)
+		schedstat_inc(rq, yld_exp_empty);
 
-	schedstat_inc(current->rq, yld_cnt);
-	/* If there's other tasks on this CPU make sure that at least
-	 * one of them get some CPU before this task's next bite of the
-	 * cherry.  Dequeue before looking for the appropriate run
-	 * queue so that we don't find our queue if we were the sole
-	 * occupant of that queue.
-	 */
-	dequeue_task(current);
-	/*
-	 * special rule: RT tasks will just roundrobin.
-	 */
-	if (likely(!rt_task(current))) {
-		int idx = find_next_bit(current->rq->bitmap, IDLE_PRIO, current->prio);
-		if (idx < IDLE_PRIO)
-			current->prio = idx;
-	}
-	enqueue_task(current);
-	if (current->rq->nr_running == 1)
-		schedstat_inc(current->rq, yld_both_empty);
+	dequeue_task(current, array);
+	enqueue_task(current, target);
 
 	/*
 	 * Since we are going to call schedule() anyway, there's
 	 * no need to preempt or enable interrupts:
 	 */
-	_raw_spin_unlock(rql);
+	_raw_spin_unlock(&rq->lock);
 	preempt_enable_no_resched();
 
 	schedule();
@@ -4381,25 +3698,15 @@ void __devinit init_idle(task_t *idle, i
 	runqueue_t *rq = cpu_rq(cpu);
 	unsigned long flags;
 
-	idle->prio = IDLE_PRIO;
-	/*
-	 * Initialize scheduling statistics counters as they may provide
-	 * valuable about the CPU e.g. avg_cpu_time_per_cycle for the idle
-	 * task will be an estimate of the average time the CPU is idle
-	 */
-	initialize_stats(idle);
-	initialize_bonuses(idle);
+	idle->sleep_avg = 0;
+	idle->interactive_credit = 0;
+	idle->array = NULL;
+	idle->prio = MAX_PRIO;
 	idle->state = TASK_RUNNING;
 	set_task_cpu(idle, cpu);
 
 	spin_lock_irqsave(&rq->lock, flags);
 	rq->curr = rq->idle = idle;
-	idle->sched_timestamp = adjusted_sched_clock(idle);
-	/*
-	 * Putting the idle process onto a run queue simplifies the selection of
-	 * the next task to run in schedule().
-	 */
-	enqueue_task(idle);
 	set_tsk_need_resched(idle);
 	spin_unlock_irqrestore(&rq->lock, flags);
 
@@ -4451,11 +3758,11 @@ int set_cpus_allowed(task_t *p, cpumask_
 	unsigned long flags;
 	int ret = 0;
 	migration_req_t req;
-	spinlock_t *rql;
+	runqueue_t *rq;
 
 	perfctr_set_cpus_allowed(p, new_mask);
 
-	rql = task_rq_lock(p, &flags);
+	rq = task_rq_lock(p, &flags);
 	if (!cpus_intersects(new_mask, cpu_online_map)) {
 		ret = -EINVAL;
 		goto out;
@@ -4468,14 +3775,14 @@ int set_cpus_allowed(task_t *p, cpumask_
 
 	if (migrate_task(p, any_online_cpu(new_mask), &req)) {
 		/* Need help from migration thread: drop lock and wait. */
-		task_rq_unlock(rql, &flags);
-		wake_up_process(p->rq->migration_thread);
+		task_rq_unlock(rq, &flags);
+		wake_up_process(rq->migration_thread);
 		wait_for_completion(&req.done);
 		tlb_migrate_finish(p->mm);
 		return 0;
 	}
 out:
-	task_rq_unlock(rql, &flags);
+	task_rq_unlock(rq, &flags);
 	return ret;
 }
 
@@ -4508,25 +3815,21 @@ static void __migrate_task(struct task_s
 	if (!cpu_isset(dest_cpu, p->cpus_allowed))
 		goto out;
 
-	if (task_queued(p)) {
-		/*
-		 * Don't do set_task_cpu() until AFTER we dequeue the task,
-		 * since dequeue_task() relies on p->rq always being accurate.
-		 */
-		deactivate_task(p);
-		delta_delay_stats(p, adjusted_sched_clock(p));
-		set_task_cpu(p, dest_cpu);
+	set_task_cpu(p, dest_cpu);
+	if (p->array) {
 		/*
-		 *  activate_task() will set the timestamp correctly so there's
-		 *  no need to adjust it here
+		 * Sync timestamp with rq_dest's before activating.
+		 * The same thing could be achieved by doing this step
+		 * afterwards, and pretending it was a local activate.
+		 * This way is cleaner and logically correct.
 		 */
-		activate_task(p);
-		preempt_curr_if_warranted(p);
-	} else {
-		delta_sleep_stats(p, adjusted_sched_clock(p));
-		set_task_cpu(p, dest_cpu);
+		p->timestamp = p->timestamp - rq_src->timestamp_last_tick
+				+ rq_dest->timestamp_last_tick;
+		deactivate_task(p, rq_src);
+		activate_task(p, rq_dest, 0);
+		if (TASK_PREEMPTS_CURR(p, rq_dest))
+			resched_task(rq_dest->curr);
 	}
-	adjust_sched_timestamp(p, rq_src);
 
 out:
 	double_rq_unlock(rq_src, rq_dest);
@@ -4676,10 +3979,9 @@ void sched_idle_next(void)
 	 */
 	spin_lock_irqsave(&rq->lock, flags);
 
-	dequeue_task(p);
 	__setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
 	/* Add idle task to _front_ of it's priority queue */
-	__activate_task_head(p);
+	__activate_idle_task(p, rq);
 
 	spin_unlock_irqrestore(&rq->lock, flags);
 }
@@ -4711,13 +4013,17 @@ static void migrate_dead(unsigned int de
 /* release_task() removes task from tasklist, so we won't find dead tasks. */
 static void migrate_dead_tasks(unsigned int dead_cpu)
 {
-	unsigned i;
+	unsigned arr, i;
 	struct runqueue *rq = cpu_rq(dead_cpu);
 
-	for (i = 0; i < IDLE_PRIO; i++) {
-		struct list_head *list = &rq->queues[i].queue;
-		while (!list_empty(list))
-			migrate_dead(dead_cpu, list_entry(list->next, task_t, run_list));
+	for (arr = 0; arr < 2; arr++) {
+		for (i = 0; i < MAX_PRIO; i++) {
+			struct list_head *list = &rq->arrays[arr].queue[i];
+			while (!list_empty(list))
+				migrate_dead(dead_cpu,
+					     list_entry(list->next, task_t,
+							run_list));
+		}
 	}
 }
 #endif /* CONFIG_HOTPLUG_CPU */
@@ -4731,10 +4037,7 @@ static int migration_call(struct notifie
 {
 	int cpu = (long)hcpu;
 	struct task_struct *p;
-#ifdef CONFIG_HOTPLUG_CPU
 	struct runqueue *rq;
-#endif
-	spinlock_t *rql;
 	unsigned long flags;
 
 	switch (action) {
@@ -4745,9 +4048,9 @@ static int migration_call(struct notifie
 		p->flags |= PF_NOFREEZE;
 		kthread_bind(p, cpu);
 		/* Must be high prio: stop_machine expects to yield to it. */
-		rql = task_rq_lock(p, &flags);
+		rq = task_rq_lock(p, &flags);
 		__setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
-		task_rq_unlock(rql, &flags);
+		task_rq_unlock(rq, &flags);
 		cpu_rq(cpu)->migration_thread = p;
 		break;
 	case CPU_ONLINE:
@@ -4766,14 +4069,13 @@ static int migration_call(struct notifie
 		rq = cpu_rq(cpu);
 		kthread_stop(rq->migration_thread);
 		rq->migration_thread = NULL;
-		/* Idle task back to normal in IDLE_PRIO slot */
-		rql = task_rq_lock(rq->idle, &flags);
-		deactivate_task(rq->idle);
-		rq->idle->static_prio = IDLE_PRIO;
+		/* Idle task back to normal (off runqueue, low prio) */
+		rq = task_rq_lock(rq->idle, &flags);
+		deactivate_task(rq->idle, rq);
+		rq->idle->static_prio = MAX_PRIO;
 		__setscheduler(rq->idle, SCHED_NORMAL, 0);
-		enqueue_task(rq->idle);
 		migrate_dead_tasks(cpu);
-		task_rq_unlock(rql, &flags);
+		task_rq_unlock(rq, &flags);
 		BUG_ON(rq->nr_running != 0);
 
 		/* No need to migrate the tasks: it was best-effort if
@@ -5254,11 +4556,16 @@ int in_sched_functions(unsigned long add
 void __init sched_init(void)
 {
 	runqueue_t *rq;
-	int i, k;
+	int i, j, k;
 
 	for (i = 0; i < NR_CPUS; i++) {
+		prio_array_t *array;
+
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
+		rq->active = rq->arrays;
+		rq->expired = rq->arrays + 1;
+		rq->best_expired_prio = MAX_PRIO;
 
 #ifdef CONFIG_SMP
 		rq->sd = &sched_domain_dummy;
@@ -5270,24 +4577,16 @@ void __init sched_init(void)
 #endif
 		atomic_set(&rq->nr_iowait, 0);
 
-		for (k = 0; k <= IDLE_PRIO; k++) {
-			rq->queues[k].prio = k;
-			INIT_LIST_HEAD(&rq->queues[k].queue);
+		for (j = 0; j < 2; j++) {
+			array = rq->arrays + j;
+			for (k = 0; k < MAX_PRIO; k++) {
+				INIT_LIST_HEAD(array->queue + k);
+				__clear_bit(k, array->bitmap);
+			}
+			/* delimiter for bitsearch */
+			__set_bit(MAX_PRIO, array->bitmap);
 		}
-		bitmap_zero(rq->bitmap, NUM_PRIO_SLOTS);
-		/* delimiter for bitsearch */
-		__set_bit(IDLE_PRIO, rq->bitmap);
-		rq->timestamp_last_tick = 0;
-		rq->next_prom_due = ULONG_MAX;
-		rq->pcount = 0;
-		rq->total_delay = 0;
-		rq->eb_yardstick = 0;
-		rq->eb_ticks_to_decay = time_slice_ticks;
-		rq->avg_nr_running = 0;
-		rq->total_sinbin = 0;
 	}
-	current->rq = this_rq();
-	current->sched_timestamp = sched_clock();
 
 	/*
 	 * The boot idle thread does lazy MMU switching as well:
@@ -5325,256 +4624,3 @@ void __might_sleep(char *file, int line)
 }
 EXPORT_SYMBOL(__might_sleep);
 #endif
-
-#if defined(CONFIG_SYSCTL)
-/*
- * CPU scheduler control via /proc/sys/cpusched/xxx
- */
-enum
-{
-	CPU_SCHED_END_OF_LIST=0,
-	CPU_SCHED_TIME_SLICE=1,
-	CPU_SCHED_SCHED_RR_TIME_SLICE,
-	CPU_SCHED_BASE_PROMOTION_INTERVAL,
-	CPU_SCHED_MAX_IA_BONUS,
-	CPU_SCHED_MAX_TPT_BONUS,
-	CPU_SCHED_IA_THRESHOLD,
-	CPU_SCHED_CPU_HOG_THRESHOLD,
-	CPU_SCHED_LOG_AT_EXIT,
-	CPU_SCHED_MODE,
-	CPU_SCHED_INITIAL_IA_BONUS,
-	CPU_SCHED_UNPRIV_RT_THRESHOLD,
-	CPU_SCHED_BGND_TIME_SLICE_MULTIPLIER
-};
-
-static const unsigned int zero = 0;
-static const unsigned int one = 1;
-#define min_milli_value zero
-static const unsigned int max_milli_value = 1000;
-#define min_max_ia_bonus zero
-static const unsigned int max_max_ia_bonus = MAX_MAX_IA_BONUS;
-#define min_max_tpt_bonus zero
-static const unsigned int max_max_tpt_bonus = MAX_MAX_TPT_BONUS;
-static unsigned int time_slice_msecs = DEFAULT_TIME_SLICE_MSECS;
-static unsigned int sched_rr_time_slice_msecs = DEFAULT_TIME_SLICE_MSECS;
-#define min_time_slice_msecs one
-static const unsigned int max_time_slice_msecs = MAX_TIME_SLICE_MSECS;
-static unsigned int base_prom_interval_msecs = BASE_PROM_INTERVAL_MSECS;
-#define min_base_prom_interval_msecs one
-static const unsigned int max_base_prom_interval_msecs = INT_MAX;
-#define min_sched_bgnd_time_slice_multiplier one
-static const unsigned int max_sched_bgnd_time_slice_multiplier = 100;
-
-static int proc_time_slice_msecs(ctl_table *ctp, int write, struct file *fp,
-				void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	int res = proc_dointvec_minmax(ctp, write, fp, buffer, lenp, ppos);
-
-	if ((res == 0) && write)
-		time_slice_ticks = MSECS_TO_JIFFIES_MIN_1(time_slice_msecs);
-
-	return res;
-}
-
-static int proc_sched_rr_time_slice_msecs(ctl_table *ctp, int write, struct file *fp,
-				void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	int res = proc_dointvec_minmax(ctp, write, fp, buffer, lenp, ppos);
-
-	if ((res == 0) && write)
-		sched_rr_time_slice_ticks = MSECS_TO_JIFFIES_MIN_1(sched_rr_time_slice_msecs);
-
-	return res;
-}
-
-static int proc_base_prom_interval_msecs(ctl_table *ctp, int write, struct file *fp,
-				void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	int res = proc_dointvec_minmax(ctp, write, fp, buffer, lenp, ppos);
-
-	if ((res == 0) && write)
-		base_prom_interval_ticks = MSECS_TO_JIFFIES_MIN_1(base_prom_interval_msecs);
-
-	return res;
-}
-
-static int proc_cpu_hog_threshold(ctl_table *ctp, int write, struct file *fp,
-				void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	int res = proc_dointvec_minmax(ctp, write, fp, buffer, lenp, ppos);
-
-	if ((res == 0) && write)
-		cpu_hog_threshold = calc_proportion(cpu_hog_threshold_ppt, 1000);
-
-	return res;
-}
-
-static int proc_ia_threshold(ctl_table *ctp, int write, struct file *fp,
-				void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	int res = proc_dointvec_minmax(ctp, write, fp, buffer, lenp, ppos);
-
-	if ((res == 0) && write)
-		ia_threshold = calc_proportion(ia_threshold_ppt, 1000);
-
-	return res;
-}
-
-static int proc_unpriv_rt_threshold(ctl_table *ctp, int write, struct file *fp,
-				void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	int res = proc_dointvec_minmax(ctp, write, fp, buffer, lenp, ppos);
-
-	if ((res == 0) && write)
-		unpriv_rt_threshold = calc_proportion(unpriv_rt_threshold_ppt, 1000);
-
-	return res;
-}
-
-#define SCHED_MODE_BUFFER_LEN 16
-static char current_sched_mode[SCHED_MODE_BUFFER_LEN] = "";
-static int proc_sched_mode(ctl_table *ctp, int write, struct file *fp,
-				void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	int res;
-
-	strcpy(current_sched_mode, sched_mode_names[sched_mode]);
-	res = proc_dostring(ctp, write, fp, buffer, lenp, ppos);
-
-	if ((res == 0) && write) {
-		int i;
-
-		for (i = 0; sched_mode_names[i] != NULL; i++)
-			if (strcmp(current_sched_mode, sched_mode_names[i]) == 0)
-				break;
-		if (sched_mode_names[i] == NULL)
-			res = -EINVAL;
-		else /* set the scheduling mode */
-			sched_mode = i;
-	}
-
-	return res;
-}
-
-ctl_table cpu_sched_table[] = {
-	{
-		.ctl_name	= CPU_SCHED_TIME_SLICE,
-		.procname	= "time_slice",
-		.data		= &time_slice_msecs,
-		.maxlen		= sizeof (unsigned int),
-		.mode		= 0644,
-		.proc_handler	= &proc_time_slice_msecs,
-		.extra1		= (void *)&min_time_slice_msecs,
-		.extra2		= (void *)&max_time_slice_msecs
-	},
-	{
-		.ctl_name	= CPU_SCHED_SCHED_RR_TIME_SLICE,
-		.procname	= "sched_rr_time_slice",
-		.data		= &sched_rr_time_slice_msecs,
-		.maxlen		= sizeof (unsigned int),
-		.mode		= 0644,
-		.proc_handler	= &proc_sched_rr_time_slice_msecs,
-		.extra1		= (void *)&min_time_slice_msecs,
-		.extra2		= (void *)&max_time_slice_msecs
-	},
-	{
-		.ctl_name	= CPU_SCHED_BASE_PROMOTION_INTERVAL,
-		.procname	= "base_promotion_interval",
-		.data		= &base_prom_interval_msecs,
-		.maxlen		= sizeof (unsigned int),
-		.mode		= 0644,
-		.proc_handler	= &proc_base_prom_interval_msecs,
-		.extra1		= (void *)&min_base_prom_interval_msecs,
-		.extra2		= (void *)&max_base_prom_interval_msecs
-	},
-	{
-		.ctl_name	= CPU_SCHED_MAX_IA_BONUS,
-		.procname	= "max_ia_bonus",
-		.data		= &max_ia_bonus,
-		.maxlen		= sizeof (unsigned int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.extra1		= (void *)&min_max_ia_bonus,
-		.extra2		= (void *)&max_max_ia_bonus
-	},
-	{
-		.ctl_name	= CPU_SCHED_INITIAL_IA_BONUS,
-		.procname	= "initial_ia_bonus",
-		.data		= &initial_ia_bonus,
-		.maxlen		= sizeof (unsigned int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.extra1		= (void *)&min_max_ia_bonus,
-		.extra2		= (void *)&max_max_ia_bonus
-	},
-	{
-		.ctl_name	= CPU_SCHED_MAX_TPT_BONUS,
-		.procname	= "max_tpt_bonus",
-		.data		= &max_tpt_bonus,
-		.maxlen		= sizeof (unsigned int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.extra1		= (void *)&min_max_tpt_bonus,
-		.extra2		= (void *)&max_max_tpt_bonus
-	},
-	{
-		.ctl_name	= CPU_SCHED_IA_THRESHOLD,
-		.procname	= "ia_threshold",
-		.data		= &ia_threshold_ppt,
-		.maxlen		= sizeof (unsigned int),
-		.mode		= 0644,
-		.proc_handler	= &proc_ia_threshold,
-		.extra1		= (void *)&min_milli_value,
-		.extra2		= (void *)&max_milli_value
-	},
-	{
-		.ctl_name	= CPU_SCHED_CPU_HOG_THRESHOLD,
-		.procname	= "cpu_hog_threshold",
-		.data		= &cpu_hog_threshold_ppt,
-		.maxlen		= sizeof (unsigned int),
-		.mode		= 0644,
-		.proc_handler	= &proc_cpu_hog_threshold,
-		.extra1		= (void *)&min_milli_value,
-		.extra2		= (void *)&max_milli_value
-	},
-	{
-		.ctl_name	= CPU_SCHED_UNPRIV_RT_THRESHOLD,
-		.procname	= "unpriv_rt_threshold",
-		.data		= &unpriv_rt_threshold_ppt,
-		.maxlen		= sizeof (unsigned int),
-		.mode		= 0644,
-		.proc_handler	= &proc_unpriv_rt_threshold,
-		.extra1		= (void *)&min_milli_value,
-		.extra2		= (void *)&max_milli_value
-	},
-	{
-		.ctl_name	= CPU_SCHED_LOG_AT_EXIT,
-		.procname	= "log_at_exit",
-		.data		= &log_at_exit,
-		.maxlen		= sizeof (unsigned int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.extra1		= (void *)&zero,
-		.extra2		= (void *)&one
-	},
-	{
-		.ctl_name	= CPU_SCHED_MODE,
-		.procname	= "mode",
-		.data		= &current_sched_mode,
-		.maxlen		= SCHED_MODE_BUFFER_LEN,
-		.mode		= 0644,
-		.proc_handler	= &proc_sched_mode,
-	},
-	{
-		.ctl_name	= CPU_SCHED_BGND_TIME_SLICE_MULTIPLIER,
-		.procname	= "bgnd_time_slice_multiplier",
-		.data		= &bgnd_time_slice_multiplier,
-		.maxlen		= sizeof (unsigned int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.extra1		= (void *)&min_sched_bgnd_time_slice_multiplier,
-		.extra2		= (void *)&max_sched_bgnd_time_slice_multiplier
-	},
-	{ .ctl_name = CPU_SCHED_END_OF_LIST }
-};
-#endif
Index: linux-2.6.9-rc2-mm2/kernel/sysctl.c
===================================================================
--- linux-2.6.9-rc2-mm2.orig/kernel/sysctl.c	2004-09-23 09:59:23.611797968 +1000
+++ linux-2.6.9-rc2-mm2/kernel/sysctl.c	2004-09-23 10:00:41.397972672 +1000
@@ -149,10 +149,6 @@ extern ctl_table pty_table[];
 #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
 int sysctl_legacy_va_layout;
 #endif
-/*
- * CPU scheduler control variables (lives in sched.c)
- */
-extern ctl_table cpu_sched_table[];
 
 /* /proc declarations: */
 
@@ -628,12 +624,6 @@ static ctl_table kern_table[] = {
 		.proc_handler   = &proc_unknown_nmi_panic,
 	},
 #endif
-	{
-		.ctl_name	= KERN_CPU_SCHED,
-		.procname	= "cpusched",
-		.mode		= 0555,
-		.child		= cpu_sched_table,
-	},
 	{ .ctl_name = 0 }
 };
 
