Index: linux-2.6.11-rc2-ck1/include/linux/init_task.h
===================================================================
--- linux-2.6.11-rc2-ck1.orig/include/linux/init_task.h	2005-01-27 10:12:53.593087737 +1100
+++ linux-2.6.11-rc2-ck1/include/linux/init_task.h	2005-01-27 10:46:52.719907233 +1100
@@ -80,6 +80,7 @@ extern struct group_info init_groups;
 	.mm		= NULL,						\
 	.active_mm	= &init_mm,					\
 	.run_list	= LIST_HEAD_INIT(tsk.run_list),			\
+	.iso_list	= LIST_HEAD_INIT(tsk.iso_list),			\
 	.time_slice	= HZ,						\
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
 	.ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children),		\
Index: linux-2.6.11-rc2-ck1/include/linux/sched.h
===================================================================
--- linux-2.6.11-rc2-ck1.orig/include/linux/sched.h	2005-01-27 10:12:53.594087576 +1100
+++ linux-2.6.11-rc2-ck1/include/linux/sched.h	2005-01-27 10:48:13.696914369 +1100
@@ -131,15 +131,25 @@ extern unsigned long nr_iowait(void);
 #define SCHED_FIFO		1
 #define SCHED_RR		2
 #define SCHED_BATCH		3
+#define SCHED_ISO_RR		4
+#define SCHED_ISO_FIFO		5
 
 #define SCHED_MIN		0
-#define SCHED_MAX		3
+#define SCHED_MAX		5
 
 #define SCHED_RANGE(policy)	((policy) >= SCHED_MIN && \
 					(policy) <= SCHED_MAX)
 #define SCHED_RT(policy)	((policy) == SCHED_FIFO || \
 					(policy) == SCHED_RR)
 					
+#define SCHED_ISO(policy)	((policy) == SCHED_ISO_RR || \
+				(policy) == SCHED_ISO_FIFO)
+
+/* The policies that support a real time priority setting */
+#define SCHED_RT_PRIO(policy)	(SCHED_RT(policy) || SCHED_ISO(policy))
+  
+extern int iso_cpu, iso_period;
+
 struct sched_param {
 	int sched_priority;
 };
@@ -365,10 +375,12 @@ struct signal_struct {
 
 #define MAX_USER_RT_PRIO	100
 #define MAX_RT_PRIO		MAX_USER_RT_PRIO
+#define ISO_PRIO		(MAX_RT_PRIO - 1)
 
 #define MAX_PRIO		(MAX_RT_PRIO + 41)
 
-#define rt_task(p)		(unlikely((p)->prio < MAX_RT_PRIO))
+#define rt_task(p)		(unlikely((p)->prio < ISO_PRIO))
+#define iso_task(p)		(unlikely(SCHED_ISO((p)->policy)))
 #define batch_task(p)		((p)->policy == SCHED_BATCH)
 
 /*
@@ -546,6 +558,7 @@ struct task_struct {
 
 	int prio, static_prio;
 	struct list_head run_list;
+	struct list_head iso_list;
 
 	unsigned long long timestamp;
 	unsigned long runtime, totalrun, ns_debit;
Index: linux-2.6.11-rc2-ck1/include/linux/sysctl.h
===================================================================
--- linux-2.6.11-rc2-ck1.orig/include/linux/sysctl.h	2005-01-27 10:12:49.020821363 +1100
+++ linux-2.6.11-rc2-ck1/include/linux/sysctl.h	2005-01-27 10:46:53.262820122 +1100
@@ -137,6 +137,8 @@ enum
 	KERN_BOOTLOADER_TYPE=67, /* int: boot loader type */
 	KERN_INTERACTIVE=68,	/* interactive tasks can have cpu bursts */
 	KERN_COMPUTE=69,	/* adjust timeslices for a compute server */
+	KERN_ISO_CPU=70,	/* int: cpu% allowed by SCHED_ISO class */
+	KERN_ISO_PERIOD=71,	/* int: seconds over which SCHED_ISO cpu is decayed */
 };
 
 
Index: linux-2.6.11-rc2-ck1/kernel/sched.c
===================================================================
--- linux-2.6.11-rc2-ck1.orig/kernel/sched.c	2005-01-27 10:12:53.597087095 +1100
+++ linux-2.6.11-rc2-ck1/kernel/sched.c	2005-01-27 10:42:16.059297782 +1100
@@ -84,10 +84,16 @@ int sched_compute = 0;
 #define _RR_INTERVAL		((10 * HZ / 1000) ? : 1)
 #define RR_INTERVAL()		(_RR_INTERVAL * (1 + 9 * sched_compute))
 
+int iso_cpu = 70;	/* The soft %cpu limit on SCHED_ISO tasks */
+int iso_period = 5;	/* The time over which SCHED_ISO cpu decays */
+
 #define task_hot(p, now, sd) ((long long) ((now) - (p)->timestamp)	\
 				< (long long) (sd)->cache_hot_time)
 
 
+#define ISO_BITMAP_SIZE ((((MAX_USER_RT_PRIO+1+7)/8)+sizeof(long)-1)/ \
+			sizeof(long))
+
 /*
  * These are the runqueue data structures:
  */
@@ -112,6 +118,18 @@ struct runqueue {
 #ifdef CONFIG_SMP
 	unsigned long cpu_load;
 #endif
+	unsigned long iso_ticks;
+	unsigned long iso_running;
+	unsigned long iso_bitmap[ISO_BITMAP_SIZE];
+	struct list_head iso_queue[MAX_USER_RT_PRIO];
+	int iso_refractory;
+	/*
+	 * Refractory is the flag that we've hit the maximum iso cpu and are
+	 * in the refractory period where SCHED_ISO tasks can only run as
+	 * SCHED_NORMAL until their cpu usage drops to 90% of their iso_cpu
+	 * limit.
+	 */
+
 	unsigned long long nr_switches;
 
 	/*
@@ -490,21 +508,53 @@ static inline int task_queued(task_t *ta
 	return !list_empty(&task->run_list);
 }
 
+/* We invert the ISO rt_priorities for queueing order */
+#define iso_prio(p)	(ISO_PRIO - (p)->rt_priority)
+
+static void dequeue_iso_task(struct task_struct *p, runqueue_t *rq)
+{
+	rq->iso_running--;
+	list_del(&p->iso_list);
+	if (list_empty(rq->iso_queue + iso_prio(p)))
+		__clear_bit(iso_prio(p), rq->iso_bitmap);
+}
+
 /*
  * Adding/removing a task to/from a runqueue:
  */
 static void dequeue_task(struct task_struct *p, runqueue_t *rq)
 {
+	if (iso_task(p))
+		dequeue_iso_task(p, rq);
 	list_del_init(&p->run_list);
 	if (list_empty(rq->queue + p->prio))
 		__clear_bit(p->prio, rq->bitmap);
 	p->ns_debit = 0;
 }
 
+/*
+ * SCHED_ISO tasks are queued on both runqueues. Their actual priority is
+ * either better than SCHED_NORMAL if below starvation limits, or
+ * their underlying SCHED_NORMAL dynamic priority.
+ */
+static void enqueue_iso_task(struct task_struct *p, runqueue_t *rq)
+{
+	list_add_tail(&p->iso_list, rq->iso_queue + iso_prio(p));
+	__set_bit(iso_prio(p), rq->iso_bitmap);
+	rq->iso_running++;
+}
+
 static void enqueue_task(struct task_struct *p, runqueue_t *rq)
 {
 	list_add_tail(&p->run_list, rq->queue + p->prio);
 	__set_bit(p->prio, rq->bitmap);
+	if (iso_task(p))
+		enqueue_iso_task(p, rq);
+}
+
+static void requeue_iso_task(struct task_struct *p, runqueue_t *rq)
+{
+	list_move_tail(&p->iso_list, rq->iso_queue + iso_prio(p));
 }
 
 /*
@@ -514,6 +564,8 @@ static void enqueue_task(struct task_str
 static void requeue_task(struct task_struct *p, runqueue_t *rq)
 {
 	list_move_tail(&p->run_list, rq->queue + p->prio);
+	if (iso_task(p))
+		requeue_iso_task(p, rq);
 }
 
 static void enqueue_task_head(struct task_struct *p, runqueue_t *rq)
@@ -947,11 +999,28 @@ static int cache_delay = 10 * HZ / 1000;
  */
 static void preempt(task_t *p, runqueue_t *rq)
 {
-	if (p->prio > rq->curr->prio)
+	int p_prio = p->prio, curr_prio = rq->curr->prio;
+
+	if (!iso_task(p) && !iso_task(rq->curr))
+		goto out;	/* no ISO task involved: compare plain prio */
+	if (!rq->iso_refractory) {
+		if (iso_task(p)) {
+			if (iso_task(rq->curr)) {
+				p_prio = -p->rt_priority;
+				curr_prio = -rq->curr->rt_priority;
+				goto out;	/* both ISO: rank by rt_priority */
+			}
+			p_prio = ISO_PRIO;	/* p competes at ISO_PRIO */
+		}
+		if (iso_task(rq->curr))
+			curr_prio = ISO_PRIO;	/* curr competes at ISO_PRIO */
+	}
+out:
+	if (p_prio > curr_prio)
 		return;
-	if (p->prio == rq->curr->prio &&
+	if (p_prio == curr_prio &&
 		((p->totalrun || p->slice != slice(p)) ||
-		rt_task(rq->curr)))
+		rt_task(rq->curr) || iso_task(rq->curr)))
 			return;
 	if (!sched_compute || rq->cache_ticks >= cache_delay ||
 		!p->mm || rt_task(p))
@@ -1136,6 +1205,7 @@ void fastcall sched_fork(task_t *p)
 	 */
 	p->state = TASK_RUNNING;
 	INIT_LIST_HEAD(&p->run_list);
+	INIT_LIST_HEAD(&p->iso_list);
 	spin_lock_init(&p->switch_lock);
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -2270,6 +2340,29 @@ static void time_slice_expired(task_t *p
 	enqueue_task(p, rq);
 }
 
+static inline void inc_iso_ticks(runqueue_t *rq, task_t *p)
+{
+	if (rq->iso_ticks < (iso_period * HZ * 100 - 99))
+		rq->iso_ticks += 100;
+	spin_lock(&rq->lock);
+	if (!rq->iso_refractory && (rq->iso_ticks /
+		((iso_period * HZ) + 1) > iso_cpu))
+			rq->iso_refractory = 1;
+	spin_unlock(&rq->lock);
+}
+
+static inline void dec_iso_ticks(runqueue_t *rq, task_t *p)
+{
+	if (rq->iso_ticks)
+		rq->iso_ticks = rq->iso_ticks * (iso_period * HZ - 1) /
+			(iso_period * HZ);
+	spin_lock(&rq->lock);
+	if (rq->iso_refractory && (rq->iso_ticks /
+		((iso_period * HZ) + 1) < (iso_cpu * 9 / 10)))
+			rq->iso_refractory = 0;
+	spin_unlock(&rq->lock);
+}
+
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
@@ -2283,6 +2376,17 @@ void scheduler_tick(void)
 
 	rq->timestamp_last_tick = sched_clock();
 
+	/*
+	 * The iso_ticks accounting is incremented only when a SCHED_ISO task
+	 * is running in soft rt mode. Running rt_tasks are also accounted
+	 * to make the iso_cpu a proportion of cpu available for SCHED_NORMAL
+	 * tasks only.
+	 */
+	if (rt_task(p) || (iso_task(p) && !rq->iso_refractory))
+		inc_iso_ticks(rq, p);
+	else
+		dec_iso_ticks(rq, p);
+
 	if (p == rq->idle) {
 		if (wake_priority_sleeper(rq))
 			goto out;
@@ -2307,6 +2411,23 @@ void scheduler_tick(void)
 	if (p->ns_debit < NSJIFFY)
 		goto out_unlock;
 	p->ns_debit %= NSJIFFY;
+
+	if (iso_task(p)) {
+		if (rq->iso_refractory)
+			/*
+			 * If we are in the refractory period for SCHED_ISO
+			 * tasks we schedule them as SCHED_NORMAL.
+			 */
+			goto sched_normal;
+		if (p->policy == SCHED_ISO_RR && !--p->slice) {
+			p->slice = slice(p);
+			set_tsk_need_resched(p);
+			requeue_iso_task(p, rq);
+		}
+		goto out_unlock;
+	}
+
+sched_normal:
 	/*
 	 * Tasks lose burst each time they use up a full slice().
 	 */
@@ -2334,6 +2455,28 @@ out:
 	rebalance_tick(cpu, rq, NOT_IDLE);
 }
 
+static inline int iso_ready(runqueue_t *rq)
+{
+	if (rq->iso_running && !rq->iso_refractory)
+		return 1;
+	return 0;
+}
+
+/* Find the next SCHED_ISO task */
+static task_t* find_iso(runqueue_t *rq)
+{
+	int idx = find_first_bit(rq->iso_bitmap, MAX_USER_RT_PRIO);
+	return list_entry(rq->iso_queue[idx].next, task_t, iso_list);
+}
+
+static inline task_t* find_next_task(runqueue_t *rq)
+{
+	int idx = sched_find_first_bit(rq->bitmap);
+	if (unlikely(iso_ready(rq) && idx > ISO_PRIO))
+		return find_iso(rq);
+	return list_entry(rq->queue[idx].next, task_t, run_list);
+}
+
 #ifdef CONFIG_SCHED_SMT
 static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 {
@@ -2407,8 +2550,7 @@ static int dependent_sleeper(int this_cp
 	if (!this_rq->nr_running)
 		goto out_unlock;
 
-	p = list_entry(this_rq->queue[sched_find_first_bit(this_rq->bitmap)].next,
-		task_t, run_list);
+	p = find_next_task(this_rq);
 
 	for_each_cpu_mask(i, sibling_map) {
 		runqueue_t *smt_rq = cpu_rq(i);
@@ -2424,7 +2566,7 @@ static int dependent_sleeper(int this_cp
 		 */
 		if (((smt_curr->slice * (100 - sd->per_cpu_gain) / 100) >
 			slice(p) || rt_task(smt_curr) || batch_task(p)) &&
-			p->mm && smt_curr->mm && !rt_task(p) &&
+			p->mm && smt_curr->mm && !rt_task(p) && !iso_task(p) &&
 			!batch_task(smt_curr))
 				ret = 1;
 
@@ -2434,9 +2576,9 @@ static int dependent_sleeper(int this_cp
 		 * reasons.
 		 */
 		if ((((p->slice * (100 - sd->per_cpu_gain) / 100) > 
-			slice(smt_curr) || rt_task(p) || batch_task(smt_curr)) && 
-			smt_curr->mm && p->mm && !rt_task(smt_curr) &&
-			!batch_task(p)) ||
+			slice(smt_curr) || rt_task(p) || iso_task(p) ||
+			batch_task(smt_curr)) && smt_curr->mm && p->mm &&
+			!rt_task(smt_curr) && !batch_task(p)) ||
 			(smt_curr == smt_rq->idle && smt_rq->nr_running))
 				resched_task(smt_curr);
 	}
@@ -2496,10 +2638,9 @@ asmlinkage void __sched schedule(void)
 	long *switch_count;
 	task_t *prev, *next;
 	runqueue_t *rq;
-	struct list_head *queue;
 	unsigned long long now;
 	unsigned long debit;
-	int cpu, idx;
+	int cpu;
 
 	/*
 	 * Test if we are atomic.  Since do_exit() needs to call into
@@ -2587,9 +2728,7 @@ go_idle:
 			goto go_idle;
 	}
 
-	idx = sched_find_first_bit(rq->bitmap);
-	queue = rq->queue + idx;
-	next = list_entry(queue->next, task_t, run_list);
+	next = find_next_task(rq);
 
 switch_tasks:
 	if (next == rq->idle)
@@ -3089,6 +3228,10 @@ asmlinkage long sys_nice(int increment)
  */
 int task_prio(const task_t *p)
 {
+	if (iso_task(p))
+		return -(p->rt_priority);
+	if (rt_task(p))
+		return -(MAX_RT_PRIO + p->rt_priority);
 	return p->prio - MAX_RT_PRIO;
 }
 
@@ -3166,22 +3309,34 @@ int sched_setscheduler(struct task_struc
 	runqueue_t *rq;
 
 recheck:
+	if (SCHED_RT(policy) && !capable(CAP_SYS_NICE)) {
+		/*
+		 * If the caller requested an RT policy without having the
+		 * necessary rights, we downgrade the policy to the
+		 * SCHED_ISO equivalent.
+		 */
+		if (policy == SCHED_RR)
+			policy = SCHED_ISO_RR;
+		else
+			policy = SCHED_ISO_FIFO;
+	}
+
 	/* double check policy once rq lock held */
 	if (policy < 0)
 		policy = oldpolicy = p->policy;
 	else if (!SCHED_RANGE(policy))
 			return -EINVAL;
 	/*
-	 * Valid priorities for SCHED_FIFO and SCHED_RR are
+	 * Valid priorities for SCHED_FIFO, SCHED_RR and SCHED_ISO are
 	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
 	 */
 	if (param->sched_priority < 0 ||
 	    param->sched_priority > MAX_USER_RT_PRIO-1)
 		return -EINVAL;
-	if ((!SCHED_RT(policy)) != (param->sched_priority == 0))
+	if ((!SCHED_RT_PRIO(policy)) != (param->sched_priority == 0))
 		return -EINVAL;
 
-	if ((SCHED_RT(policy)) && !capable(CAP_SYS_NICE))
+	if (SCHED_RT(policy) && !capable(CAP_SYS_NICE))
 		return -EPERM;
 	if ((current->euid != p->euid) && (current->euid != p->uid) &&
 	    !capable(CAP_SYS_NICE))
@@ -3631,6 +3786,8 @@ asmlinkage long sys_sched_get_priority_m
 	switch (policy) {
 	case SCHED_FIFO:
 	case SCHED_RR:
+	case SCHED_ISO_RR:
+	case SCHED_ISO_FIFO:
 		ret = MAX_USER_RT_PRIO-1;
 		break;
 	case SCHED_NORMAL:
@@ -3655,6 +3812,8 @@ asmlinkage long sys_sched_get_priority_m
 	switch (policy) {
 	case SCHED_FIFO:
 	case SCHED_RR:
+	case SCHED_ISO_RR:
+	case SCHED_ISO_FIFO:
 		ret = 1;
 		break;
 	case SCHED_NORMAL:
@@ -4709,6 +4868,7 @@ void __init sched_init(void)
 		spin_lock_init(&rq->lock);
 		rq->cache_ticks = 0;
 		rq->preempted = 0;
+		rq->iso_refractory = rq->iso_ticks = rq->iso_running = 0;
 
 #ifdef CONFIG_SMP
 		rq->sd = &sched_domain_dummy;
@@ -4727,6 +4887,12 @@ void __init sched_init(void)
 		 * delimiter for bitsearch
 		 */
 		__set_bit(MAX_PRIO, rq->bitmap);
+
+		for (j = 0; j < MAX_USER_RT_PRIO; j++) {
+			INIT_LIST_HEAD(rq->iso_queue + j);
+			__clear_bit(j, rq->iso_bitmap);
+		}
+		__set_bit(MAX_USER_RT_PRIO, rq->iso_bitmap);
 	}
 
 	/*
@@ -4776,7 +4942,7 @@ void normalize_rt_tasks(void)
 
 	read_lock_irq(&tasklist_lock);
 	for_each_process (p) {
-		if (!rt_task(p))
+		if (!rt_task(p) && !iso_task(p))
 			continue;
 
 		rq = task_rq_lock(p, &flags);
Index: linux-2.6.11-rc2-ck1/kernel/sysctl.c
===================================================================
--- linux-2.6.11-rc2-ck1.orig/kernel/sysctl.c	2005-01-27 10:12:49.027820240 +1100
+++ linux-2.6.11-rc2-ck1/kernel/sysctl.c	2005-01-27 10:46:53.262820122 +1100
@@ -219,6 +219,12 @@ static ctl_table root_table[] = {
 	{ .ctl_name = 0 }
 };
 
+/* Constants for minimum and maximum testing in kern_table and vm_table.
+   We use these as one-element integer vectors. */
+static int zero;
+static int one = 1;
+static int one_hundred = 100;
+
 static ctl_table kern_table[] = {
 	{
 		.ctl_name	= KERN_OSTYPE,
@@ -649,15 +655,31 @@ static ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
+	{
+		.ctl_name	= KERN_ISO_CPU,
+		.procname	= "iso_cpu",
+		.data		= &iso_cpu,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
+	{
+		.ctl_name	= KERN_ISO_PERIOD,
+		.procname	= "iso_period",
+		.data		= &iso_period,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		/* iso_period * HZ is a divisor in the scheduler tick: never 0 */
+		.extra1		= &one,
+	},
 	{ .ctl_name = 0 }
 };
 
-/* Constants for minimum and maximum testing in vm_table.
-   We use these as one-element integer vectors. */
-static int zero;
-static int one_hundred = 100;
-
-
 static ctl_table vm_table[] = {
 	{
 		.ctl_name	= VM_OVERCOMMIT_MEMORY,

