Index: linux-2.6.11-rc1-ck2/include/linux/init_task.h
===================================================================
--- linux-2.6.11-rc1-ck2.orig/include/linux/init_task.h	2005-01-19 10:34:32.106673064 +1100
+++ linux-2.6.11-rc1-ck2/include/linux/init_task.h	2005-01-19 10:36:41.805955776 +1100
@@ -80,6 +80,7 @@
 	.mm		= NULL,						\
 	.active_mm	= &init_mm,					\
 	.run_list	= LIST_HEAD_INIT(tsk.run_list),			\
+	.iso_list	= LIST_HEAD_INIT(tsk.iso_list),			\
 	.time_slice	= HZ,						\
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
 	.ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children),		\
Index: linux-2.6.11-rc1-ck2/include/linux/sched.h
===================================================================
--- linux-2.6.11-rc1-ck2.orig/include/linux/sched.h	2005-01-19 10:34:32.485615456 +1100
+++ linux-2.6.11-rc1-ck2/include/linux/sched.h	2005-01-19 13:42:49.157260328 +1100
@@ -131,15 +131,18 @@
 #define SCHED_FIFO		1
 #define SCHED_RR		2
 #define SCHED_BATCH		3
+#define SCHED_ISO		4
 
 #define SCHED_MIN		0
-#define SCHED_MAX		3
+#define SCHED_MAX		4
 
 #define SCHED_RANGE(policy)	((policy) >= SCHED_MIN && \
 					(policy) <= SCHED_MAX)
 #define SCHED_RT(policy)	((policy) == SCHED_FIFO || \
 					(policy) == SCHED_RR)
 					
+extern int iso_cpu, iso_period;
+
 struct sched_param {
 	int sched_priority;
 };
@@ -370,6 +373,7 @@
 
 #define rt_task(p)		(unlikely((p)->prio < MAX_RT_PRIO))
 #define batch_task(p)		((p)->policy == SCHED_BATCH)
+#define iso_task(p)		((p)->policy == SCHED_ISO)
 
 /*
  * Some day this will be a full-fledged user tracking system..
@@ -546,6 +550,7 @@
 
 	int prio, static_prio;
 	struct list_head run_list;
+	struct list_head iso_list;
 
 	unsigned long long timestamp;
 	unsigned long runtime, totalrun, ns_debit;
Index: linux-2.6.11-rc1-ck2/include/linux/sysctl.h
===================================================================
--- linux-2.6.11-rc1-ck2.orig/include/linux/sysctl.h	2005-01-19 10:34:32.426624424 +1100
+++ linux-2.6.11-rc1-ck2/include/linux/sysctl.h	2005-01-19 10:38:51.790195168 +1100
@@ -137,6 +137,8 @@
 	KERN_BOOTLOADER_TYPE=67, /* int: boot loader type */
 	KERN_INTERACTIVE=68,	/* interactive tasks can have cpu bursts */
 	KERN_COMPUTE=69,	/* adjust timeslices for a compute server */
+	KERN_ISO_CPU=70,	/* int: cpu% allowed by SCHED_ISO class */
+	KERN_ISO_PERIOD=71,	/* int: seconds over which SCHED_ISO cpu is decayed */
 };
 
 
Index: linux-2.6.11-rc1-ck2/kernel/sched.c
===================================================================
--- linux-2.6.11-rc1-ck2.orig/kernel/sched.c	2005-01-19 10:34:32.484615608 +1100
+++ linux-2.6.11-rc1-ck2/kernel/sched.c	2005-01-19 13:40:59.782887760 +1100
@@ -84,10 +84,12 @@
 #define _RR_INTERVAL		((10 * HZ / 1000) ? : 1)
 #define RR_INTERVAL()		(_RR_INTERVAL * (1 + 9 * sched_compute))
 
+int iso_cpu = 70;	/* The soft %cpu limit on SCHED_ISO tasks */
+int iso_period = 5;	/* Seconds over which SCHED_ISO cpu usage decays */
+
 #define task_hot(p, now, sd) ((long long) ((now) - (p)->timestamp)	\
 				< (long long) (sd)->cache_hot_time)
 
-
 /*
  * These are the runqueue data structures:
  */
@@ -112,6 +114,16 @@
 #ifdef CONFIG_SMP
 	unsigned long cpu_load;
 #endif
+	unsigned long iso_ticks;
+	struct list_head iso_queue;
+	int iso_refractory;
+	/*
+	 * iso_ticks is the decaying SCHED_ISO cpu usage counter and iso_queue
+	 * links the queued SCHED_ISO tasks. iso_refractory is set once usage
+	 * exceeds iso_cpu; while set, SCHED_ISO tasks only run as SCHED_NORMAL
+	 * until their usage drops below 90% of the iso_cpu limit.
+	 */
+
 	unsigned long long nr_switches;
 
 	/*
@@ -490,23 +502,64 @@
 	return !list_empty(&task->run_list);
 }
 
+/*
+ * Return non-zero if at least one SCHED_ISO task is linked on this
+ * runqueue's iso_queue.
+ */
+static inline int iso_queued(runqueue_t *rq)
+{
+	return !list_empty(&rq->iso_queue);
+}
+
+/*
+ * Unlink p from the iso_queue; list_del_init() leaves iso_list
+ * self-linked, so a repeated call is harmless.
+ */
+static inline void dequeue_iso_task(struct task_struct *p)
+{
+	list_del_init(&p->iso_list);
+}
+
 /*
  * Adding/removing a task to/from a runqueue:
  */
 static void dequeue_task(struct task_struct *p, runqueue_t *rq)
 {
+	if (iso_task(p))
+		dequeue_iso_task(p);
 	list_del_init(&p->run_list);
 	if (list_empty(rq->queue + p->prio))
 		__clear_bit(p->prio, rq->bitmap);
 	p->ns_debit = 0;
 }
 
+/*
+ * SCHED_ISO tasks are linked on two lists: the priority queue for their
+ * prio (via run_list) and the runqueue's iso_queue (via iso_list). Their
+ * actual priority is either better than SCHED_NORMAL if below starvation
+ * limits, or the underlying SCHED_NORMAL dynamic priority.
+ */
+static void enqueue_iso_task(struct task_struct *p)
+{
+	runqueue_t *rq = task_rq(p);
+	list_add_tail(&p->iso_list, &rq->iso_queue);
+}
+
 static void enqueue_task(struct task_struct *p, runqueue_t *rq)
 {
 	list_add_tail(&p->run_list, rq->queue + p->prio);
 	__set_bit(p->prio, rq->bitmap);
+	if (iso_task(p))
+		enqueue_iso_task(p);
 }
 
+/* Move p to the tail of its runqueue's iso_queue. */
+static void requeue_iso_task(struct task_struct *p)
+{
+	runqueue_t *rq = task_rq(p);
+	list_move_tail(&p->iso_list, &rq->iso_queue);
+}
+
 /*
  * Put task to the end of the run list without the overhead of dequeue
  * followed by enqueue.
@@ -514,6 +557,8 @@
 static void requeue_task(struct task_struct *p, runqueue_t *rq)
 {
 	list_move_tail(&p->run_list, rq->queue + p->prio);
+	if (iso_task(p))
+		requeue_iso_task(p);
 }
 
 static void enqueue_task_head(struct task_struct *p, runqueue_t *rq)
@@ -947,16 +992,37 @@
  */
 static void preempt(task_t *p, runqueue_t *rq)
 {
-	if (p->prio > rq->curr->prio)
-		return;
-	if (p->prio == rq->curr->prio &&
-		((p->totalrun || p->slice != slice(p)) ||
-		rt_task(rq->curr)))
-			return;
+	/*
+	 * Plain priority comparison applies when no SCHED_ISO task is
+	 * involved, while ISO is in its refractory period, or when either
+	 * task is realtime.
+	 */
+	if ((!iso_task(p) && !iso_task(rq->curr)) || rq->iso_refractory ||
+		rt_task(p) || rt_task(rq->curr)) {
+			if (p->prio < rq->curr->prio)
+				goto resched_out;
+			if (p->prio == rq->curr->prio && (!rt_task(rq->curr) &&
+				!p->totalrun && p->slice == slice(p)))
+					goto resched_out;
+			goto out;
+	}
+	/*
+	 * ISO scheduling is active: only a waking SCHED_ISO task preempts,
+	 * and only when the current task is not SCHED_ISO itself.
+	 */
+	if (iso_task(p) && !iso_task(rq->curr))
+		goto resched_out;
+	goto out;
+
+resched_out:
 	if (!sched_compute || rq->cache_ticks >= cache_delay ||
-		!p->mm || rt_task(p))
+		!p->mm || rt_task(p) || iso_task(p)) {
 			resched_task(rq->curr);
+			return;
+	}
 	rq->preempted = 1;
+out:
+	return;
 }
 
 /***
@@ -1136,6 +1192,7 @@
 	 */
 	p->state = TASK_RUNNING;
 	INIT_LIST_HEAD(&p->run_list);
+	INIT_LIST_HEAD(&p->iso_list);
 	spin_lock_init(&p->switch_lock);
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -2266,6 +2323,56 @@
 	enqueue_task(p, rq);
 }
 
+/*
+ * Charge one tick of SCHED_ISO cpu time to rq. iso_ticks grows by 100 per
+ * tick so that iso_ticks / (iso_period * HZ + 1) compares against iso_cpu.
+ * Returns 1 if this tick pushed usage above iso_cpu (starting the
+ * refractory period) while ISO tasks are queued, 0 otherwise.
+ */
+static inline int inc_iso_ticks(runqueue_t *rq, task_t *p)
+{
+	int ret = 0;
+	if (rq->iso_ticks < (iso_period * HZ * 100 - 99))
+		rq->iso_ticks += 100;
+	spin_lock(&rq->lock);
+	if (!rq->iso_refractory && (rq->iso_ticks /
+		((iso_period * HZ) + 1) > iso_cpu)) {
+			rq->iso_refractory = 1;
+			if (iso_queued(rq))
+				ret = 1;
+	}
+	spin_unlock(&rq->lock);
+	return ret;
+}
+
+/*
+ * Decay the SCHED_ISO usage counter by one tick's share of iso_period.
+ * Returns 1 if usage fell below 90% of iso_cpu (ending the refractory
+ * period) while ISO tasks are queued, 0 otherwise.
+ */
+static inline int dec_iso_ticks(runqueue_t *rq, task_t *p)
+{
+	int ret = 0;
+	/*
+	 * iso_period is writable through sysctl without a lower bound, so
+	 * never divide by zero: a non-positive period decays immediately.
+	 */
+	int period_ticks = iso_period > 0 ? iso_period * HZ : 1;
+
+	if (rq->iso_ticks)
+		rq->iso_ticks = rq->iso_ticks * (period_ticks - 1) /
+			period_ticks;
+	spin_lock(&rq->lock);
+	if (rq->iso_refractory && (rq->iso_ticks /
+		((iso_period * HZ) + 1) < (iso_cpu * 9 / 10))) {
+			rq->iso_refractory = 0;
+			if (iso_queued(rq))
+				ret = 1;
+	}
+	spin_unlock(&rq->lock);
+	return ret;
+}
+
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
@@ -2276,8 +2366,14 @@
 	runqueue_t *rq = this_rq();
 	task_t *p = current;
 	unsigned long debit;
+	int iso_change;
 
 	rq->timestamp_last_tick = sched_clock();
+	if (iso_task(p) && !rq->iso_refractory)
+		iso_change = inc_iso_ticks(rq, p);
+	else 
+		iso_change = dec_iso_ticks(rq, p);
+ 
 
 	if (p == rq->idle) {
 		if (wake_priority_sleeper(rq))
@@ -2303,6 +2399,34 @@
 	if (p->ns_debit < NSJIFFY)
 		goto out_unlock;
 	p->ns_debit %= NSJIFFY;
+	if (unlikely(iso_change)) {
+		/*
+		 * A SCHED_ISO task was running and the soft cpu limit
+		 * was hit, or SCHED_ISO task(s) are waiting and the 
+		 * refractory period has ended. Reschedule to start ISO
+		 * tasks as SCHED_NORMAL in the former case and to allow
+		 * SCHED_ISO tasks to preempt in the latter.
+		 */
+		time_slice_expired(p, rq);
+		goto out_unlock;
+	}
+
+	if (iso_task(p) && !rq->iso_refractory) {
+		if (!--p->slice) {
+			p->slice = slice(p);
+			p->time_slice = rr_interval(p);
+			set_tsk_need_resched(p);
+
+			/* put it at the end of the queue: */
+			requeue_task(p, rq);
+		} else if (!--p->time_slice) {
+			p->time_slice = rr_interval(p);
+			requeue_task(p, rq);
+			set_tsk_need_resched(p);
+		}
+		goto out_unlock;
+	}
+
 	/*
 	 * Tasks lose burst each time they use up a full slice().
 	 */
@@ -2330,6 +2454,35 @@
 	rebalance_tick(cpu, rq, NOT_IDLE);
 }
 
+/*
+ * Non-zero when SCHED_ISO tasks are queued on rq and the refractory
+ * period is not in force.
+ */
+static inline int iso_ready(runqueue_t *rq)
+{
+	return iso_queued(rq) && !rq->iso_refractory;
+}
+
+/*
+ * Take the task at the head of the iso_queue, re-queue it at the tail of
+ * the MAX_RT_PRIO priority list so that userspace sees MAX_RT_PRIO as its
+ * effective prio, and return it.
+ */
+static task_t *queue_iso(runqueue_t *rq)
+{
+	task_t *iso = list_entry(rq->iso_queue.next, task_t, iso_list);
+	struct list_head *old_queue = rq->queue + iso->prio;
+
+	list_del(&iso->run_list);
+	if (list_empty(old_queue))
+		__clear_bit(iso->prio, rq->bitmap);
+
+	iso->prio = MAX_RT_PRIO;
+	list_add_tail(&iso->run_list, rq->queue + MAX_RT_PRIO);
+	__set_bit(MAX_RT_PRIO, rq->bitmap);
+	return iso;
+}
+
 #ifdef CONFIG_SCHED_SMT
 static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 {
@@ -2380,7 +2527,7 @@
 {
 	struct sched_domain *sd = this_rq->sd;
 	cpumask_t sibling_map;
-	int ret = 0, i;
+	int ret = 0, i, idx;
 	task_t *p;
 
 	if (!(sd->flags & SD_SHARE_CPUPOWER))
@@ -2405,6 +2552,10 @@
 
-	p = list_entry(this_rq->queue[sched_find_first_bit(this_rq->bitmap)].next,
-		task_t, run_list);
+	/* An eligible SCHED_ISO task is picked ahead of any non-RT task. */
+	idx = sched_find_first_bit(this_rq->bitmap);
+	if (unlikely(iso_ready(this_rq) && idx >= MAX_RT_PRIO))
+		p = queue_iso(this_rq);
+	else
+		p = list_entry(this_rq->queue[idx].next, task_t, run_list);
 
 	for_each_cpu_mask(i, sibling_map) {
 		runqueue_t *smt_rq = cpu_rq(i);
@@ -2420,7 +2572,7 @@
 		 */
 		if (((smt_curr->slice * (100 - sd->per_cpu_gain) / 100) >
 			slice(p) || rt_task(smt_curr) || batch_task(p)) &&
-			p->mm && smt_curr->mm && !rt_task(p) &&
+			p->mm && smt_curr->mm && !rt_task(p) && !iso_task(p) &&
 			!batch_task(smt_curr))
 				ret = 1;
 
@@ -2432,7 +2584,7 @@
 		if ((((p->slice * (100 - sd->per_cpu_gain) / 100) > 
 			slice(smt_curr) || rt_task(p) || batch_task(smt_curr)) && 
 			smt_curr->mm && p->mm && !rt_task(smt_curr) &&
-			!batch_task(p)) ||
+			!iso_task(smt_curr) && !batch_task(p)) ||
 			(smt_curr == smt_rq->idle && smt_rq->nr_running))
 				resched_task(smt_curr);
 	}
@@ -2584,8 +2736,12 @@
 	}
 
 	idx = sched_find_first_bit(rq->bitmap);
-	queue = rq->queue + idx;
-	next = list_entry(queue->next, task_t, run_list);
+	if (unlikely(iso_ready(rq) && idx >= MAX_RT_PRIO))
+		next = queue_iso(rq);
+	else {
+		queue = rq->queue + idx;
+		next = list_entry(queue->next, task_t, run_list);
+	}
 
 switch_tasks:
 	if (next == rq->idle)
@@ -3064,8 +3220,13 @@
 	if ((!SCHED_RT(policy)) != (param->sched_priority == 0))
 		return -EINVAL;
 
-	if ((SCHED_RT(policy)) && !capable(CAP_SYS_NICE))
-		return -EPERM;
+	if (SCHED_RT(policy) && !capable(CAP_SYS_NICE))
+		/*
+		 * If the caller requested an RT policy without having the
+		 * necessary rights, we downgrade the policy to SCHED_ISO.
+		 * Temporary hack for testing.
+		 */
+		policy = SCHED_ISO;
 	if ((current->euid != p->euid) && (current->euid != p->uid) &&
 	    !capable(CAP_SYS_NICE))
 		return -EPERM;
@@ -3517,6 +3678,7 @@
 		ret = MAX_USER_RT_PRIO-1;
 		break;
 	case SCHED_NORMAL:
+	case SCHED_ISO:
 	case SCHED_BATCH:
 		ret = 0;
 		break;
@@ -3542,6 +3704,7 @@
 		break;
 	case SCHED_NORMAL:
 	case SCHED_BATCH:
+	case SCHED_ISO:
 		ret = 0;
 	}
 	return ret;
@@ -4592,6 +4755,7 @@
 		spin_lock_init(&rq->lock);
 		rq->cache_ticks = 0;
 		rq->preempted = 0;
+		rq->iso_refractory = rq->iso_ticks = 0;
 
 #ifdef CONFIG_SMP
 		rq->sd = &sched_domain_dummy;
@@ -4610,6 +4774,7 @@
 		 * delimiter for bitsearch
 		 */
 		__set_bit(MAX_PRIO, rq->bitmap);
+		INIT_LIST_HEAD(&rq->iso_queue);
 	}
 
 	/*
@@ -4659,7 +4824,7 @@
 
 	read_lock_irq(&tasklist_lock);
 	for_each_process (p) {
-		if (!rt_task(p))
+		if (!rt_task(p) && !iso_task(p))
 			continue;
 
 		rq = task_rq_lock(p, &flags);
Index: linux-2.6.11-rc1-ck2/kernel/sysctl.c
===================================================================
--- linux-2.6.11-rc1-ck2.orig/kernel/sysctl.c	2005-01-19 10:34:32.425624576 +1100
+++ linux-2.6.11-rc1-ck2/kernel/sysctl.c	2005-01-19 10:40:20.790665040 +1100
@@ -219,6 +219,11 @@
 	{ .ctl_name = 0 }
 };
 
+/* Constants for minimum and maximum testing in kern_table and vm_table.
+   We use these as one-element integer vectors. */
+static int zero;
+static int one_hundred = 100;
+
 static ctl_table kern_table[] = {
 	{
 		.ctl_name	= KERN_OSTYPE,
@@ -649,15 +654,28 @@
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
+	{
+		.ctl_name	= KERN_ISO_CPU,
+		.procname	= "iso_cpu",
+		.data		= &iso_cpu,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
+	{
+		.ctl_name	= KERN_ISO_PERIOD,
+		.procname	= "iso_period",
+		.data		= &iso_period,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{ .ctl_name = 0 }
 };
 
-/* Constants for minimum and maximum testing in vm_table.
-   We use these as one-element integer vectors. */
-static int zero;
-static int one_hundred = 100;
-
-
 static ctl_table vm_table[] = {
 	{
 		.ctl_name	= VM_OVERCOMMIT_MEMORY,

