Add the SCHED_ISO (isochronous) policy, a starvation-free soft realtime
policy available to unprivileged users. The percentage of cpu that
SCHED_ISO tasks may use while running as realtime is set by the tunable

/proc/sys/kernel/iso_cpu

and is set to 80% by default.

The duration over which its cpu usage is averaged is controlled by the
tunable

/proc/sys/kernel/iso_period

and is set to 5 (seconds) by default.

Signed-off-by: Con Kolivas <kernel@kolivas.org>

 Documentation/sysctl/kernel.txt |   21 ++++++
 include/linux/sched.h           |    8 +-
 kernel/sched.c                  |  132 +++++++++++++++++++++++++++++++++++-----
 kernel/sysctl.c                 |   24 +++++++
 4 files changed, 169 insertions(+), 16 deletions(-)

Index: linux-2.6.22-rc4-ck1/include/linux/sched.h
===================================================================
--- linux-2.6.22-rc4-ck1.orig/include/linux/sched.h	2007-06-10 21:59:53.000000000 +1000
+++ linux-2.6.22-rc4-ck1/include/linux/sched.h	2007-06-10 21:59:53.000000000 +1000
@@ -34,10 +34,11 @@
 #define SCHED_FIFO		1
 #define SCHED_RR		2
 #define SCHED_BATCH		3
+#define SCHED_ISO		4
 
 #ifdef __KERNEL__
 
-#define SCHED_MAX		SCHED_BATCH
+#define SCHED_MAX		SCHED_ISO
 #define SCHED_RANGE(policy)	((policy) <= SCHED_MAX)
 
 struct sched_param {
@@ -540,15 +541,17 @@ struct signal_struct {
 #define MAX_USER_RT_PRIO	100
 #define MAX_RT_PRIO		MAX_USER_RT_PRIO
 #define PRIO_RANGE		(40)
+#define ISO_PRIO		(MAX_RT_PRIO - 1)
 
 #define MAX_PRIO		(MAX_RT_PRIO + PRIO_RANGE)
 
-#define rt_prio(prio)		unlikely((prio) < MAX_RT_PRIO)
+#define rt_prio(prio)		unlikely((prio) < ISO_PRIO)
 #define rt_task(p)		rt_prio((p)->prio)
 #define batch_task(p)		(unlikely((p)->policy == SCHED_BATCH))
 #define is_rt_policy(policy)	((policy) == SCHED_FIFO || \
 					(policy) == SCHED_RR)
 #define has_rt_policy(p)	unlikely(is_rt_policy((p)->policy))
+#define iso_task(p)		unlikely((p)->policy == SCHED_ISO)
 
 /*
  * Some day this will be a full-fledged user tracking system..
@@ -1190,6 +1193,7 @@ static inline void put_task_struct(struc
 #define PF_SWAPWRITE	0x00800000	/* Allowed to write to swap */
 #define PF_SPREAD_PAGE	0x01000000	/* Spread page cache over cpuset */
 #define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset */
+#define PF_ISOREF	0x04000000	/* SCHED_ISO task has used up quota */
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
 #define PF_FREEZER_SKIP	0x40000000	/* Freezer should not count it as freezeable */
Index: linux-2.6.22-rc4-ck1/kernel/sched.c
===================================================================
--- linux-2.6.22-rc4-ck1.orig/kernel/sched.c	2007-06-10 21:59:53.000000000 +1000
+++ linux-2.6.22-rc4-ck1/kernel/sched.c	2007-06-10 21:59:53.000000000 +1000
@@ -105,6 +105,18 @@ int rr_interval __read_mostly = 10;
 int sched_interactive __read_mostly = 1;
 
 /*
+ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks
+ * may run as real time tasks, averaged over sched_iso_period seconds.
+ * sched_iso_period - sysctl which determines the number of seconds over
+ * which cpu usage of SCHED_ISO tasks is averaged to determine if they are
+ * exceeding their allowable bandwidth.
+ */
+int sched_iso_cpu __read_mostly = 80;
+int sched_iso_period __read_mostly = 5;
+
+#define ISO_PERIOD	((sched_iso_period * HZ) + 1)
+
+/*
  * This contains a bitmap for each dynamic priority level with empty slots
  * for the valid priorities each different nice level can have. It allows
  * us to stagger the slots where differing priorities run in a way that
@@ -205,6 +217,8 @@ struct rq {
 
 	/* How many times we have rotated the priority queue */
 	unsigned long prio_rotation;
+	unsigned long iso_ticks;
+	unsigned short iso_refractory;
 
 	atomic_t nr_iowait;
 
@@ -796,6 +810,11 @@ static inline void update_if_moved(struc
 }
 #endif
 
+static inline int isoprio_suitable(struct task_struct *p)
+{
+	return !(p->flags & PF_ISOREF);
+}
+
 /*
  * recalc_task_prio determines what priority a non rt_task will be
  * queued at. If the task has already been running during this runqueue's
@@ -812,6 +831,25 @@ static void recalc_task_prio(struct task
 	struct prio_array *array = rq->active;
 	int queue_prio;
 
+	if (iso_task(p)) {
+		if (isoprio_suitable(p)) {
+			/*
+			 * If SCHED_ISO tasks have not used up their real time
+			 * quota they run at ISO_PRIO, just better than the
+			 * highest SCHED_NORMAL priority. Otherwise they run
+			 * as SCHED_NORMAL.
+			 */
+			p->prio = p->normal_prio = ISO_PRIO;
+			p->array = rq->active;
+			if (p->time_slice <= 0)
+				p->time_slice = p->quota;
+			return;
+		} else if (p->prio == ISO_PRIO) {
+			/* Just about to be demoted to SCHED_NORMAL */
+			p->time_slice = 0;
+		}
+	}
+
 	update_if_moved(p, rq);
 	if (p->rotation == rq->prio_rotation) {
 		if (p->array == array) {
@@ -3407,18 +3445,65 @@ static void task_expired_entitlement(str
 	p->time_slice += overrun;
 }
 
+/*
+ * Test if SCHED_ISO tasks have run longer than their allotted period as RT
+ * tasks and set the refractory flag if necessary. There is 10% hysteresis
+ * for unsetting the flag.
+ */
+static unsigned int test_ret_isorefractory(struct rq *rq)
+{
+	if (likely(!rq->iso_refractory)) {
+		if (rq->iso_ticks / ISO_PERIOD > sched_iso_cpu)
+			rq->iso_refractory = 1;
+	} else {
+		if (rq->iso_ticks / ISO_PERIOD < (sched_iso_cpu * 90 / 100))
+			rq->iso_refractory = 0;
+	}
+	return rq->iso_refractory;
+}
+
+/* No SCHED_ISO task was running so decrease rq->iso_ticks */
+static inline void no_iso_tick(struct rq *rq)
+{
+	rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD;
+}
+
 /* This manages tasks that have run out of timeslice during a scheduler_tick */
 static void task_running_tick(struct rq *rq, struct task_struct *p)
 {
+	/*
+	 * If a SCHED_ISO task is running we increment the iso_ticks. In
+	 * order to prevent SCHED_ISO tasks from causing starvation in the
+	 * presence of true RT tasks we account those as iso_ticks as well.
+	 */
+	if ((rt_task(p) || (iso_task(p) && !rq->iso_refractory))) {
+		if (rq->iso_ticks <= (ISO_PERIOD * 100) - 100)
+			rq->iso_ticks += 100;
+	} else
+		no_iso_tick(rq);
+
+	if (iso_task(p)) {
+		if (unlikely(test_ret_isorefractory(rq))) {
+			if (isoprio_suitable(p)) {
+				/*
+				 * SCHED_ISO task is running as RT and limit
+				 * has been hit. Set the PF_ISOREF flag and
+				 * force it to reschedule as SCHED_NORMAL
+				 * by zeroing its time_slice
+				 */
+				p->flags |= PF_ISOREF;
+				p->time_slice = 0;
+			}
+		} else
+			p->flags &= ~PF_ISOREF;
+	}
 	/* SCHED_FIFO tasks never run out of timeslice. */
 	if (p->time_slice > 0 || p->policy == SCHED_FIFO)
 		return;
 	/* p->time_slice <= 0 */
-	spin_lock(&rq->lock);
+	set_tsk_need_resched(p);
 	if (likely(task_queued(p)))
 		task_expired_entitlement(rq, p);
-	set_tsk_need_resched(p);
-	spin_unlock(&rq->lock);
 }
 
 /*
@@ -3435,8 +3520,12 @@ void scheduler_tick(void)
 
 	update_cpu_clock(p, rq, now, 1);
 
+	spin_lock(&rq->lock);
 	if (!idle_at_tick)
 		task_running_tick(rq, p);
+	else
+		no_iso_tick(rq);
+	spin_unlock(&rq->lock);
 #ifdef CONFIG_SMP
 	update_load(rq);
 	rq->idle_at_tick = idle_at_tick;
@@ -3513,7 +3602,8 @@ retry:
 	}
 	queue = array->queue + idx;
 	next = list_entry(queue->next, struct task_struct, run_list);
-	if (unlikely(next->time_slice <= 0)) {
+	if (unlikely(next->time_slice <= 0 && !(iso_task(next) &&
+	    isoprio_suitable(next)))) {
 		/*
 		 * Unlucky enough that this task ran out of time_slice
 		 * before it hit a scheduler_tick so it should have its
@@ -3605,7 +3695,7 @@ need_resched_nonpreemptible:
 	}
 
 	idx = sched_find_first_bit(rq->dyn_bitmap);
-	if (!rt_prio(idx))
+	if (likely(idx > ISO_PRIO))
 		next = next_dynamic_task(rq, idx);
 	else {
 		queue = rq->active->queue + idx;
@@ -4282,12 +4372,31 @@ static void __setscheduler(struct task_s
 int sched_setscheduler(struct task_struct *p, int policy,
 		       struct sched_param *param)
 {
+	struct sched_param zero_param = { .sched_priority = 0 };
 	int queued, retval, oldprio, oldpolicy = -1;
+	unsigned long rlim_rtprio = 0;
 	unsigned long flags;
 	struct rq *rq;
 
 	/* may grab non-irq protected spin_locks */
 	BUG_ON(in_interrupt());
+	if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) {
+		unsigned long lflags;
+
+		if (!lock_task_sighand(p, &lflags))
+			return -ESRCH;
+		rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
+		unlock_task_sighand(p, &lflags);
+		if (rlim_rtprio)
+			goto recheck;
+		/*
+		 * If the caller requested an RT policy without having the
+		 * necessary rights, we downgrade the policy to SCHED_ISO.
+		 * We also set the parameter to zero to pass the checks.
+		 */
+		policy = SCHED_ISO;
+		param = &zero_param;
+	}
 recheck:
 	/* double check policy once rq lock held */
 	if (policy < 0)
@@ -4311,14 +4420,6 @@ recheck:
 	 */
 	if (!capable(CAP_SYS_NICE)) {
 		if (is_rt_policy(policy)) {
-			unsigned long rlim_rtprio;
-			unsigned long flags;
-
-			if (!lock_task_sighand(p, &flags))
-				return -ESRCH;
-			rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
-			unlock_task_sighand(p, &flags);
-
 			/* can't set/change the rt policy */
 			if (policy != p->policy && !rlim_rtprio)
 				return -EPERM;
@@ -4815,6 +4916,7 @@ asmlinkage long sys_sched_get_priority_m
 		break;
 	case SCHED_NORMAL:
 	case SCHED_BATCH:
+	case SCHED_ISO:
 		ret = 0;
 		break;
 	}
@@ -4839,6 +4941,7 @@ asmlinkage long sys_sched_get_priority_m
 		break;
 	case SCHED_NORMAL:
 	case SCHED_BATCH:
+	case SCHED_ISO:
 		ret = 0;
 	}
 	return ret;
@@ -6971,6 +7074,7 @@ void __init sched_init(void)
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
 		lockdep_set_class(&rq->lock, &rq->rq_lock_key);
+		rq->iso_ticks = 0;
 		rq->nr_running = 0;
 		rq->prio_rotation = 0;
 		rq->active = rq->arrays;
@@ -7065,7 +7169,7 @@ void normalize_rt_tasks(void)
 
 	read_lock_irq(&tasklist_lock);
 	for_each_process(p) {
-		if (!rt_task(p))
+		if (!rt_task(p) && !iso_task(p))
 			continue;
 
 		spin_lock_irqsave(&p->pi_lock, flags);
Index: linux-2.6.22-rc4-ck1/Documentation/sysctl/kernel.txt
===================================================================
--- linux-2.6.22-rc4-ck1.orig/Documentation/sysctl/kernel.txt	2007-06-10 21:59:53.000000000 +1000
+++ linux-2.6.22-rc4-ck1/Documentation/sysctl/kernel.txt	2007-06-10 21:59:53.000000000 +1000
@@ -26,6 +26,8 @@ show up in /proc/sys/kernel:
 - hostname
 - hotplug
 - interactive
+- iso_cpu
+- iso_period
 - java-appletviewer           [ binfmt_java, obsolete ]
 - java-interpreter            [ binfmt_java, obsolete ]
 - kstack_depth_to_print       [ X86 only ]
@@ -181,6 +183,25 @@ Default value is 1 (enabled).
 
 ==============================================================
 
+iso_cpu:
+
+This sets the percentage of cpu time that unprivileged SCHED_ISO tasks
+can run effectively at realtime priority, averaged over a rolling period
+of iso_period seconds.
+
+Set to 80 (percent) by default.
+
+==============================================================
+
+iso_period:
+
+This sets the number of seconds over which SCHED_ISO cpu usage is averaged
+to see if it exceeds its allocated cpu bandwidth.
+
+Set to 5 (seconds) by default.
+
+==============================================================
+
 l2cr: (PPC only)
 
 This flag controls the L2 cache of G3 processor boards. If
Index: linux-2.6.22-rc4-ck1/kernel/sysctl.c
===================================================================
--- linux-2.6.22-rc4-ck1.orig/kernel/sysctl.c	2007-06-10 21:59:53.000000000 +1000
+++ linux-2.6.22-rc4-ck1/kernel/sysctl.c	2007-06-10 21:59:53.000000000 +1000
@@ -80,6 +80,8 @@ extern int maps_protect;
 extern int sysctl_stat_interval;
 extern int rr_interval;
 extern int sched_interactive;
+extern int sched_iso_cpu;
+extern int sched_iso_period;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
@@ -530,6 +532,28 @@ static ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "iso_cpu",
+		.data		= &sched_iso_cpu,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "iso_period",
+		.data		= &sched_iso_period,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &one,
+		.extra2		= &one_hundred,
+	},
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 	{
 		.ctl_name       = KERN_UNKNOWN_NMI_PANIC,
