Index: linux-2.6.15-rc1-ck1/include/linux/sched.h =================================================================== --- linux-2.6.15-rc1-ck1.orig/include/linux/sched.h 2005-11-12 20:53:15.000000000 +1100 +++ linux-2.6.15-rc1-ck1/include/linux/sched.h 2005-11-12 20:54:41.000000000 +1100 @@ -159,9 +159,10 @@ extern unsigned long nr_iowait(void); #define SCHED_FIFO 1 #define SCHED_RR 2 #define SCHED_BATCH 3 +#define SCHED_ISO 4 #define SCHED_MIN 0 -#define SCHED_MAX 3 +#define SCHED_MAX 4 #define SCHED_RANGE(policy) ((policy) >= SCHED_MIN && \ (policy) <= SCHED_MAX) @@ -205,7 +206,7 @@ extern void show_stack(struct task_struc void io_schedule(void); long io_schedule_timeout(long timeout); -extern int sched_interactive, sched_compute; +extern int sched_interactive, sched_compute, sched_iso_cpu; extern void cpu_init (void); extern void trap_init(void); @@ -502,6 +503,7 @@ struct signal_struct { #define rt_task(p) (unlikely((p)->prio < MAX_RT_PRIO)) #define batch_task(p) ((p)->policy == SCHED_BATCH) +#define iso_task(p) ((p)->policy == SCHED_ISO) /* * Some day this will be a full-fledged user tracking system.. 
@@ -921,6 +923,7 @@ do { if (atomic_dec_and_test(&(tsk)->usa #define PF_HOTPLUG_CPU 0x01000000 /* Currently performing CPU hotplug */ #define PF_NONSLEEP 0x02000000 /* Waiting on in kernel activity */ #define PF_YIELDED 0x04000000 /* I have just yielded */ +#define PF_ISOREF 0x08000000 /* SCHED_ISO task has used up quota */ /* * Only the _current_ task can read/write to tsk->flags, but other Index: linux-2.6.15-rc1-ck1/include/linux/sysctl.h =================================================================== --- linux-2.6.15-rc1-ck1.orig/include/linux/sysctl.h 2005-11-12 15:45:42.000000000 +1100 +++ linux-2.6.15-rc1-ck1/include/linux/sysctl.h 2005-11-12 20:54:24.000000000 +1100 @@ -149,6 +149,7 @@ enum KERN_SPIN_RETRY=70, /* int: number of spinlock retries */ KERN_INTERACTIVE=71, /* interactive tasks can have cpu bursts */ KERN_COMPUTE=72, /* adjust timeslices for a compute server */ + KERN_ISO_CPU=73, /* int: percent cpu quota for SCHED_ISO tasks */ }; Index: linux-2.6.15-rc1-ck1/kernel/sched.c =================================================================== --- linux-2.6.15-rc1-ck1.orig/kernel/sched.c 2005-11-12 20:53:50.000000000 +1100 +++ linux-2.6.15-rc1-ck1/kernel/sched.c 2005-11-12 20:54:24.000000000 +1100 @@ -92,6 +92,9 @@ int sched_compute = 0; #define RR_INTERVAL() (_RR_INTERVAL * (1 + 19 * sched_compute)) #define DEF_TIMESLICE (RR_INTERVAL() * 19) +int sched_iso_cpu = 70; +#define ISO_PERIOD (3 * HZ) + #define task_hot(p, now, sd) ((long long) ((now) - (p)->timestamp) \ < (long long) (sd)->cache_hot_time) @@ -116,6 +119,8 @@ struct runqueue { * remote CPUs use both these fields when doing load calculation. 
*/ unsigned long nr_running; + unsigned long iso_ticks; + unsigned int iso_refractory; #ifdef CONFIG_SMP unsigned long prio_bias; unsigned long cpu_load[3]; @@ -721,6 +726,15 @@ static int effective_prio(task_t *p) } return MAX_PRIO - 1; } + if (iso_task(p)) { + if (likely(!(p->flags & PF_ISOREF))) + /* + * If SCHED_ISO tasks have not used up their real time + * quota they have highest SCHED_NORMAL priority. + * Otherwise they run as SCHED_NORMAL. + */ + return MAX_RT_PRIO; + } full_slice = slice(p); if (full_slice > p->slice) @@ -2525,6 +2539,22 @@ static void time_slice_expired(task_t *p } /* + * Test if SCHED_ISO tasks have run longer than their allotted period as RT + * tasks and set the refractory flag if necessary. The flag is only cleared + * once usage falls below 90% of sched_iso_cpu (10% hysteresis). + */ +static inline unsigned int test_ret_isorefractory(runqueue_t *rq) +{ + if (likely(!rq->iso_refractory)) { + if (rq->iso_ticks / ISO_PERIOD > sched_iso_cpu) + rq->iso_refractory = 1; + } else + if (rq->iso_ticks / ISO_PERIOD < (sched_iso_cpu * 90 / 100)) + rq->iso_refractory = 0; + return rq->iso_refractory; +} + +/* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. */ @@ -2552,15 +2582,34 @@ void scheduler_tick(void) set_tsk_need_resched(p); goto out; } + + spin_lock(&rq->lock); + if (unlikely((rt_task(p) || (iso_task(p) && !rq->iso_refractory)) && + p->mm)) { + rq->iso_ticks += 100; + if (rq->iso_ticks > ISO_PERIOD * 100) + rq->iso_ticks = ISO_PERIOD * 100; + } else + rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; + + if (unlikely(test_ret_isorefractory(rq))) { + if (iso_task(p)) { + if (!(p->flags & PF_ISOREF)) { + set_tsk_need_resched(p); + p->flags |= PF_ISOREF; + } + } + } else if (iso_task(p)) + p->flags &= ~PF_ISOREF; + /* * SCHED_FIFO tasks never run out of timeslice. 
*/ if (unlikely(p->policy == SCHED_FIFO)) { expired_balance = 0; - goto out; + goto out_unlock; } - spin_lock(&rq->lock); debit = ns_diff(rq->timestamp_last_tick, p->timestamp); p->ns_debit += debit; if (p->ns_debit < NSJIFFY) @@ -2698,7 +2747,7 @@ static inline int dependent_sleeper(int task_t *smt_curr = smt_rq->curr; /* Kernel threads do not participate in dependent sleeping */ - if (!p->mm || !smt_curr->mm || rt_task(p)) + if (!p->mm || !smt_curr->mm || rt_task(p) || iso_task(p)) goto check_smt_task; /* @@ -2736,7 +2785,7 @@ static inline int dependent_sleeper(int check_smt_task: if ((!smt_curr->mm && smt_curr != smt_rq->idle) || - rt_task(smt_curr)) + rt_task(smt_curr) || iso_task(smt_curr)) continue; if (!p->mm) { wakeup_busy_runqueue(smt_rq); @@ -3553,9 +3602,19 @@ int sched_setscheduler(struct task_struc { int retval; int queued, oldprio, oldpolicy = -1; + struct sched_param zero_param = { .sched_priority = 0 }; unsigned long flags; runqueue_t *rq; + if (SCHED_RT(policy) && !capable(CAP_SYS_NICE)) { + /* + * If the caller requested an RT policy without having the + * necessary rights, we downgrade the policy to SCHED_ISO. + * We also set the parameter to zero to pass the checks. 
+ */ + policy = SCHED_ISO; + param = &zero_param; + } recheck: /* double check policy once rq lock held */ if (policy < 0) @@ -4052,6 +4111,7 @@ asmlinkage long sys_sched_get_priority_m break; case SCHED_NORMAL: case SCHED_BATCH: + case SCHED_ISO: ret = 0; break; } @@ -4076,6 +4136,7 @@ asmlinkage long sys_sched_get_priority_m break; case SCHED_NORMAL: case SCHED_BATCH: + case SCHED_ISO: ret = 0; } return ret; @@ -5443,6 +5504,7 @@ void __init sched_init(void) rq->nr_running = 0; rq->cache_ticks = 0; rq->preempted = 0; + rq->iso_ticks = 0; #ifdef CONFIG_SMP rq->sd = NULL; Index: linux-2.6.15-rc1-ck1/kernel/sysctl.c =================================================================== --- linux-2.6.15-rc1-ck1.orig/kernel/sysctl.c 2005-11-12 15:45:42.000000000 +1100 +++ linux-2.6.15-rc1-ck1/kernel/sysctl.c 2005-11-12 20:54:24.000000000 +1100 @@ -224,6 +224,11 @@ static ctl_table root_table[] = { { .ctl_name = 0 } }; +/* Constants for minimum and maximum testing. + We use these as one-element integer vectors. */ +static int zero; +static int one_hundred = 100; + static ctl_table kern_table[] = { { .ctl_name = KERN_OSTYPE, @@ -634,6 +639,17 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = KERN_ISO_CPU, + .procname = "iso_cpu", + .data = &sched_iso_cpu, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { .ctl_name = KERN_UNKNOWN_NMI_PANIC, @@ -675,12 +691,6 @@ static ctl_table kern_table[] = { { .ctl_name = 0 } }; -/* Constants for minimum and maximum testing in vm_table. - We use these as one-element integer vectors. */ -static int zero; -static int one_hundred = 100; - - static ctl_table vm_table[] = { { .ctl_name = VM_OVERCOMMIT_MEMORY,