Implement the SCHED_ISO scheduling policy for the BFS cpu scheduler.

SCHED_ISO becomes policy 4 and gets its own priority slot, ISO_PRIO, placed
between the realtime priorities and NORMAL_PRIO.  A request for an RT policy
from a caller without CAP_SYS_NICE and with a zero RLIMIT_RTPRIO is
downgraded to SCHED_ISO instead of being rejected with -EPERM.

Cpu use is tracked in grq.iso_ticks: each scheduler tick spent running an RT
task, or a SCHED_ISO task while not refractory, adds 100; any other tick
decays the counter by 1/ISO_PERIOD.  Once iso_ticks / ISO_PERIOD exceeds the
new "iso_cpu" sysctl (0-100, default 70) grq.iso_refractory is set and
SCHED_ISO tasks are queued at NORMAL_PRIO until usage falls below 90% of the
limit.

Review changes relative to the submitted version:
no_iso_tick() now decays with "x -= (x + P - 1) / P".  This is arithmetically
identical to "x * (P - 1) / P" but cannot wrap a 32 bit unsigned long (with
HZ=1000 and 2 cpus the old product reaches 1000100 * 10000 > 2^32).
Comment and documentation wording was corrected; no hunk sizes changed.

NOTE(review): the line structure and whitespace of this patch were
reconstructed from a whitespace-collapsed copy.  Tabs in context lines and the
position of blank context lines follow kernel style and the hunk line counts,
but should be confirmed against the target tree (or apply with "patch -l").

Index: linux-2.6.30-bfs/Documentation/sysctl/kernel.txt
===================================================================
--- linux-2.6.30-bfs.orig/Documentation/sysctl/kernel.txt	2009-08-30 16:07:43.319391716 +1000
+++ linux-2.6.30-bfs/Documentation/sysctl/kernel.txt	2009-08-30 16:09:55.638392596 +1000
@@ -27,6 +27,7 @@
 - domainname
 - hostname
 - hotplug
+- iso_cpu
 - java-appletviewer           [ binfmt_java, obsolete ]
 - java-interpreter            [ binfmt_java, obsolete ]
 - kstack_depth_to_print       [ X86 only ]
@@ -171,6 +172,16 @@
 
 ==============================================================
 
+iso_cpu:
+
+This sets the percentage of cpu time that unprivileged SCHED_ISO tasks
+can run effectively at realtime priority, averaged over a rolling five
+seconds over the -whole- system, meaning all cpus.
+
+Set to 70 (percent) by default.
+
+==============================================================
+
 l2cr: (PPC only)
 
 This flag controls the L2 cache of G3 processor boards. If
Index: linux-2.6.30-bfs/include/linux/sched.h
===================================================================
--- linux-2.6.30-bfs.orig/include/linux/sched.h	2009-08-30 16:02:37.859394465 +1000
+++ linux-2.6.30-bfs/include/linux/sched.h	2009-08-30 16:26:06.827766900 +1000
@@ -36,9 +36,12 @@
 #define SCHED_FIFO		1
 #define SCHED_RR		2
 #define SCHED_BATCH		3
-/* SCHED_ISO: reserved but not implemented yet */
+#define SCHED_ISO		4
 #define SCHED_IDLEPRIO		5
 
+#define SCHED_MAX		(SCHED_IDLEPRIO)
+#define SCHED_RANGE(policy)	((policy) <= SCHED_MAX)
+
 #ifdef __KERNEL__
 
 struct sched_param {
@@ -1305,8 +1308,9 @@
 #define MAX_USER_RT_PRIO	100
 #define MAX_RT_PRIO		MAX_USER_RT_PRIO
 #define MAX_PRIO		(MAX_RT_PRIO + PRIO_RANGE)
-#define NORMAL_PRIO		MAX_RT_PRIO
-#define IDLE_PRIO		MAX_RT_PRIO + 1
+#define ISO_PRIO		(MAX_RT_PRIO)
+#define NORMAL_PRIO		(MAX_RT_PRIO + 1)
+#define IDLE_PRIO		(MAX_RT_PRIO + 2)
 #define PRIO_LIMIT		((IDLE_PRIO) + 1)
 #define DEFAULT_PRIO		(MAX_RT_PRIO + 20)
 
Index: linux-2.6.30-bfs/kernel/sched_bfs.c
===================================================================
--- linux-2.6.30-bfs.orig/kernel/sched_bfs.c	2009-08-30 16:02:37.922392080 +1000
+++ linux-2.6.30-bfs/kernel/sched_bfs.c	2009-08-30 18:22:08.848686129 +1000
@@ -77,12 +77,8 @@
 					(policy) == SCHED_RR)
 #define has_rt_policy(p)	unlikely(is_rt_policy((p)->policy))
 #define idleprio_task(p)	unlikely((p)->policy == SCHED_IDLEPRIO)
-
-#define SCHED_RANGE(policy)	((policy) == SCHED_NORMAL || \
-					(policy) == SCHED_BATCH || \
-					(policy) == SCHED_IDLEPRIO || \
-					(policy) == SCHED_FIFO || \
-					(policy) == SCHED_RR)
+#define iso_task(p)		unlikely((p)->policy == SCHED_ISO)
+#define ISO_PERIOD		((5 * HZ * num_online_cpus()) + 1)
 
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
@@ -142,6 +138,14 @@
  * Tunable via /proc interface.
  */
 int rr_interval __read_mostly = 6;
+
+/*
+ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks
+ * are allowed to run as real time tasks, averaged over five seconds. This is
+ * the total over all online cpus.
+ */
+int sched_iso_cpu __read_mostly = 70;
+
 int prio_ratios[PRIO_RANGE] __read_mostly;
 
 static inline unsigned long timeslice(void)
@@ -156,6 +160,8 @@
 	unsigned long long nr_switches;
 	struct list_head queue[PRIO_LIMIT];
 	DECLARE_BITMAP(prio_bitmap, PRIO_LIMIT + 1);
+	unsigned long iso_ticks;
+	unsigned short iso_refractory;
 };
 
 static struct global_rq grq;
@@ -498,6 +504,11 @@
 		!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)));
 }
 
+static int isoprio_suitable(void)
+{
+	return !grq.iso_refractory;
+}
+
 /*
  * Adding to the global runqueue. Enter with grq locked.
  */
@@ -509,6 +520,13 @@
 		else
 			p->prio = NORMAL_PRIO;
 	}
+
+	if (iso_task(p) && !rt_task(p)) {
+		if (isoprio_suitable())
+			p->prio = p->normal_prio;
+		else
+			p->prio = NORMAL_PRIO;
+	}
 	__set_bit(p->prio, grq.prio_bitmap);
 	list_add_tail(&p->run_list, grq.queue + p->prio);
 	sched_info_queued(p);
@@ -522,7 +540,6 @@
 	sched_info_queued(p);
 }
 
-/* No need to do anything, it happens on return_task */
 static inline void requeue_task(struct task_struct *p)
 {
 	sched_info_queued(p);
@@ -571,6 +588,8 @@
 		return MAX_RT_PRIO - 1 - p->rt_priority;
 	if (idleprio_task(p))
 		return IDLE_PRIO;
+	if (iso_task(p))
+		return ISO_PRIO;
 	return NORMAL_PRIO;
 }
 
@@ -1661,11 +1680,63 @@
 {
 	account_idle_time(jiffies_to_cputime(ticks));
 }
-
 #endif
+
+/*
+ * Test if SCHED_ISO tasks have run longer than their allotted period as RT
+ * tasks and set the refractory flag if necessary. There is 10% hysteresis
+ * for unsetting the flag.
+ */
+static unsigned int test_ret_isorefractory(struct rq *rq)
+{
+	if (likely(!grq.iso_refractory)) {
+		if (grq.iso_ticks / ISO_PERIOD > sched_iso_cpu)
+			grq.iso_refractory = 1;
+	} else {
+		if (grq.iso_ticks / ISO_PERIOD < (sched_iso_cpu * 90 / 100))
+			grq.iso_refractory = 0;
+	}
+	return grq.iso_refractory;
+}
+
+/* No SCHED_ISO task was running so decay grq.iso_ticks (overflow safe) */
+static inline void no_iso_tick(void)
+{
+	grq.iso_ticks -= (grq.iso_ticks + ISO_PERIOD - 1) / ISO_PERIOD;
+}
+
+static int task_running_iso(struct task_struct *p)
+{
+	return p->prio == ISO_PRIO;
+}
+
 /* This manages tasks that have run out of timeslice during a scheduler_tick */
 static void task_running_tick(struct rq *rq, struct task_struct *p)
 {
+	/*
+	 * If a SCHED_ISO task is running we increment the iso_ticks. In
+	 * order to prevent SCHED_ISO tasks from causing starvation in the
+	 * presence of true RT tasks we account those as iso_ticks as well.
+	 */
+	if (rt_task(p) || (iso_task(p) && !grq.iso_refractory)) {
+		if (grq.iso_ticks <= (ISO_PERIOD * 100) - 100)
+			grq.iso_ticks += 100;
+	} else
+		no_iso_tick();
+
+	if (iso_task(p)) {
+		if (unlikely(test_ret_isorefractory(rq))) {
+			if (task_running_iso(p)) {
+				/*
+				 * SCHED_ISO task is running as RT and limit
+				 * has been hit. Force it to reschedule as
+				 * SCHED_NORMAL by zeroing its time_slice
+				 */
+				p->time_slice = 0;
+			}
+		}
+	}
+
 	/* SCHED_FIFO tasks never run out of timeslice. */
 	if (p->time_slice > 0 || p->policy == SCHED_FIFO)
 		return;
@@ -1693,6 +1764,8 @@
 	update_cpu_clock(p, rq, now, 1);
 	if (!rq_idle(rq))
 		task_running_tick(rq, p);
+	else
+		no_iso_tick();
 	grq_unlock();
 }
 
@@ -1800,14 +1873,12 @@
 				goto out_take;
 			}
 		}
-	}
-	if (unlikely(idx < NORMAL_PRIO)) {
 		/* More rt tasks, we couldn't take the lower prio ones */
 		++idx;
 		goto retry;
 	}
 
-	/* No rt tasks found, find earliest deadline normal task */
+	/* No rt tasks, find earliest deadline task */
 	edt = idle;
 	if (unlikely(idx >= PRIO_LIMIT)) {
 		/* All rt tasks but none suitable for this cpu */
@@ -1834,7 +1905,7 @@
 		}
 	}
 	if (edt == idle) {
-		if (idx == NORMAL_PRIO) {
+		if (idx < IDLE_PRIO) {
 			/* Haven't checked for SCHED_IDLEPRIO tasks yet */
 			idx++;
 			goto retry;
@@ -2597,19 +2668,18 @@
  *
  * This is the priority value as seen by users in /proc.
  * RT tasks are offset by -100. Normal tasks are centered
- * around 0, value goes from 0 to +100.
+ * around 1, value goes from 0 to +40.
  */
 int task_prio(const struct task_struct *p)
 {
 	int delta, prio = p->prio - MAX_RT_PRIO;
 
-	if (prio < 0)
+	/* rt tasks and iso tasks */
+	if (prio <= 0)
 		goto out;
 
-	delta = (p->deadline - jiffies) * 100 / prio_ratios[39];
-	if (delta > 100)
-		delta = 100;
-	else if (delta < 0)
+	delta = (p->deadline - jiffies) * 200 / prio_ratios[39];
+	if (delta > 40 || delta < 0)
 		delta = 0;
 	prio += delta;
 out:
@@ -2684,12 +2754,31 @@
 static int __sched_setscheduler(struct task_struct *p, int policy,
 				struct sched_param *param, bool user)
 {
+	struct sched_param zero_param = { .sched_priority = 0 };
 	int queued, retval, oldprio, oldpolicy = -1;
-	unsigned long flags;
+	unsigned long flags, rlim_rtprio = 0;
 	struct rq *rq;
 
 	/* may grab non-irq protected spin_locks */
 	BUG_ON(in_interrupt());
+
+	if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) {
+		unsigned long lflags;
+
+		if (!lock_task_sighand(p, &lflags))
+			return -ESRCH;
+		rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
+		unlock_task_sighand(p, &lflags);
+		if (rlim_rtprio)
+			goto recheck;
+		/*
+		 * If the caller requested an RT policy without having the
+		 * necessary rights, we downgrade the policy to SCHED_ISO.
+		 * We also set the parameter to zero to pass the checks.
+		 */
+		policy = SCHED_ISO;
+		param = &zero_param;
+	}
 recheck:
 	/* double check policy once rq lock held */
 	if (policy < 0)
@@ -2713,14 +2802,6 @@
 	 */
 	if (user && !capable(CAP_SYS_NICE)) {
 		if (is_rt_policy(policy)) {
-			unsigned long rlim_rtprio;
-			unsigned long flags;
-
-			if (!lock_task_sighand(p, &flags))
-				return -ESRCH;
-			rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
-			unlock_task_sighand(p, &flags);
-
 			/* can't set/change the rt policy */
 			if (policy != p->policy && !rlim_rtprio)
 				return -EPERM;
@@ -3249,6 +3330,7 @@
 		break;
 	case SCHED_NORMAL:
 	case SCHED_BATCH:
+	case SCHED_ISO:
 	case SCHED_IDLEPRIO:
 		ret = 0;
 		break;
@@ -3274,6 +3356,7 @@
 		break;
 	case SCHED_NORMAL:
 	case SCHED_BATCH:
+	case SCHED_ISO:
 	case SCHED_IDLEPRIO:
 		ret = 0;
 	}
@@ -5441,7 +5524,7 @@
 		atomic_set(&rq->nr_iowait, 0);
 		highest_cpu = i;
 	}
-	grq.nr_running = grq.nr_uninterruptible = 0;
+	grq.iso_ticks = grq.nr_running = grq.nr_uninterruptible = 0;
 	for (i = 0; i < PRIO_LIMIT; i++)
 		INIT_LIST_HEAD(grq.queue + i);
 	bitmap_zero(grq.prio_bitmap, PRIO_LIMIT);
@@ -5520,7 +5603,7 @@
 
 	read_lock_irq(&tasklist_lock);
 	do_each_thread(g, p) {
-		if (!rt_task(p))
+		if (!rt_task(p) && !iso_task(p))
 			continue;
 
 		spin_lock_irqsave(&p->pi_lock, flags);
Index: linux-2.6.30-bfs/kernel/sysctl.c
===================================================================
--- linux-2.6.30-bfs.orig/kernel/sysctl.c	2009-08-30 16:06:43.348393615 +1000
+++ linux-2.6.30-bfs/kernel/sysctl.c	2009-08-30 16:10:32.167392803 +1000
@@ -84,6 +84,7 @@
 extern int latencytop_enabled;
 extern int sysctl_nr_open_min, sysctl_nr_open_max;
 extern int rr_interval;
+extern int sched_iso_cpu;
 #ifndef CONFIG_MMU
 extern int sysctl_nr_trim_pages;
 #endif
@@ -653,6 +654,17 @@
 		.extra1		= &one,
 		.extra2		= &five_thousand,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "iso_cpu",
+		.data		= &sched_iso_cpu,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
 #if defined(CONFIG_S390) && defined(CONFIG_SMP)
 	{
 		.ctl_name	= KERN_SPIN_RETRY,