Index: linux-2.6.9-rc2-mm1/fs/proc/array.c =================================================================== --- linux-2.6.9-rc2-mm1.orig/fs/proc/array.c 2004-09-19 19:00:22.000000000 +1000 +++ linux-2.6.9-rc2-mm1/fs/proc/array.c 2004-09-19 19:12:46.000000000 +1000 @@ -162,8 +162,7 @@ static inline char * task_state(struct t read_lock(&tasklist_lock); buffer += sprintf(buffer, "State:\t%s\n" - "sleep_time:\t%lu\n" - "total_time:\t%lu\n" + "Burst:\t%d\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" @@ -171,7 +170,7 @@ static inline char * task_state(struct t "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), - p->sleep_time, p->total_time, + p->burst, p->tgid, p->pid, p->pid ? p->real_parent->pid : 0, p->pid && p->ptrace ? p->parent->pid : 0, Index: linux-2.6.9-rc2-mm1/include/linux/init_task.h =================================================================== --- linux-2.6.9-rc2-mm1.orig/include/linux/init_task.h 2004-09-19 19:00:22.000000000 +1000 +++ linux-2.6.9-rc2-mm1/include/linux/init_task.h 2004-09-19 19:02:19.000000000 +1000 @@ -71,13 +71,14 @@ extern struct group_info init_groups; .usage = ATOMIC_INIT(2), \ .flags = 0, \ .lock_depth = -1, \ - .prio = MAX_PRIO-29, \ - .static_prio = MAX_PRIO-29, \ + .prio = MAX_PRIO-20, \ + .static_prio = MAX_PRIO-20, \ .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ .mm = NULL, \ .active_mm = &init_mm, \ .run_list = LIST_HEAD_INIT(tsk.run_list), \ + .time_slice = HZ, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ Index: linux-2.6.9-rc2-mm1/include/linux/sched.h =================================================================== --- linux-2.6.9-rc2-mm1.orig/include/linux/sched.h 2004-09-19 19:00:22.000000000 +1000 +++ linux-2.6.9-rc2-mm1/include/linux/sched.h 2004-09-19 19:13:29.000000000 +1000 @@ -165,6 +165,7 @@ extern void show_stack(struct task_struc void io_schedule(void); long io_schedule_timeout(long timeout); +extern int sched_interactive, sched_compute; extern void cpu_init (void); extern void trap_init(void); @@ -329,7 +330,7 @@ struct signal_struct { #define MAX_USER_RT_PRIO 100 #define MAX_RT_PRIO MAX_USER_RT_PRIO -#define MAX_PRIO (MAX_RT_PRIO + 59) +#define MAX_PRIO (MAX_RT_PRIO + 40) #define rt_task(p) (unlikely((p)->prio < MAX_RT_PRIO)) @@ -360,7 +361,6 @@ extern struct user_struct *find_user(uid extern struct user_struct root_user; #define INIT_USER (&root_user) -typedef struct prio_array prio_array_t; struct backing_dev_info; struct reclaim_state; @@ -601,17 +601,13 @@ struct task_struct { int prio, static_prio; struct list_head run_list; - prio_array_t *array; - - /* Scheduler variables follow. kernel/sched.c */ - unsigned long array_sequence; unsigned long long timestamp; - int used_slice; - - unsigned long total_time, sleep_time; + unsigned long runtime, totalrun; + unsigned int burst; unsigned long policy; cpumask_t cpus_allowed; + unsigned long slice, time_slice; #ifdef CONFIG_SCHEDSTATS struct sched_info sched_info; @@ -792,6 +788,9 @@ do { if (atomic_dec_and_test(&(tsk)->usa #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ #define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */ #define PF_BORROWED_MM 0x00400000 /* I am a kthread doing use_mm */ +#define PF_FORKED 0x00800000 /* I have just forked */ +#define PF_YIELDED 0x01000000 /* I have just yielded */ +#define PF_UISLEEP 0x02000000 /* Uninterruptible sleep */ #ifdef CONFIG_SMP extern int set_cpus_allowed(task_t *p, cpumask_t new_mask); @@ -876,7 +875,6 @@ extern void FASTCALL(wake_up_new_task(st static inline void kick_process(struct task_struct *tsk) { } #endif extern void FASTCALL(sched_fork(task_t * p)); -extern void FASTCALL(sched_exit(task_t * p)); extern int in_group_p(gid_t); extern int in_egroup_p(gid_t); Index: linux-2.6.9-rc2-mm1/include/linux/sysctl.h =================================================================== --- linux-2.6.9-rc2-mm1.orig/include/linux/sysctl.h 2004-09-19 19:00:22.000000000 +1000 +++ linux-2.6.9-rc2-mm1/include/linux/sysctl.h 2004-09-19 19:12:46.000000000 +1000 @@ -134,7 +134,8 @@ enum KERN_SPARC_SCONS_PWROFF=64, /* int: serial console power-off halt */ KERN_HZ_TIMER=65, /* int: hz timer on or off */ KERN_UNKNOWN_NMI_PANIC=66, /* int: unknown nmi panic flag */ - KERN_SCHED_TIMESLICE=67, /* int: base timeslice for scheduler */ + KERN_INTERACTIVE=67, /* interactive tasks can have cpu bursts */ + KERN_COMPUTE=68, /* adjust timeslices for a compute server */ }; Index: linux-2.6.9-rc2-mm1/init/main.c =================================================================== --- linux-2.6.9-rc2-mm1.orig/init/main.c 2004-09-19 18:59:02.000000000 +1000 +++ linux-2.6.9-rc2-mm1/init/main.c 2004-09-19 19:12:46.000000000 +1000 @@ -684,6 +684,7 @@ static inline void fixup_cpu_present_map static int init(void * unused) { lock_kernel(); + current->prio = MAX_PRIO - 1; /* * Tell the world that we're going to be the grim * reaper of innocent orphaned children. Index: linux-2.6.9-rc2-mm1/kernel/exit.c =================================================================== --- linux-2.6.9-rc2-mm1.orig/kernel/exit.c 2004-09-19 18:59:02.000000000 +1000 +++ linux-2.6.9-rc2-mm1/kernel/exit.c 2004-09-19 19:13:51.000000000 +1000 @@ -93,7 +93,6 @@ repeat: } perfctr_release_task(p); - sched_exit(p); write_unlock_irq(&tasklist_lock); spin_unlock(&p->proc_lock); proc_pid_flush(proc_dentry); Index: linux-2.6.9-rc2-mm1/kernel/sched.c =================================================================== --- linux-2.6.9-rc2-mm1.orig/kernel/sched.c 2004-09-19 19:00:29.000000000 +1000 +++ linux-2.6.9-rc2-mm1/kernel/sched.c 2004-09-19 19:15:53.000000000 +1000 @@ -16,6 +16,8 @@ * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin + * 2004-09-13 New staircase scheduling policy by Con Kolivas with help + * from William Lee Irwin III, Zwane Mwaikambo & Peter Williams. */ #include @@ -54,68 +56,35 @@ * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], * and back. */ -#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 30) -#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 30) +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) /* * 'User priority' is the nice value converted to something we * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 58 ] range. + * it's a [ 0 ... 39 ] range. */ #define USER_PRIO(p) ((p)-MAX_RT_PRIO) +#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) -#define US_TO_JIFFIES(x) ((x) * HZ / 1000000) -#define JIFFIES_TO_US(x) ((x) * 1000000 / HZ) - -/* - * MIN_TIMESLICE is the timeslice that a minimum priority process gets if there - * is a maximum priority process runnable. MAX_TIMESLICE is derived from the - * formula in task_timeslice. It cannot be changed here. It is the timesilce - * that the maximum priority process will get. Larger timeslices are attainable - * by low priority processes however. - */ -int sched_base_timeslice = 64; -int sched_min_base = 1; -int sched_max_base = 10000; - -#define RT_TIMESLICE (50 * 1000 / HZ) /* 50ms */ -#define BASE_TIMESLICE (sched_base_timeslice) -#define MIN_TIMESLICE 1 - -/* Maximum amount of history that will be used to calculate priority */ -#define MAX_SLEEP_SHIFT 19 -#define MAX_SLEEP (1UL << MAX_SLEEP_SHIFT) /* roughly 0.52s */ - -/* - * Maximum effect that 1 block of activity (run/sleep/etc) can have. This is - * will moderate dicard freak events (eg. SIGSTOP) - */ -#define MAX_SLEEP_AFFECT (MAX_SLEEP/4) - -/* - * The amount of history can be decreased (on fork for example). This puts a - * lower bound on it. - */ -#define MIN_HISTORY (MAX_SLEEP/8) - -#define FORKED_TS_MAX (US_TO_JIFFIES(MIN_HISTORY) ?: 1) - /* - * SLEEP_FACTOR is a fixed point factor used to scale history tracking things. - * In particular: total_time, sleep_time, sleep_avg. + * Some helpers for time to/from microsecond. (>> 10) approximates (/ 1000) + * to avoid 64 bit division. */ -#define SLEEP_FACTOR 1024 - -/* - * The scheduler classifies a process as performing one of the following - * activities +#define NS_TO_US(TIME) ((TIME) >> 10) +#define JIFFIES_TO_US(TIME) ((TIME) * (1000000 / HZ)) +#define US_TO_JIFFIES(TIME) ((TIME) / (1000000 / HZ)) + +int sched_compute = 0; +/* + *This is the time all tasks within the same priority round robin. + *compute setting is reserved for dedicated computational scheduling + *and has ten times larger intervals. */ -#define STIME_SLEEP 1 /* Sleeping */ -#define STIME_RUN 2 /* Using CPU */ - -#define TASK_PREEMPTS_CURR(p, rq) ( (p)->prio < (rq)->curr->prio ) +#define _RR_INTERVAL (10000) /* microseconds */ +#define RR_INTERVAL() (_RR_INTERVAL * (1 + 9 * sched_compute)) #define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time) @@ -123,17 +92,8 @@ int sched_max_base = 10000; * These are the runqueue data structures: */ -#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) - typedef struct runqueue runqueue_t; -struct prio_array { - int min_prio; - unsigned int nr_active; - unsigned long bitmap[BITMAP_SIZE]; - struct list_head queue[MAX_PRIO]; -}; - /* * This is the main, per-CPU runqueue data structure. * @@ -152,17 +112,17 @@ struct runqueue { #ifdef CONFIG_SMP unsigned long cpu_load; #endif - unsigned long array_sequence; - unsigned long nr_uninterruptible; unsigned long long nr_switches; + unsigned long nr_uninterruptible; + unsigned long long timestamp_last_tick; + unsigned int cache_ticks, preempted; task_t *curr, *idle; struct mm_struct *prev_mm; + unsigned long bitmap[BITS_TO_LONGS(MAX_PRIO+1)]; + struct list_head queue[MAX_PRIO + 1]; atomic_t nr_iowait; - prio_array_t *active, *expired, arrays[2]; #ifdef CONFIG_SMP - unsigned long long timestamp_last_tick; - struct sched_domain *sd; /* For active balancing */ @@ -359,6 +319,20 @@ struct file_operations proc_schedstat_op # define schedstat_add(rq, field, amt) do { } while (0); #endif +/* + * rq_lock - lock a given runqueue and disable interrupts. + */ +static runqueue_t *this_rq_lock(void) +{ + runqueue_t *rq; + + local_irq_disable(); + rq = this_rq(); + spin_lock(&rq->lock); + + return rq; +} + static inline void rq_unlock(runqueue_t *rq) { spin_unlock_irq(&rq->lock); @@ -470,34 +444,26 @@ static inline void sched_info_switch(tas #define sched_info_switch(t, next) do { } while (0) #endif /* CONFIG_SCHEDSTATS */ +static inline int task_queued(task_t *task) +{ + return !list_empty(&task->run_list); +} + /* - * Adding/removing a task to/from a priority array: + * Adding/removing a task to/from a runqueue: */ -static void dequeue_task(struct task_struct *p, prio_array_t *array) +static void dequeue_task(struct task_struct *p, runqueue_t *rq) { - array->nr_active--; - list_del(&p->run_list); - if (list_empty(array->queue + p->prio)) - __clear_bit(p->prio, array->bitmap); + list_del_init(&p->run_list); + if (list_empty(rq->queue + p->prio)) + __clear_bit(p->prio, rq->bitmap); } -static void enqueue_task(struct task_struct *p, prio_array_t *array) +static void enqueue_task(struct task_struct *p, runqueue_t *rq) { - struct list_head *entry = array->queue + p->prio; + list_add_tail(&p->run_list, rq->queue + p->prio); + __set_bit(p->prio, rq->bitmap); sched_info_queued(p); - - if (!rt_task(p)) { - /* - * Cycle tasks on the same priority level. This reduces their - * timeslice fluctuations due to higher priority tasks expiring. - */ - if (!list_empty(entry)) - entry = entry->next; - } - list_add_tail(&p->run_list, entry); - __set_bit(p->prio, array->bitmap); - array->nr_active++; - p->array = array; } /* @@ -505,142 +471,153 @@ static void enqueue_task(struct task_str * remote queue so we want these tasks to show up at the head of the * local queue: */ -static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) +static void enqueue_task_head(struct task_struct *p, runqueue_t *rq) { - list_add(&p->run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); - array->nr_active++; - p->array = array; + list_add(&p->run_list, rq->queue + p->prio); + __set_bit(p->prio, rq->bitmap); } -static inline unsigned long long clock_us(void) +/* + * __activate_task - move a task to the runqueue. + */ +static inline void __activate_task(task_t *p, runqueue_t *rq) { - return sched_clock() >> 10; + enqueue_task(p, rq); + rq->nr_running++; } /* - * add_task_time updates a task @p after @time of doing the specified @type - * of activity. See STIME_*. This is used for priority calculation. + * __activate_idle_task - move idle task to the _front_ of runqueue. */ -static inline void add_task_time(task_t *p, unsigned long long time, unsigned long type) +static inline void __activate_idle_task(task_t *p, runqueue_t *rq) { - unsigned long ratio; - unsigned long long tmp; - unsigned long t; - - if (type == STIME_SLEEP) { - if (time > MAX_SLEEP_AFFECT*4) - time = MAX_SLEEP_AFFECT*4; - t = ((unsigned long)time + 3) / 4; - } else { - unsigned long div = 60 - USER_PRIO(p->static_prio); - t = (unsigned long)time * 30; - t = t / div; - t = t * 30; - t = t / div; - } - - ratio = MAX_SLEEP - t; - tmp = (unsigned long long)ratio*p->total_time + MAX_SLEEP/2; - tmp >>= MAX_SLEEP_SHIFT; - p->total_time = (unsigned long)tmp; + enqueue_task_head(p, rq); + rq->nr_running++; +} - tmp = (unsigned long long)ratio*p->sleep_time + MAX_SLEEP/2; - tmp >>= MAX_SLEEP_SHIFT; - p->sleep_time = (unsigned long)tmp; +/* + * burst - extra intervals an interactive task can run for at best priority + * instead of descending priorities. + */ +static unsigned int burst(task_t *p) +{ + if (likely(!rt_task(p))) { + unsigned int task_user_prio = TASK_USER_PRIO(p); + return 39 - task_user_prio; + } else + return p->burst; +} - p->total_time += t; - if (type == STIME_SLEEP) - p->sleep_time += t; +static void inc_burst(task_t *p) +{ + unsigned int best_burst; + best_burst = burst(p); + if (p->burst < best_burst) + p->burst++; } -static unsigned long task_sleep_avg(task_t *p) +static void dec_burst(task_t *p) { - return (SLEEP_FACTOR * p->sleep_time) / (p->total_time + 1); + if (p->burst) + p->burst--; } /* - * The higher a thread's priority, the bigger timeslices - * it gets during one round of execution. But even the lowest - * priority thread gets MIN_TIMESLICE worth of execution time. - * - * Timeslices are scaled, so if only low priority processes are running, - * they will all get long timeslices. + * slice - the duration a task runs before getting requeued at it's best + * priority and has it's burst decremented. */ -static int task_timeslice(task_t *p, runqueue_t *rq) +static unsigned long slice(task_t *p) { - int idx, base, delta; - int timeslice; - - if (rt_task(p)) - return RT_TIMESLICE; - - idx = min(p->prio, rq->expired->min_prio); - delta = p->prio - idx; - base = BASE_TIMESLICE * (MAX_USER_PRIO + 1) / (delta + 2); - - base = base * 40 / (70 - USER_PRIO(idx)); - base = base * 40 / (70 - USER_PRIO(idx)); - - timeslice = base * 1000 / HZ; - timeslice >>= 5; - if (timeslice < MIN_TIMESLICE) - timeslice = MIN_TIMESLICE; - - return timeslice; + unsigned long slice = RR_INTERVAL(); + if (likely(!rt_task(p))) + slice += burst(p) * RR_INTERVAL(); + return slice; } /* - * task_priority: calculates a task's priority based on previous running - * history (see add_task_time). The priority is just a simple linear function - * based on sleep_avg and static_prio. + * sched_interactive - sysctl which allows interactive tasks to have bursts */ -static int task_priority(task_t *p) -{ - unsigned long sleep_avg; - int bonus, prio; +int sched_interactive = 1; +/* + * effective_prio - dynamic priority dependent on burst. + * The priority normally decreases by one each RR_INTERVAL. + * As the burst increases the priority stays at the top "stair" or + * priority for longer. + */ +static int effective_prio(task_t *p) +{ + int prio; + unsigned long full_slice, used_slice, first_slice; + unsigned int best_burst; if (rt_task(p)) return p->prio; - sleep_avg = task_sleep_avg(p); - - prio = USER_PRIO(p->static_prio) + 10; - bonus = (((MAX_USER_PRIO + 1) / 3) * sleep_avg + (SLEEP_FACTOR / 2)) - / SLEEP_FACTOR; - prio = MAX_RT_PRIO + prio - bonus; - - if (prio < MAX_RT_PRIO) - return MAX_RT_PRIO; - if (prio > MAX_PRIO-1) - return MAX_PRIO-1; - + best_burst = burst(p); + full_slice = slice(p); + used_slice = full_slice - p->slice; + if (p->burst > best_burst) + p->burst = best_burst; + first_slice = RR_INTERVAL(); + if (sched_interactive && !sched_compute && p->mm) + first_slice *= (p->burst + 1); + prio = MAX_PRIO - 1 - best_burst; + + if (used_slice < first_slice) + return prio; + prio += 1 + (used_slice - first_slice) / RR_INTERVAL(); + if (prio > MAX_PRIO - 1) + prio = MAX_PRIO - 1; return prio; } /* - * __activate_task - move a task to the runqueue. - */ -static inline void __activate_task(task_t *p, runqueue_t *rq, prio_array_t *array) -{ - enqueue_task(p, array); - rq->nr_running++; - if (!rt_task(p)) { - if (p->prio < array->min_prio) - array->min_prio = p->prio; + * recalc_task_prio - this checks for tasks that run ultra short timeslices + * or have just forked a thread/process and make them continue their old + * slice instead of starting a new one at high priority. + */ +static void recalc_task_prio(task_t *p, unsigned long long now) +{ + unsigned long long _sleep_time = now - p->timestamp; + unsigned long sleep_time = NS_TO_US(_sleep_time); + unsigned long rr = RR_INTERVAL(); + unsigned int best_burst = burst(p); + unsigned long minrun = rr * (p->burst + 1) / (best_burst + 1) ? : 1; + + if (p->flags & PF_FORKED || (p->mm && + (p->runtime + sleep_time < minrun || + ((!sched_interactive || sched_compute) && + p->runtime + sleep_time < rr)))) { + unsigned long total_run = p->totalrun + p->runtime; + p->flags &= ~PF_FORKED; + if (p->slice - total_run < 1) { + p->totalrun = 0; + dec_burst(p); + } else { + unsigned int intervals = total_run / rr; + unsigned long remainder; + p->totalrun = total_run; + p->slice -= intervals * rr; + if (p->slice <= rr) { + p->totalrun = 0; + dec_burst(p); + } else { + remainder = p->slice % rr; + if (remainder) + p->time_slice = remainder; + } + } + } else { + if (p->totalrun > (best_burst - p->burst) * rr) + dec_burst(p); + else if (!((p->flags & PF_UISLEEP) || p->totalrun)) + inc_burst(p); + p->runtime = 0; + p->totalrun = 0; } } /* - * __activate_idle_task - move idle task to the _front_ of runqueue. - */ -static inline void __activate_idle_task(task_t *p, runqueue_t *rq) -{ - enqueue_task_head(p, rq->active); - rq->nr_running++; -} - -/* * activate_task - move a task to the runqueue and do priority recalculation * * Update all the scheduling statistics stuff. (sleep average @@ -648,10 +625,7 @@ static inline void __activate_idle_task( */ static void activate_task(task_t *p, runqueue_t *rq, int local) { - unsigned long long now, sleep; - prio_array_t *array; - - now = clock_us(); + unsigned long long now = sched_clock(); #ifdef CONFIG_SMP if (!local) { /* Compensate for drifting sched_clock */ @@ -660,36 +634,26 @@ static void activate_task(task_t *p, run + rq->timestamp_last_tick; } #endif - /* - * If we have slept through an active/expired array switch, restart - * our timeslice too. - */ - - sleep = now - p->timestamp; + p->slice = slice(p); + p->time_slice = RR_INTERVAL(); + recalc_task_prio(p, now); + p->flags &= ~PF_UISLEEP; + p->prio = effective_prio(p); p->timestamp = now; - add_task_time(p, sleep, STIME_SLEEP); - p->prio = task_priority(p); - - array = rq->active; - if (unlikely(p->used_slice == -1)) { - /* This only applys to newly woken children */ - array = rq->expired; - p->used_slice = 0; - } else if (rq->array_sequence != p->array_sequence) - p->used_slice = 0; - - __activate_task(p, rq, array); + __activate_task(p, rq); } /* * deactivate_task - remove a task from the runqueue. */ -static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) +static void deactivate_task(struct task_struct *p, runqueue_t *rq) { - p->array_sequence = rq->array_sequence; rq->nr_running--; - dequeue_task(p, p->array); - p->array = NULL; + if (p->state == TASK_UNINTERRUPTIBLE) { + p->flags |= PF_UISLEEP; + rq->nr_uninterruptible++; + } + dequeue_task(p, rq); } /* @@ -730,6 +694,29 @@ inline int task_curr(const task_t *p) return cpu_curr(task_cpu(p)) == p; } +/* + * cache_delay is the time preemption is delayed in sched_compute mode + * and is set to 5*cache_decay_ticks + */ +static int cache_delay = 10 * HZ / 1000; + +static void preempt(task_t *task, runqueue_t *rq) +{ + task_t *curr = rq->curr; + + if (task->prio > curr->prio) + return; + else if (task->prio == curr->prio && + (task->totalrun || rt_task(curr))) + return; + else if (sched_compute && rq->cache_ticks < cache_delay && + task->mm && !rt_task(task)) { + rq->preempted = 1; + return; + } + resched_task(curr); +} + #ifdef CONFIG_SMP enum request_type { REQ_MOVE_TASK, @@ -762,7 +749,7 @@ static int migrate_task(task_t *p, int d * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. */ - if (!p->array && !task_running(rq, p)) { + if (!task_queued(p) && !task_running(rq, p)) { set_task_cpu(p, dest_cpu); return 0; } @@ -793,7 +780,7 @@ void wait_task_inactive(task_t * p) repeat: rq = task_rq_lock(p, &flags); /* Must be off runqueue entirely, not preempted. */ - if (unlikely(p->array)) { + if (unlikely(task_queued(p))) { /* If it's preempted, we yield. It could be a while. */ preempted = !task_running(rq, p); task_rq_unlock(rq, &flags); @@ -922,7 +909,7 @@ static int try_to_wake_up(task_t * p, un if (!(old_state & state)) goto out; - if (p->array) + if (task_queued(p)) goto out_running; cpu = task_cpu(p); @@ -1001,7 +988,7 @@ out_set_cpu: old_state = p->state; if (!(old_state & state)) goto out; - if (p->array) + if (task_queued(p)) goto out_running; this_cpu = smp_processor_id(); @@ -1012,12 +999,18 @@ out_activate: #endif /* CONFIG_SMP */ if (old_state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; - activate_task(p, rq, cpu == this_cpu); - if (!sync || cpu != this_cpu) { - if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); - } + /* + * Sync wakeups (i.e. those types of wakeups where the waker + * has indicated that it will leave the CPU in short order) + * don't trigger a preemption, if the woken up task will run on + * this cpu. (in this case the 'I will reschedule' promise of + * the waker guarantees that the freshly woken up task is going + * to be considered on this CPU.) + */ + activate_task(p, rq, cpu == this_cpu); + if (!sync || cpu != this_cpu) + preempt(p, rq); success = 1; out_running: @@ -1052,9 +1045,6 @@ static int find_idlest_cpu(struct task_s */ void fastcall sched_fork(task_t *p) { - unsigned long sleep_avg; - runqueue_t *rq; - /* * We mark the process as running here, but have not actually * inserted it onto the runqueue yet. This guarantees that @@ -1063,7 +1053,6 @@ void fastcall sched_fork(task_t *p) */ p->state = TASK_RUNNING; INIT_LIST_HEAD(&p->run_list); - p->array = NULL; spin_lock_init(&p->switch_lock); #ifdef CONFIG_SCHEDSTATS memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -1077,42 +1066,6 @@ void fastcall sched_fork(task_t *p) */ p->thread_info->preempt_count = 1; #endif - - preempt_disable(); - rq = this_rq(); - - /* XXX */ - if (unlikely(p->comm[0] == 'X' && p->comm[1] == 'F')) { - static int warned = 0; - if (!warned) { - printk(KERN_INFO "Renicing %s for you\n", p->comm); - warned = 1; - } - p->static_prio = NICE_TO_PRIO(-10); - } - - /* Get MIN_HISTORY of history with the same sleep_avg as parent. */ - sleep_avg = task_sleep_avg(current); - p->total_time = MIN_HISTORY; - p->sleep_time = p->total_time * sleep_avg / SLEEP_FACTOR; - - /* Parent loses 1/4 of sleep time for forking */ - current->sleep_time = 3*current->sleep_time/4; - - p->used_slice = 0; - local_irq_disable(); - if (unlikely(current->used_slice == -1 || current == rq->idle)) - p->used_slice = -1; - else { - int ts = task_timeslice(current, rq); - current->used_slice += (ts + 3) / 4; - if (current->used_slice >= ts) { - current->used_slice = -1; - set_need_resched(); - } - } - local_irq_enable(); - preempt_enable(); } /* @@ -1126,76 +1079,42 @@ void fastcall wake_up_new_task(task_t * { unsigned long flags; int this_cpu, cpu; - runqueue_t *rq; - prio_array_t *array; + runqueue_t *rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_RUNNING); - p->prio = task_priority(p); - p->timestamp = clock_us(); - - rq = task_rq_lock(p, &flags); - this_cpu = smp_processor_id(); + /* + * Forked process gets no burst to prevent fork bombs. + */ + p->burst = 0; + p->parent->flags |= PF_FORKED; cpu = task_cpu(p); + this_cpu = smp_processor_id(); schedstat_inc(rq, wunt_cnt); - array = rq->active; - if (unlikely(p->used_slice == -1)) { - p->used_slice = 0; - array = rq->expired; - } else { - int total = task_timeslice(p, rq); - int ts = max((total + 3) / 4, MIN_TIMESLICE); - ts = min(ts, (int)FORKED_TS_MAX); - p->used_slice = total - ts; - } - if (likely(cpu == this_cpu)) { - if (!(clone_flags & CLONE_VM) && likely(array == rq->active)) { - /* - * The VM isn't cloned, so we're in a good position to - * do child-runs-first in anticipation of an exec. This - * usually avoids a lot of COW overhead. - */ - if (p->prio >= current->prio) { - p->prio = current->prio; - list_add_tail(&p->run_list, ¤t->run_list); - p->array = current->array; - p->array->nr_active++; - rq->nr_running++; - } else - __activate_task(p, rq, array); - - set_need_resched(); - } else { - /* Run child last */ - __activate_task(p, rq, array); + if (unlikely(!task_queued(current))) + __activate_task(p, rq); + else { + list_add_tail(&p->run_list, ¤t->run_list); + rq->nr_running++; } -#ifdef CONFIG_SMP } else { - runqueue_t *this_rq = this_rq(); - + runqueue_t *this_rq = cpu_rq(this_cpu); /* * Not the local CPU - must adjust timestamp. This should * get optimised away in the !CONFIG_SMP case. */ - p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) - + rq->timestamp_last_tick; - __activate_task(p, rq, array); - if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); - + p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) + + rq->timestamp_last_tick; + __activate_task(p, rq); + preempt(p, rq); schedstat_inc(rq, wunt_moved); -#endif } task_rq_unlock(rq, &flags); } -void fastcall sched_exit(task_t * p) -{ -} - /** * finish_task_switch - clean up after a task-switch * @prev: the thread we just switched away from. @@ -1492,27 +1411,21 @@ out: * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. */ -static inline -void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, - runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) +static inline void pull_task +(runqueue_t *src_rq, task_t *p, runqueue_t *this_rq, int this_cpu) { - dequeue_task(p, src_array); + dequeue_task(p, src_rq); src_rq->nr_running--; set_task_cpu(p, this_cpu); this_rq->nr_running++; - enqueue_task(p, this_array); - if (!rt_task(p)) { - if (p->prio < this_array->min_prio) - this_array->min_prio = p->prio; - } + enqueue_task(p, this_rq); p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + this_rq->timestamp_last_tick; /* * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. */ - if (TASK_PREEMPTS_CURR(p, this_rq)) - resched_task(this_rq->curr); + preempt(p, this_rq); } /* @@ -1554,7 +1467,6 @@ static int move_tasks(runqueue_t *this_r unsigned long max_nr_move, struct sched_domain *sd, enum idle_type idle) { - prio_array_t *array, *dst_array; struct list_head *head, *curr; int idx, pulled = 0; task_t *tmp; @@ -1562,38 +1474,17 @@ static int move_tasks(runqueue_t *this_r if (max_nr_move <= 0 || busiest->nr_running <= 1) goto out; - /* - * We first consider expired tasks. Those will likely not be - * executed in the near future, and they are most likely to - * be cache-cold, thus switching CPUs has the least effect - * on them. - */ - if (busiest->expired->nr_active) { - array = busiest->expired; - dst_array = this_rq->expired; - } else { - array = busiest->active; - dst_array = this_rq->active; - } - -new_array: /* Start searching at priority 0: */ idx = 0; skip_bitmap: if (!idx) - idx = sched_find_first_bit(array->bitmap); + idx = sched_find_first_bit(busiest->bitmap); else - idx = find_next_bit(array->bitmap, MAX_PRIO, idx); - if (idx >= MAX_PRIO) { - if (array == busiest->expired && busiest->active->nr_active) { - array = busiest->active; - dst_array = this_rq->active; - goto new_array; - } + idx = find_next_bit(busiest->bitmap, MAX_PRIO, idx); + if (idx >= MAX_PRIO) goto out; - } - head = array->queue + idx; + head = busiest->queue + idx; curr = head->prev; skip_queue: tmp = list_entry(curr, task_t, run_list); @@ -1615,7 +1506,7 @@ skip_queue: schedstat_inc(this_rq, pt_gained[idle]); schedstat_inc(busiest, pt_lost[idle]); - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); + pull_task(busiest, tmp, this_rq, this_cpu); pulled++; /* We only want to steal up to the prescribed number of tasks. */ @@ -1802,6 +1693,7 @@ static int load_balance(int this_cpu, ru unsigned long imbalance; int nr_moved; + spin_lock(&this_rq->lock); schedstat_inc(sd, lb_cnt[idle]); group = find_busiest_group(sd, this_cpu, &imbalance, idle); @@ -1836,11 +1728,12 @@ static int load_balance(int this_cpu, ru * still unbalanced. nr_moved simply stays zero, so it is * correctly treated as an imbalance. */ - double_rq_lock(this_rq, busiest); + double_lock_balance(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, idle); - double_rq_unlock(this_rq, busiest); + spin_unlock(&busiest->lock); } + spin_unlock(&this_rq->lock); if (!nr_moved) { schedstat_inc(sd, lb_failed[idle]); @@ -1874,6 +1767,8 @@ static int load_balance(int this_cpu, ru return nr_moved; out_balanced: + spin_unlock(&this_rq->lock); + /* tune up the balancing interval */ if (sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; @@ -2075,10 +1970,11 @@ static inline void idle_balance(int cpu, } #endif -static inline int wake_priority_sleeper(runqueue_t *rq) +static int wake_priority_sleeper(runqueue_t *rq) { - int ret = 0; #ifdef CONFIG_SCHED_SMT + int ret = 0; + spin_lock(&rq->lock); /* * If an SMT sibling task has been put to sleep for priority @@ -2089,8 +1985,10 @@ static inline int wake_priority_sleeper( ret = 1; } spin_unlock(&rq->lock); -#endif return ret; +#else /* if we dont ifdef it out like this it wont be optimised out */ + return 0; +#endif } DEFINE_PER_CPU(struct kernel_stat, kstat); @@ -2098,24 +1996,42 @@ DEFINE_PER_CPU(struct kernel_stat, kstat EXPORT_PER_CPU_SYMBOL(kstat); /* + * Tasks that run out of time_slice but still have slice left get + * requeued with a lower priority && rr_interval time_slice. + */ +static void time_slice_expired(task_t *p, runqueue_t *rq) +{ + set_tsk_need_resched(p); + dequeue_task(p, rq); + p->prio = effective_prio(p); + p->time_slice = RR_INTERVAL(); + enqueue_task(p, rq); +} + +/* + * Tasks lose burst each time they use up a full slice(). + */ +static void slice_expired(task_t *p, runqueue_t *rq) +{ + dec_burst(p); + p->slice = slice(p); + time_slice_expired(p, rq); +} + +/* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. - * - * It also gets called by the fork code, when changing the parent's - * timeslices. */ void scheduler_tick(int user_ticks, int sys_ticks) { - enum idle_type cpu_status; int cpu = smp_processor_id(); struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; runqueue_t *rq = this_rq(); task_t *p = current; - int ts; + unsigned long long _decrement; + long decrement; -#ifdef CONFIG_SMP - rq->timestamp_last_tick = clock_us(); -#endif + rq->timestamp_last_tick = sched_clock(); if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_ticks); @@ -2129,7 +2045,6 @@ void scheduler_tick(int user_ticks, int sys_ticks = 0; } - cpu_status = NOT_IDLE; if (p == rq->idle) { if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait += sys_ticks; @@ -2137,36 +2052,51 @@ void scheduler_tick(int user_ticks, int cpustat->idle += sys_ticks; if (wake_priority_sleeper(rq)) goto out; - cpu_status = SCHED_IDLE; - goto out; + rebalance_tick(cpu, rq, SCHED_IDLE); + return; } if (TASK_NICE(p) > 0) cpustat->nice += user_ticks; else cpustat->user += user_ticks; cpustat->system += sys_ticks; - - /* Task might have expired already, but not scheduled off yet */ - if (unlikely(p->used_slice == -1)) - goto out; - + /* + * SCHED_FIFO tasks never run out of timeslice. + */ if (unlikely(p->policy == SCHED_FIFO)) goto out; - /* p was running during this tick. Update its time slice counter. */ - p->used_slice++; - ts = task_timeslice(p, rq); - if (unlikely(p->used_slice >= ts)) { - p->used_slice = -1; - set_tsk_need_resched(p); - } + spin_lock(&rq->lock); + rq->cache_ticks++; + decrement = JIFFIES_TO_US(1); + _decrement = rq->timestamp_last_tick - p->timestamp; + _decrement = NS_TO_US(_decrement); + if (_decrement > 0 && _decrement < decrement) + decrement = _decrement; + if (p->slice > decrement && US_TO_JIFFIES(p->slice - decrement)) + p->slice -= decrement; + else { + slice_expired(p, rq); + goto out_unlock; + } + if (p->time_slice > decrement && + US_TO_JIFFIES(p->time_slice - decrement)) + p->time_slice -= decrement; + else { + time_slice_expired(p, rq); + goto out_unlock; + } + if (rq->preempted && rq->cache_ticks >= cache_delay) + set_tsk_need_resched(p); +out_unlock: + spin_unlock(&rq->lock); out: - rebalance_tick(cpu, rq, cpu_status); + rebalance_tick(cpu, rq, NOT_IDLE); } #ifdef CONFIG_SCHED_SMT -static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) +static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) { struct sched_domain *sd = this_rq->sd; cpumask_t sibling_map; @@ -2211,11 +2141,10 @@ static inline void wake_sleeping_depende */ } -static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) +static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) { struct sched_domain *sd = this_rq->sd; cpumask_t sibling_map; - prio_array_t *array; int ret = 0, i; task_t *p; @@ -2238,12 +2167,8 @@ static inline int dependent_sleeper(int */ if (!this_rq->nr_running) goto out_unlock; - array = this_rq->active; - if (!array->nr_active) - array = this_rq->expired; - BUG_ON(!array->nr_active); - p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, + p = list_entry(this_rq->queue[sched_find_first_bit(this_rq->bitmap)].next, task_t, run_list); for_each_cpu_mask(i, sibling_map) { @@ -2258,7 +2183,8 @@ static inline int dependent_sleeper(int * task from using an unfair proportion of the * physical cpu's resources. -ck */ - if ((smt_curr->static_prio + 5 < p->static_prio) && + if (((smt_curr->slice * (100 - sd->per_cpu_gain) / 100) > + slice(p) || rt_task(smt_curr)) && p->mm && smt_curr->mm && !rt_task(p)) ret = 1; @@ -2267,10 +2193,11 @@ static inline int dependent_sleeper(int * or wake it up if it has been put to sleep for priority * reasons. */ - if ((p->static_prio + 5 < smt_curr->static_prio && + if ((((p->slice * (100 - sd->per_cpu_gain) / 100) > + slice(smt_curr) || rt_task(p)) && smt_curr->mm && p->mm && !rt_task(smt_curr)) || (smt_curr == smt_rq->idle && smt_rq->nr_running)) - resched_task(smt_curr); + resched_task(smt_curr); } out_unlock: for_each_cpu_mask(i, sibling_map) @@ -2296,10 +2223,8 @@ asmlinkage void __sched schedule(void) long *switch_count; task_t *prev, *next; runqueue_t *rq; - prio_array_t *array; struct list_head *queue; unsigned long long now; - unsigned long run_time; int cpu, idx; /* @@ -2307,11 +2232,13 @@ asmlinkage void __sched schedule(void) * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ - if (unlikely(in_atomic()) && - likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) { - printk(KERN_ERR "scheduling while atomic: %s/0x%08x/%d\n", - current->comm, preempt_count(), current->pid); - dump_stack(); + if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) { + if (unlikely(in_atomic())) { + printk(KERN_ERR "scheduling while atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), current->pid); + dump_stack(); + } } need_resched: @@ -2330,11 +2257,27 @@ need_resched: release_kernel_lock(prev); schedstat_inc(rq, sched_cnt); - now = clock_us(); - run_time = now - prev->timestamp; + now = sched_clock(); + prev->runtime = NS_TO_US(now - prev->timestamp) ? : 1; + if (prev->mm && prev->policy != SCHED_FIFO && + prev->state == TASK_RUNNING && + prev->timestamp > rq->timestamp_last_tick) { + /* + * We have not run through a scheduler_tick and are + * still running so charge us with the runtime. + */ + if (unlikely(US_TO_JIFFIES(prev->slice - + prev->runtime) < 1)) + slice_expired(prev, rq); + else if (unlikely(US_TO_JIFFIES(prev->time_slice - + prev->runtime) < 1)) + time_slice_expired(prev, rq); + else { + prev->slice -= prev->runtime; + prev->time_slice -= prev->runtime; + } + } prev->timestamp = now; - add_task_time(prev, run_time, STIME_RUN); - spin_lock_irq(&rq->lock); /* @@ -2347,40 +2290,16 @@ need_resched: if (unlikely((prev->state & TASK_INTERRUPTIBLE) && unlikely(signal_pending(prev)))) prev->state = TASK_RUNNING; - else { + else deactivate_task(prev, rq); - if (prev->state == TASK_UNINTERRUPTIBLE) - rq->nr_uninterruptible++; - goto no_check_expired; - } } - if (unlikely(prev->used_slice == -1)) { - if (rt_task(prev)) { - /* SCHED_FIFO can come in here too, from sched_yield */ - dequeue_task(prev, prev->array); - enqueue_task(prev, rq->active); - } else { - dequeue_task(prev, prev->array); - prev->prio = task_priority(prev); - enqueue_task(prev, rq->expired); - if (prev->prio < rq->expired->min_prio) - rq->expired->min_prio = prev->prio; - } - prev->used_slice = 0; - } -no_check_expired: - cpu = smp_processor_id(); if (unlikely(!rq->nr_running)) { go_idle: - rq->array_sequence++; idle_balance(cpu, rq); if (!rq->nr_running) { next = rq->idle; - rq->arrays[0].min_prio = MAX_PRIO; - rq->arrays[1].min_prio = MAX_PRIO; - wake_sleeping_dependent(cpu, rq); /* * wake_sleeping_dependent() might have released @@ -2405,22 +2324,10 @@ go_idle: goto go_idle; } - array = rq->active; - if (unlikely(!array->nr_active)) { - /* - * Switch the active and expired arrays. - */ - schedstat_inc(rq, sched_switch); - rq->array_sequence++; - rq->active = rq->expired; - rq->expired = array; - rq->expired->min_prio = MAX_PRIO; - array = rq->active; - } else - schedstat_inc(rq, sched_noswitch); + schedstat_inc(rq, sched_noswitch); - idx = sched_find_first_bit(array->bitmap); - queue = array->queue + idx; + idx = sched_find_first_bit(rq->bitmap); + queue = rq->queue + idx; next = list_entry(queue->next, task_t, run_list); switch_tasks: @@ -2428,8 +2335,17 @@ switch_tasks: clear_tsk_need_resched(prev); rcu_qsctr_inc(task_cpu(prev)); + if (next->flags & PF_YIELDED) { + next->flags &= ~PF_YIELDED; + dequeue_task(next, rq); + next->prio = effective_prio(next); + enqueue_task_head(next, rq); + } + sched_info_switch(prev, next); if (likely(prev != next)) { + rq->preempted = 0; + rq->cache_ticks = 0; next->timestamp = now; rq->nr_switches++; rq->curr = next; @@ -2692,9 +2608,8 @@ EXPORT_SYMBOL(sleep_on_timeout); void set_user_nice(task_t *p, long nice) { unsigned long flags; - prio_array_t *array; runqueue_t *rq; - int old_prio, new_prio, delta; + int queued, old_prio, new_prio, delta; if (TASK_NICE(p) == nice || nice < -20 || nice > 19) return; @@ -2713,9 +2628,8 @@ void set_user_nice(task_t *p, long nice) p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } - array = p->array; - if (array) - dequeue_task(p, array); + if ((queued = task_queued(p))) + dequeue_task(p, rq); old_prio = p->prio; new_prio = NICE_TO_PRIO(nice); @@ -2723,8 +2637,8 @@ void set_user_nice(task_t *p, long nice) p->static_prio = NICE_TO_PRIO(nice); p->prio += delta; - if (array) { - enqueue_task(p, array); + if (queued) { + enqueue_task(p, rq); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -2836,7 +2750,7 @@ static inline task_t *find_process_by_pi /* Actually do priority change: must hold rq lock. */ static void __setscheduler(struct task_struct *p, int policy, int prio) { - BUG_ON(p->array); + BUG_ON(task_queued(p)); p->policy = policy; p->rt_priority = prio; if (policy != SCHED_NORMAL) @@ -2852,8 +2766,7 @@ static int setscheduler(pid_t pid, int p { struct sched_param lp; int retval = -EINVAL; - int oldprio; - prio_array_t *array; + int queued, oldprio; unsigned long flags; runqueue_t *rq; task_t *p; @@ -2914,17 +2827,13 @@ static int setscheduler(pid_t pid, int p if (retval) goto out_unlock; - array = p->array; - if (array) - deactivate_task(p, rq); + if ((queued = task_queued(p))) + deactivate_task(p, task_rq(p)); retval = 0; oldprio = p->prio; __setscheduler(p, policy, lp.sched_priority); - if (policy == SCHED_FIFO || policy == SCHED_RR) - p->used_slice = 0; - - if (array) { - __activate_task(p, rq, array); + if (queued) { + __activate_task(p, task_rq(p)); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on @@ -2933,8 +2842,8 @@ static int setscheduler(pid_t pid, int p if (task_running(rq, p)) { if (p->prio > oldprio) resched_task(rq->curr); - } else if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); + } else + preempt(p, rq); } out_unlock: @@ -3171,37 +3080,33 @@ asmlinkage long sys_sched_getaffinity(pi /** * sys_sched_yield - yield the current processor to other threads. * - * this function yields the current CPU by moving the calling thread - * to the expired array. If there are no other threads running on this - * CPU then this function will return. + * this function yields the current CPU by dropping the priority of current + * to the lowest priority and setting the PF_YIELDED flag. */ asmlinkage long sys_sched_yield(void) { -#ifdef CONFIG_SCHEDSTATS - runqueue_t *rq; -#endif - - local_irq_disable(); -#ifdef CONFIG_SCHEDSTATS - rq = this_rq(); + runqueue_t *rq = this_rq_lock(); schedstat_inc(rq, yld_cnt); - spin_lock(&rq->lock); - if (current->array->nr_active == 1) { - schedstat_inc(rq, yld_act_empty); - if (!rq->expired->nr_active) - schedstat_inc(rq, yld_both_empty); - } else if (!rq->expired->nr_active) - schedstat_inc(rq, yld_exp_empty); + /* + * (special rule: RT tasks will just roundrobin.) + */ + dequeue_task(current, rq); + current->slice = slice(current); + current->time_slice = RR_INTERVAL(); + if (likely(!rt_task(current))) { + current->flags |= PF_YIELDED; + current->prio = MAX_PRIO - 1; + } + current->burst = 0; + enqueue_task(current, rq); + /* * Since we are going to call schedule() anyway, there's * no need to preempt or enable interrupts: */ _raw_spin_unlock(&rq->lock); preempt_enable_no_resched(); -#endif - current->used_slice = -1; - local_irq_enable(); schedule(); @@ -3376,8 +3281,6 @@ long sys_sched_rr_get_interval(pid_t pid int retval = -EINVAL; struct timespec t; task_t *p; - unsigned long flags; - runqueue_t *rq; if (pid < 0) goto out_nounlock; @@ -3392,9 +3295,8 @@ long sys_sched_rr_get_interval(pid_t pid if (retval) goto out_unlock; - rq = task_rq_lock(p, &flags); - jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : task_timeslice(p, rq), &t); - task_rq_unlock(rq, &flags); + jiffies_to_timespec(p->policy & SCHED_FIFO ? + 0 : slice(p), &t); read_unlock(&tasklist_lock); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; out_nounlock: @@ -3507,10 +3409,9 @@ void __devinit init_idle(task_t *idle, i runqueue_t *rq = cpu_rq(cpu); unsigned long flags; - idle->array = NULL; idle->prio = MAX_PRIO; idle->state = TASK_RUNNING; - idle->used_slice = 0; + idle->burst = 0; set_task_cpu(idle, cpu); spin_lock_irqsave(&rq->lock, flags); @@ -3624,7 +3525,7 @@ static void __migrate_task(struct task_s goto out; set_task_cpu(p, dest_cpu); - if (p->array) { + if (task_queued(p)) { /* * Sync timestamp with rq_dest's before activating. * The same thing could be achieved by doing this step @@ -3635,8 +3536,7 @@ static void __migrate_task(struct task_s + rq_dest->timestamp_last_tick; deactivate_task(p, rq_src); activate_task(p, rq_dest, 0); - if (TASK_PREEMPTS_CURR(p, rq_dest)) - resched_task(rq_dest->curr); + preempt(p, rq_dest); } out: @@ -4364,15 +4264,15 @@ int in_sched_functions(unsigned long add void __init sched_init(void) { runqueue_t *rq; - int i, j, k; + int i, j; for (i = 0; i < NR_CPUS; i++) { - prio_array_t *array; rq = cpu_rq(i); spin_lock_init(&rq->lock); - rq->active = rq->arrays; - rq->expired = rq->arrays + 1; + + rq->cache_ticks = 0; + rq->preempted = 0; #ifdef CONFIG_SMP rq->sd = &sched_domain_dummy; @@ -4381,19 +4281,16 @@ void __init sched_init(void) rq->push_cpu = 0; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); + cache_delay = cache_decay_ticks * 5; #endif atomic_set(&rq->nr_iowait, 0); - - for (j = 0; j < 2; j++) { - array = rq->arrays + j; - array->min_prio = MAX_PRIO; - for (k = 0; k < MAX_PRIO; k++) { - INIT_LIST_HEAD(array->queue + k); - __clear_bit(k, array->bitmap); - } - /* delimiter for bitsearch */ - __set_bit(MAX_PRIO, array->bitmap); - } + for (j = 0; j <= MAX_PRIO; j++) + INIT_LIST_HEAD(&rq->queue[j]); + memset(rq->bitmap, 0, BITS_TO_LONGS(MAX_PRIO+1)*sizeof(long)); + /* + * delimiter for bitsearch + */ + __set_bit(MAX_PRIO, rq->bitmap); } /* Index: linux-2.6.9-rc2-mm1/kernel/sysctl.c =================================================================== --- linux-2.6.9-rc2-mm1.orig/kernel/sysctl.c 2004-09-19 19:00:22.000000000 +1000 +++ linux-2.6.9-rc2-mm1/kernel/sysctl.c 2004-09-19 19:12:46.000000000 +1000 @@ -65,9 +65,6 @@ extern int sysctl_lower_zone_protection; extern int min_free_kbytes; extern int printk_ratelimit_jiffies; extern int printk_ratelimit_burst; -extern int sched_base_timeslice; -extern int sched_min_base; -extern int sched_max_base; extern int pid_max_min, pid_max_max; #if defined(CONFIG_X86_LOCAL_APIC) && defined(__i386__) @@ -626,18 +623,6 @@ static ctl_table kern_table[] = { .proc_handler = &proc_unknown_nmi_panic, }, #endif - { - .ctl_name = KERN_SCHED_TIMESLICE, - .procname = "base_timeslice", - .data = &sched_base_timeslice, - .maxlen = sizeof (sched_base_timeslice), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &sched_min_base, - .extra2 = &sched_max_base, - }, - { .ctl_name = 0 } }; @@ -929,7 +914,22 @@ static ctl_table fs_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, - + { + .ctl_name = KERN_INTERACTIVE, + .procname = "interactive", + .data = &sched_interactive, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_COMPUTE, + .procname = "compute", + .data = &sched_compute, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = 0 } }; Index: linux-2.6.9-rc2-mm1/mm/oom_kill.c =================================================================== --- linux-2.6.9-rc2-mm1.orig/mm/oom_kill.c 2004-09-19 19:00:22.000000000 +1000 +++ linux-2.6.9-rc2-mm1/mm/oom_kill.c 2004-09-19 19:02:19.000000000 +1000 @@ -151,10 +151,11 @@ static void __oom_kill_task(task_t *p) printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", p->pid, p->comm); /* - * We give our sacrificial lamb access to all the memory it needs. - * That way it should be able to exit() and clear out its resources - * quickly... + * We give our sacrificial lamb high priority and access to + * all the memory it needs. That way it should be able to + * exit() and clear out its resources quickly... */ + p->time_slice = HZ; p->flags |= PF_MEMALLOC | PF_MEMDIE; /* This process has hardware access, be more careful. */ More recent patches modify files in from_2.6.9-rc2_to_staircase8.4.