Index: linux-2.6.10-rc1-mm4-plugsched/fs/proc/array.c =================================================================== --- linux-2.6.10-rc1-mm4-plugsched.orig/fs/proc/array.c 2004-11-10 10:00:34.000000000 +1100 +++ linux-2.6.10-rc1-mm4-plugsched/fs/proc/array.c 2004-11-10 10:01:07.000000000 +1100 @@ -163,7 +163,6 @@ static inline char * task_state(struct t read_lock(&tasklist_lock); buffer += sprintf(buffer, "State:\t%s\n" - "SleepAVG:\t%lu%%\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" @@ -171,7 +170,6 @@ static inline char * task_state(struct t "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), - (p->sleep_avg/1024)*100/(1020000000/1024), p->tgid, p->pid, p->pid ? p->group_leader->real_parent->tgid : 0, p->pid && p->ptrace ? p->parent->pid : 0, Index: linux-2.6.10-rc1-mm4-plugsched/fs/proc/proc_misc.c =================================================================== --- linux-2.6.10-rc1-mm4-plugsched.orig/fs/proc/proc_misc.c 2004-11-10 10:00:33.000000000 +1100 +++ linux-2.6.10-rc1-mm4-plugsched/fs/proc/proc_misc.c 2004-11-10 10:01:13.000000000 +1100 @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -226,6 +227,18 @@ static int version_read_proc(char *page, return proc_calc_metrics(page, start, off, count, eof, len); } +static int scheduler_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + char *sched_name = scheduler->cpusched_name; + int len; + + strcpy(page, sched_name); + strcat(page, "\n"); + len = strlen(page); + return proc_calc_metrics(page, start, off, count, eof, len); +} + extern struct seq_operations cpuinfo_op; static int cpuinfo_open(struct inode *inode, struct file *file) { @@ -569,6 +582,7 @@ void __init proc_misc_init(void) {"cmdline", cmdline_read_proc}, {"locks", locks_read_proc}, {"execdomains", execdomains_read_proc}, + {"scheduler", scheduler_read_proc}, {NULL,} }; for (p = simple_ones; p->name; p++) Index: 
linux-2.6.10-rc1-mm4-plugsched/include/linux/init_task.h =================================================================== --- linux-2.6.10-rc1-mm4-plugsched.orig/include/linux/init_task.h 2004-11-10 10:00:35.000000000 +1100 +++ linux-2.6.10-rc1-mm4-plugsched/include/linux/init_task.h 2004-11-10 10:01:06.000000000 +1100 @@ -72,14 +72,10 @@ extern struct group_info init_groups; .usage = ATOMIC_INIT(2), \ .flags = 0, \ .lock_depth = -1, \ - .prio = MAX_PRIO-20, \ - .static_prio = MAX_PRIO-20, \ .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ .mm = NULL, \ .active_mm = &init_mm, \ - .run_list = LIST_HEAD_INIT(tsk.run_list), \ - .time_slice = HZ, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ @@ -115,5 +111,4 @@ extern struct group_info init_groups; .private_pages = LIST_HEAD_INIT(tsk.private_pages), \ .private_pages_count = 0, \ } - #endif Index: linux-2.6.10-rc1-mm4-plugsched/include/linux/sched.h =================================================================== --- linux-2.6.10-rc1-mm4-plugsched.orig/include/linux/sched.h 2004-11-10 09:17:07.000000000 +1100 +++ linux-2.6.10-rc1-mm4-plugsched/include/linux/sched.h 2004-11-10 10:01:20.000000000 +1100 @@ -32,6 +32,7 @@ #include #include #include +#include struct exec_domain; @@ -165,9 +166,6 @@ extern void show_regs(struct pt_regs *); */ extern void show_stack(struct task_struct *task, unsigned long *sp); -void io_schedule(void); -long io_schedule_timeout(long timeout); - extern void cpu_init (void); extern void trap_init(void); extern void update_process_times(int user); @@ -179,6 +177,9 @@ extern unsigned long cache_decay_ticks; /* Is this address in the __sched functions? 
*/ extern int in_sched_functions(unsigned long addr); +void __sched io_schedule(void); +long __sched io_schedule_timeout(long timeout); + #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern signed long FASTCALL(schedule_timeout(signed long timeout)); asmlinkage void schedule(void); @@ -329,11 +330,6 @@ struct signal_struct { }; /* - * Priority of a process goes from 0..MAX_PRIO-1, valid RT - * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL tasks are - * in the range MAX_RT_PRIO..MAX_PRIO-1. Priority values - * are inverted: lower p->prio value means higher priority. - * * The MAX_USER_RT_PRIO value allows the actual maximum * RT priority to be separate from the value exported to * user-space. This allows kernel threads to set their @@ -344,9 +340,7 @@ struct signal_struct { #define MAX_USER_RT_PRIO 100 #define MAX_RT_PRIO MAX_USER_RT_PRIO -#define MAX_PRIO (MAX_RT_PRIO + 40) - -#define rt_task(p) (unlikely((p)->prio < MAX_RT_PRIO)) +extern int rt_task(task_t *p); /* * Some day this will be a full-fledged user tracking system.. @@ -464,13 +458,11 @@ struct sched_domain { #endif }; -#ifdef ARCH_HAS_SCHED_DOMAIN /* Useful helpers that arch setup code may use. 
Defined in kernel/sched.c */ -extern cpumask_t cpu_isolated_map; +extern void cpu_attach_domain(struct sched_domain *sd, int cpu); extern void init_sched_build_groups(struct sched_group groups[], cpumask_t span, int (*group_fn)(int cpu)); -extern void cpu_attach_domain(struct sched_domain *sd, int cpu); -#endif /* ARCH_HAS_SCHED_DOMAIN */ +extern cpumask_t cpu_isolated_map; #endif /* CONFIG_SMP */ @@ -514,6 +506,10 @@ int set_current_groups(struct group_info struct audit_context; /* See audit.c */ struct mempolicy; +#include + +extern struct sched_drv *scheduler; + struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ struct thread_info *thread_info; @@ -523,17 +519,11 @@ struct task_struct { int lock_depth; /* Lock depth */ - int prio, static_prio; - struct list_head run_list; - prio_array_t *array; - - unsigned long sleep_avg; - unsigned long long timestamp, last_ran; - int activated; - + int static_prio; /* A commonality between cpu schedulers */ + union cpusched u; + unsigned long policy; cpumask_t cpus_allowed; - unsigned int time_slice, first_time_slice; #ifdef CONFIG_SCHEDSTATS struct sched_info sched_info; @@ -739,6 +729,8 @@ extern int task_prio(const task_t *p); extern int task_nice(const task_t *p); extern int task_curr(const task_t *p); extern int idle_cpu(int cpu); +extern void set_oom_timeslice(task_t *p); +extern task_t *find_process_by_pid(pid_t pid); void yield(void); @@ -764,6 +756,7 @@ static inline int kstack_end(void *addr) extern union thread_union init_thread_union; extern struct task_struct init_task; +extern struct task_struct base_init_task; extern struct mm_struct init_mm; @@ -1085,33 +1078,8 @@ extern void recalc_sigpending(void); extern void signal_wake_up(struct task_struct *t, int resume_stopped); -/* - * Wrappers for p->thread_info->cpu access. No-op on UP. 
- */ -#ifdef CONFIG_SMP - -static inline unsigned int task_cpu(const struct task_struct *p) -{ - return p->thread_info->cpu; -} - -static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) -{ - p->thread_info->cpu = cpu; -} - -#else - -static inline unsigned int task_cpu(const struct task_struct *p) -{ - return 0; -} - -static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) -{ -} - -#endif /* CONFIG_SMP */ +extern unsigned int task_cpu(const struct task_struct *p); +extern void set_task_cpu(struct task_struct *p, unsigned int cpu); #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT extern void arch_pick_mmap_layout(struct mm_struct *mm); Index: linux-2.6.10-rc1-mm4-plugsched/include/linux/scheduler.h =================================================================== --- linux-2.6.10-rc1-mm4-plugsched.orig/include/linux/scheduler.h 2003-03-27 19:01:40.000000000 +1100 +++ linux-2.6.10-rc1-mm4-plugsched/include/linux/scheduler.h 2004-11-10 10:01:22.000000000 +1100 @@ -0,0 +1,129 @@ +#ifndef _LINUX_SCHEDULER_H +#define _LINUX_SCHEDULER_H +/* + * include/linux/scheduler.h + * This contains the driver struct for all the exported per-cpu-scheduler + * functions, and the private per-scheduler data in task_struct. + */ +#define SCHED_NAME_MAX (16) + +/* + * This is the main scheduler driver struct. 
+ */ +struct sched_drv +{ + unsigned int (*task_cpu)(const struct task_struct *); + void (*set_task_cpu)(struct task_struct *, unsigned int); + void (*init_sched_domain_sysctl)(void); + void (*destroy_sched_domain_sysctl)(void); + void (*account_steal_time)(struct task_struct *, cputime_t); + void (*account_system_time)(struct task_struct *, int, cputime_t); + void (*account_user_time)(struct task_struct *, cputime_t); + char cpusched_name[SCHED_NAME_MAX]; + int (*rt_task)(task_t *); + void (*wait_for_completion)(struct completion *); + void (*io_schedule)(void); + long (*io_schedule_timeout)(long); + void (*sched_idle_next)(void); + void (*set_oom_timeslice)(task_t *); + unsigned long (*nr_running)(void); + unsigned long (*nr_uninterruptible)(void); + unsigned long long (*nr_context_switches)(void); + unsigned long (*nr_iowait)(void); + int (*idle_cpu)(int); + void (*init_idle)(task_t *, int); + void (*exit)(task_t *); + void (*fork)(task_t *); + void (*init)(void); + void (*init_smp)(void); + void (*schedule)(void); + void (*tick)(void); + void (*tail)(task_t *); + int (*setscheduler)(pid_t, int, struct sched_param __user *); + void (*set_user_nice)(task_t *, long); + long (*rr_get_interval)(pid_t, struct timespec __user *); + long (*yield)(void); + int (*task_curr)(const task_t *); + int (*task_nice)(const task_t *); + int (*task_prio)(const task_t *); + int (*try_to_wake_up)(task_t *, unsigned, int); + void (*wake_up_new_task)(task_t *, unsigned long); +#ifdef CONFIG_SMP + int (*migration_init)(void); + void (*exec)(void); + int (*set_cpus_allowed)(task_t *, cpumask_t); + void (*wait_task_inactive)(task_t *); + void (*cpu_attach_domain)(struct sched_domain *, int); +#endif +#ifdef CONFIG_SCHEDSTATS + int (*show_schedstat)(struct seq_file *, void *); +#endif +#ifdef CONFIG_MAGIC_SYSRQ + void (*normalize_rt_tasks)(void); +#endif +#ifdef CONFIG_KGDB + struct task_struct * (*kgdb_get_idle)(int); +#endif +}; + +/* + * List functions that have common variants that 
many schedulers use. + */ +extern unsigned int common_task_cpu(const struct task_struct *p); +extern void common_set_task_cpu(struct task_struct *p, unsigned int cpu); + +/* + * All private per-scheduler entries in task_struct are defined here as + * separate structs placed into the cpusched union in task_struct. + */ + +/* Ingosched */ +#ifdef CONFIG_CPUSCHED_INGO +struct cpusched_ingo { + int prio; + struct list_head run_list; + prio_array_t *array; + unsigned int time_slice; + unsigned int first_time_slice; + unsigned long sleep_avg; + unsigned long long timestamp; /* 64-bit, like the task_struct field it replaces */ + unsigned long long last_ran; + int activated; +}; +#endif + +/* Staircase scheduler */ +#ifdef CONFIG_CPUSCHED_STAIRCASE +struct cpusched_sc { + int prio; + struct list_head run_list; + unsigned long sflags; + unsigned long long timestamp; + unsigned long runtime, totalrun, ns_debit; + unsigned int burst; + unsigned int slice, time_slice; +}; +#endif + +/* Minisched scheduler */ +#ifdef CONFIG_CPUSCHED_MINISCHED +struct cpusched_ms { + int prio; + struct list_head run_list; + unsigned int time_slice; +}; +#endif + +union cpusched { +#ifdef CONFIG_CPUSCHED_INGO + struct cpusched_ingo ingosched; +#endif +#ifdef CONFIG_CPUSCHED_STAIRCASE + struct cpusched_sc scsched; +#endif +#ifdef CONFIG_CPUSCHED_MINISCHED + struct cpusched_ms mssched; +#endif +}; + +#endif Index: linux-2.6.10-rc1-mm4-plugsched/init/Kconfig =================================================================== --- linux-2.6.10-rc1-mm4-plugsched.orig/init/Kconfig 2004-11-10 10:00:34.000000000 +1100 +++ linux-2.6.10-rc1-mm4-plugsched/init/Kconfig 2004-11-10 10:01:22.000000000 +1100 @@ -249,6 +249,48 @@ config IKCONFIG_PROC through /proc/config.gz. +config PLUGSCHED + bool "Support for multiple cpu schedulers" + default y + help + Say Y here if you want to compile in support for multiple + cpu schedulers. The cpu scheduler may be selected at boot time + with the boot parameter "cpusched=".
The choice of which cpu + schedulers to compile into the kernel can be made by enabling + "Configure standard kernel features" otherwise all cpu schedulers + supported will be compiled in. + +choice + prompt "Default cpu scheduler" + help + This option allows you to choose which cpu scheduler shall be + booted by default at startup if you have plugsched support, or + it will choose which is the only scheduler compiled in. + +config CPUSCHED_DEFAULT_INGO + bool "Ingosched cpu scheduler" + select CPUSCHED_INGO + ---help--- + This is the default cpu scheduler which is an O(1) dual priority + array scheduler with a hybrid interactive design. + +config CPUSCHED_DEFAULT_STAIRCASE + bool "Staircase cpu scheduler" + select CPUSCHED_STAIRCASE + ---help--- + This scheduler is an O(1) single priority array with a foreground- + background interactive design. + +config CPUSCHED_DEFAULT_MINISCHED + bool "Minisched cpu scheduler" + depends on !SMP + select CPUSCHED_MINISCHED + ---help--- + This scheduler is a low overhead O(1) single priority rr scheduler + for uniprocessor only. + +endchoice + menuconfig EMBEDDED bool "Configure standard kernel features (for small systems)" help @@ -257,6 +299,36 @@ menuconfig EMBEDDED environments which can tolerate a "non-standard" kernel. Only use this if you really know what you are doing. +config CPUSCHED_INGO + bool "Ingosched cpu scheduler" if EMBEDDED + depends on PLUGSCHED + default y + ---help--- + This is the default cpu scheduler which is an O(1) dual priority + array scheduler with a hybrid interactive design. + To boot this cpu scheduler, if it is not the default, use the + bootparam "cpusched=ingosched". + +config CPUSCHED_STAIRCASE + bool "Staircase cpu scheduler" if EMBEDDED + depends on PLUGSCHED + default y + ---help--- + This scheduler is an O(1) single priority array with a foreground- + background interactive design. + To boot this cpu scheduler, if it is not the default, use the + bootparam "cpusched=staircase". 
+ +config CPUSCHED_MINISCHED + bool "Minisched cpu scheduler" if EMBEDDED + depends on PLUGSCHED && !SMP + default y + ---help--- + This scheduler is a low overhead O(1) single priority rr scheduler + for uniprocessor only. + To boot this cpu scheduler, if it is not the default, use the + bootparam "cpusched=minisched". + config KALLSYMS bool "Load all symbols for debugging/kksymoops" if EMBEDDED default y Index: linux-2.6.10-rc1-mm4-plugsched/init/main.c =================================================================== --- linux-2.6.10-rc1-mm4-plugsched.orig/init/main.c 2004-11-10 10:00:34.000000000 +1100 +++ linux-2.6.10-rc1-mm4-plugsched/init/main.c 2004-11-10 10:01:10.000000000 +1100 @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -416,10 +417,11 @@ void __init parse_early_param(void) done = 1; } +struct task_struct base_init_task; + /* * Activate the first processor. */ - asmlinkage void __init start_kernel(void) { char * command_line; @@ -441,6 +443,11 @@ asmlinkage void __init start_kernel(void smp_prepare_boot_cpu(); /* + * Save a copy of the baseline init_task in case we need to start + * another cpu scheduler. + */ + base_init_task = init_task; + /* * Set up the scheduler prior starting any interrupts (such as the * timer interrupt). Full topology setup happens at smp_init() * time - but meanwhile we still have a functioning scheduler. @@ -519,6 +526,7 @@ asmlinkage void __init start_kernel(void acpi_early_init(); /* before LAPIC and SMP init */ + printk("Running with %s cpu scheduler.\n", scheduler->cpusched_name); /* Do the rest non-__init'ed, we're now alive */ rest_init(); } Index: linux-2.6.10-rc1-mm4-plugsched/kernel/Makefile =================================================================== --- linux-2.6.10-rc1-mm4-plugsched.orig/kernel/Makefile 2004-11-10 10:00:35.000000000 +1100 +++ linux-2.6.10-rc1-mm4-plugsched/kernel/Makefile 2004-11-10 10:01:22.000000000 +1100 @@ -2,13 +2,16 @@ # Makefile for the linux kernel. 
# -obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ +obj-y = scheduler.o fork.o exec_domain.o panic.o printk.o profile.o \ exit.o itimer.o time.o softirq.o resource.o \ sysctl.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o intermodule.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o +obj-$(CONFIG_CPUSCHED_INGO) += sched.o +obj-$(CONFIG_CPUSCHED_STAIRCASE) += staircase.o +obj-$(CONFIG_CPUSCHED_MINISCHED) += minisched.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o Index: linux-2.6.10-rc1-mm4-plugsched/kernel/minisched.c =================================================================== --- linux-2.6.10-rc1-mm4-plugsched.orig/kernel/minisched.c 2003-03-27 19:01:40.000000000 +1100 +++ linux-2.6.10-rc1-mm4-plugsched/kernel/minisched.c 2004-11-10 10:19:19.000000000 +1100 @@ -0,0 +1,1112 @@ +/* + * kernel/minisched.c + * + * This is "minisched"; a minimalist uniprocessor scheduler. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define MAX_PRIO (MAX_RT_PRIO + 1) + +#define RR_INTERVAL (10 * HZ / 1000 ? : 1) + +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) + +static unsigned int task_timeslice(task_t *p) +{ + return NICE_TO_PRIO(p->static_prio) * RR_INTERVAL; +} + +typedef struct runqueue runqueue_t; + +/* + * This is the runqueue data structure. 
+ */ +struct runqueue { + spinlock_t lock; + + unsigned long nr_running; + unsigned long long nr_switches; + unsigned long nr_uninterruptible; + task_t *curr, *idle; + struct mm_struct *prev_mm; + unsigned long bitmap[BITS_TO_LONGS(MAX_PRIO+1)]; + struct list_head queue[MAX_PRIO + 1]; + atomic_t nr_iowait; +}; + +static DEFINE_PER_CPU(struct runqueue, runqueues); + +static runqueue_t *rq = &per_cpu(runqueues, 0); + +static int ms_rt_task(task_t *p) +{ + return (unlikely((p)->u.mssched.prio < MAX_RT_PRIO)); +} + +/* + * Default context-switch locking: + */ +#ifndef prepare_arch_switch +# define prepare_arch_switch(next) do { } while (0) +# define finish_arch_switch(next) spin_unlock_irq(&rq->lock) +# define task_running(p) (rq->curr == (p)) +#endif + +/* + * task_rq_lock - lock the runqueue and disable + * interrupts. + */ +static void task_rq_lock(unsigned long *flags) +{ + local_irq_save(*flags); + spin_lock(&rq->lock); +} + +static void task_rq_unlock(unsigned long *flags) +{ + spin_unlock_irqrestore(&rq->lock, *flags); +} + +#ifdef CONFIG_SCHEDSTATS +/* + * No schedstats support. + */ +#define SCHEDSTAT_VERSION 10 + +static int ms_show_schedstat(struct seq_file *seq, void *v) +{ + return 0; +} +#endif + +/* + * rq_lock - lock the runqueue and disable interrupts. 
+ */ +static void rq_lock(void) +{ + local_irq_disable(); + spin_lock(&rq->lock); +} + +static int task_queued(task_t *task) +{ + return !list_empty(&task->u.mssched.run_list); +} + +/* + * Adding/removing a task to/from a runqueue: + */ +static void dequeue_task(struct task_struct *p) +{ + list_del_init(&p->u.mssched.run_list); + if (list_empty(rq->queue + p->u.mssched.prio)) + __clear_bit(p->u.mssched.prio, rq->bitmap); +} + +static void enqueue_task(struct task_struct *p) +{ + list_add_tail(&p->u.mssched.run_list, rq->queue + p->u.mssched.prio); + __set_bit(p->u.mssched.prio, rq->bitmap); +} + +static void requeue_task(struct task_struct *p) +{ + list_move_tail(&p->u.mssched.run_list, rq->queue + p->u.mssched.prio); +} + +static void ms_set_oom_timeslice(task_t *p) +{ + p->u.mssched.time_slice = HZ; +} + +static void __activate_task(task_t *p) +{ + enqueue_task(p); + rq->nr_running++; +} + +static void activate_task(task_t *p) +{ + p->u.mssched.time_slice = task_timeslice(p); + __activate_task(p); +} + +/* + * deactivate_task - remove a task from the runqueue. + */ +static void deactivate_task(struct task_struct *p) +{ + rq->nr_running--; + if (p->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible++; + dequeue_task(p); +} + +/* + * resched_task - mark a task 'to be rescheduled now'. + * + * On UP this means the setting of the need_resched flag. + */ +static inline void resched_task(task_t *p) +{ + set_tsk_need_resched(p); +} + +/** + * task_curr - is this task currently executing? + * @p: the task in question. + */ +static int ms_task_curr(const task_t *p) +{ + return (rq->curr == p); +} + +/* + * Check to see if p preempts rq->curr and resched rq->curr if it does: p must + * have a numerically lower prio, or the same prio and a larger time_slice.
+ */ +static void preempt(task_t *p) +{ + if (likely(p->u.mssched.prio == rq->curr->u.mssched.prio)) { + /* This is true for all non rt tasks */ + if (p->u.mssched.time_slice > rq->curr->u.mssched.time_slice) + /* This selects out higher priority normal tasks */ + resched_task(rq->curr); + goto out; + } + if (p->u.mssched.prio > rq->curr->u.mssched.prio) + /* + * This is a lower priority real time task or a normal task + * While a real time task is running. + */ + goto out; + resched_task(rq->curr); +out: + return; +} + +/*** + * try_to_wake_up - wake up a thread + * @p: the to-be-woken-up thread + * @state: the mask of task states that can be woken + * @sync: do a synchronous wakeup? + * + * Put it on the run-queue if it's not already there. The "current" + * thread is always on the run-queue (except when the actual + * re-schedule is in progress), and as such you're allowed to do + * the simpler "current->state = TASK_RUNNING" to mark yourself + * runnable without the overhead of this. + * + * returns failure only if the task is already active. + */ +static int ms_try_to_wake_up(task_t * p, unsigned int state, int sync) +{ + int success = 0; + unsigned long flags; + long old_state; + + task_rq_lock(&flags); + old_state = p->state; + if (!(old_state & state)) + goto out; + + if (task_queued(p)) + goto out_running; + + if (old_state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible--; + + /* + * Sync wakeups (i.e. those types of wakeups where the waker + * has indicated that it will leave the CPU in short order) + * don't trigger a preemption, if the woken up task will run on + * this cpu. (in this case the 'I will reschedule' promise of + * the waker guarantees that the freshly woken up task is going + * to be considered on this CPU.) 
+ */ + activate_task(p); + if (!sync) + preempt(p); + success = 1; + +out_running: + p->state = TASK_RUNNING; +out: + task_rq_unlock(&flags); + + return success; +} + +/* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. + */ +static void ms_sched_fork(task_t *p) +{ + /* + * We mark the process as running here, but have not actually + * inserted it onto the runqueue yet. This guarantees that + * nobody will actually run it, and a signal or other external + * event cannot wake it up and insert it on the runqueue either. + */ + p->state = TASK_RUNNING; + INIT_LIST_HEAD(&p->u.mssched.run_list); + spin_lock_init(&p->switch_lock); +#ifdef CONFIG_SCHEDSTATS + memset(&p->sched_info, 0, sizeof(p->sched_info)); +#endif +#ifdef CONFIG_PREEMPT + /* + * During context-switch we hold precisely one spinlock, which + * schedule_tail drops. (in the common case it's rq->lock, + * but it also can be p->switch_lock.) So we compensate with a count + * of 1. Also, we want to start with kernel preemption disabled. + */ + p->thread_info->preempt_count = 1; +#endif +} + +/* + * wake_up_new_task - wake up a newly created task for the first time. + * + * This function will do some initial scheduler statistics housekeeping + * that must be done for every newly created context, then puts the task + * on the runqueue and wakes it. + */ +static void ms_wake_up_new_task(task_t * p, unsigned long clone_flags) +{ + unsigned long flags; + + task_rq_lock(&flags); + + BUG_ON(p->state != TASK_RUNNING); + + __activate_task(p); + task_rq_unlock(&flags); +} + +static void ms_sched_exit(task_t * p) +{ +} + +/** + * finish_task_switch - clean up after a task-switch + * @prev: the thread we just switched away from. + * + * We enter this with the runqueue still locked, and finish_arch_switch() + * will unlock it along with doing any other architecture-specific cleanup + * actions. + * + * Note that we may have delayed dropping an mm in context_switch(). 
If + * so, we finish that here outside of the runqueue lock. (Doing it + * with the lock held can cause deadlocks; see schedule() for + * details.) + */ +static void finish_task_switch(task_t *prev) +{ + struct mm_struct *mm = rq->prev_mm; + unsigned long prev_task_flags; + + rq->prev_mm = NULL; + + /* + * A task struct has one reference for the use as "current". + * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and + * calls schedule one last time. The schedule call will never return, + * and the scheduled task must drop that reference. + * The test for EXIT_ZOMBIE must occur while the runqueue locks are + * still held, otherwise prev could be scheduled on another cpu, die + * there before we look at prev->state, and then the reference would + * be dropped twice. + * Manfred Spraul + */ + prev_task_flags = prev->flags; + finish_arch_switch(prev); + if (mm) + mmdrop(mm); + if (unlikely(prev_task_flags & PF_DEAD)) + put_task_struct(prev); +} + +/** + * schedule_tail - first thing a freshly forked thread must call. + * @prev: the thread we just switched away from. + */ +static void ms_schedule_tail(task_t *prev) +{ + finish_task_switch(prev); + + if (current->set_child_tid) + put_user(current->pid, current->set_child_tid); +} + +/* + * context_switch - switch to the new MM and the new + * thread's register state. + */ +static inline +task_t * context_switch(task_t *prev, task_t *next) +{ + struct mm_struct *mm = next->mm; + struct mm_struct *oldmm = prev->active_mm; + + if (unlikely(!mm)) { + next->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, next); + } else + switch_mm(oldmm, mm, next); + + if (unlikely(!prev->mm)) { + prev->active_mm = NULL; + WARN_ON(rq->prev_mm); + rq->prev_mm = oldmm; + } + + /* Here we just switch the register state and the stack. 
*/ + switch_to(prev, next, prev); + + return prev; +} + +/* + * nr_running, nr_uninterruptible and nr_context_switches: + * + * externally visible scheduler statistics: current number of runnable + * threads, current number of uninterruptible-sleeping threads, total + * number of context switches performed since bootup. + */ +static unsigned long ms_nr_running(void) +{ + return rq->nr_running; +} + +static unsigned long ms_nr_uninterruptible(void) +{ + return rq->nr_uninterruptible; +} + +static unsigned long long ms_nr_context_switches(void) +{ + return rq->nr_switches; +} + +static unsigned long ms_nr_iowait(void) +{ + return atomic_read(&rq->nr_iowait); +} + +/* + * Do the virtual cpu time signal calculations. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in user space since the last update + */ +static void account_it_virt(struct task_struct * p, cputime_t cputime) +{ + cputime_t it_virt = p->it_virt_value; + + if (cputime_gt(it_virt, cputime_zero) && + cputime_gt(cputime, cputime_zero)) { + if (cputime_ge(cputime, it_virt)) { + it_virt = cputime_add(it_virt, p->it_virt_incr); + send_sig(SIGVTALRM, p, 1); + } + it_virt = cputime_sub(it_virt, cputime); + p->it_virt_value = it_virt; + } +} + +/* + * Do the virtual profiling signal calculations. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in user and kernel space since the last update + */ +static void account_it_prof(struct task_struct *p, cputime_t cputime) +{ + cputime_t it_prof = p->it_prof_value; + + if (cputime_gt(it_prof, cputime_zero) && + cputime_gt(cputime, cputime_zero)) { + if (cputime_ge(cputime, it_prof)) { + it_prof = cputime_add(it_prof, p->it_prof_incr); + send_sig(SIGPROF, p, 1); + } + it_prof = cputime_sub(it_prof, cputime); + p->it_prof_value = it_prof; + } +} + +/* + * Check if the process went over its cputime resource limit after + * some cpu time got added to utime/stime. 
+ * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in user and kernel space since the last update + */ +static void check_rlimit(struct task_struct *p, cputime_t cputime) +{ + cputime_t total, tmp; + + total = cputime_add(p->utime, p->stime); + tmp = jiffies_to_cputime(p->signal->rlim[RLIMIT_CPU].rlim_cur); + if (unlikely(cputime_gt(total, tmp))) { + /* Send SIGXCPU every second. */ + tmp = cputime_sub(total, cputime); + if (cputime_to_secs(tmp) < cputime_to_secs(total)) + send_sig(SIGXCPU, p, 1); + /* and SIGKILL when we go over max.. */ + tmp = jiffies_to_cputime(p->signal->rlim[RLIMIT_CPU].rlim_max); + if (cputime_gt(total, tmp)) + send_sig(SIGKILL, p, 1); + } +} + +/* + * Account user cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() + * @cputime: the cpu time spent in user space since the last update + */ +static void ms_account_user_time(struct task_struct *p, cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t tmp; + + p->utime = cputime_add(p->utime, cputime); + + /* Check for signals (SIGVTALRM, SIGPROF, SIGXCPU & SIGKILL). */ + if (likely(p->signal)) + check_rlimit(p, cputime); + account_it_virt(p, cputime); + account_it_prof(p, cputime); + + /* Add user time to cpustat. */ + tmp = cputime_to_cputime64(cputime); + if (TASK_NICE(p) > 0) + cpustat->nice = cputime64_add(cpustat->nice, tmp); + else + cpustat->user = cputime64_add(cpustat->user, tmp); +} + +/* + * Account system cpu time to a process. 
+ * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() + * @cputime: the cpu time spent in kernel space since the last update + */ +static void ms_account_system_time(struct task_struct *p, int hardirq_offset, + cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t tmp; + + p->stime = cputime_add(p->stime, cputime); + + /* Check for signals (SIGPROF, SIGXCPU & SIGKILL). */ + if (likely(p->signal)) + check_rlimit(p, cputime); + account_it_prof(p, cputime); + + /* Add system time to cpustat. */ + tmp = cputime_to_cputime64(cputime); + if (hardirq_count() - hardirq_offset) + cpustat->irq = cputime64_add(cpustat->irq, tmp); + else if (softirq_count()) + cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + else if (p != rq->idle) + cpustat->system = cputime64_add(cpustat->system, tmp); + else if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait = cputime64_add(cpustat->iowait, tmp); + else + cpustat->idle = cputime64_add(cpustat->idle, tmp); +} + +/* + * Account for involuntary wait time. + * @p: the process from which the cpu time has been stolen + * @steal: the cpu time spent in involuntary wait + */ +static void ms_account_steal_time(struct task_struct *p, cputime_t steal) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t steal64 = cputime_to_cputime64(steal); + + if (p == rq->idle) + cpustat->system = cputime64_add(cpustat->system, steal64); + else + cpustat->steal = cputime64_add(cpustat->steal, steal64); +} + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + */ +static void ms_scheduler_tick(void) +{ + task_t *p = current; + + if (p == rq->idle) + return; + + /* Task might have expired already, but not scheduled off yet */ + if (unlikely(!task_queued(p))) { + set_tsk_need_resched(p); + return; + } + + /* + * SCHED_FIFO tasks never run out of timeslice. 
+ */ + if (unlikely(p->policy == SCHED_FIFO)) + return; + + spin_lock(&rq->lock); + + if (!--p->u.mssched.time_slice) { + p->u.mssched.time_slice = task_timeslice(p); + set_tsk_need_resched(p); + requeue_task(p); + } + spin_unlock(&rq->lock); +} + +/* + * schedule() is the main scheduler function. + */ +static void __sched ms_schedule(void) +{ + long *switch_count; + task_t *prev, *next; + + struct list_head *queue; + int idx; + + /* + * Test if we are atomic. Since do_exit() needs to call into + * schedule() atomically, we ignore that path for now. + * Otherwise, whine if we are scheduling when we should not be. + */ + if (likely(!(current->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)))) { + if (unlikely(in_atomic())) { + printk(KERN_ERR "scheduling while atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), current->pid); + dump_stack(); + } + } + profile_hit(SCHED_PROFILING, __builtin_return_address(0)); + +need_resched: + preempt_disable(); + prev = current; + release_kernel_lock(prev); + +need_resched_nonpreemptible: + spin_lock_irq(&rq->lock); + + if (unlikely(current->flags & PF_DEAD)) + current->state = EXIT_DEAD; + /* + * if entering off of a kernel preemption go straight + * to picking the next task. 
+ */ + switch_count = &prev->nivcsw; + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + switch_count = &prev->nvcsw; + if (unlikely((prev->state & TASK_INTERRUPTIBLE) && + unlikely(signal_pending(prev)))) + prev->state = TASK_RUNNING; + else + deactivate_task(prev); + } + + if (unlikely(!rq->nr_running)) { + next = rq->idle; + goto switch_tasks; + } + + idx = sched_find_first_bit(rq->bitmap); + queue = rq->queue + idx; + next = list_entry(queue->next, task_t, u.mssched.run_list); + +switch_tasks: + prefetch(next); + clear_tsk_need_resched(prev); + rcu_qsctr_inc(task_cpu(prev)); + + if (likely(prev != next)) { + rq->nr_switches++; + rq->curr = next; + ++*switch_count; + + prepare_arch_switch(next); + prev = context_switch(prev, next); + barrier(); + + finish_task_switch(prev); + } else + spin_unlock_irq(&rq->lock); + + prev = current; + if (unlikely(reacquire_kernel_lock(prev) < 0)) + goto need_resched_nonpreemptible; + preempt_enable_no_resched(); + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + goto need_resched; +} + +static void __sched ms_wait_for_completion(struct completion *x) +{ + might_sleep(); + spin_lock_irq(&x->wait.lock); + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + wait.flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(&x->wait, &wait); + do { + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&x->wait.lock); + schedule(); + spin_lock_irq(&x->wait.lock); + } while (!x->done); + __remove_wait_queue(&x->wait, &wait); + } + x->done--; + spin_unlock_irq(&x->wait.lock); +} + +static void ms_set_user_nice(task_t *p, long nice) +{ + unsigned long flags; + + if (TASK_NICE(p) == nice || nice < -20 || nice > 19) + return; + /* + * We have to be careful, if called from sys_setpriority(), + * the task might be in the middle of scheduling on another CPU. 
+ */ + task_rq_lock(&flags); + /* + * The RT priorities are set via setscheduler(), but we still + * allow the 'normal' nice value to be set - but as expected + * it won't have any effect on scheduling until the task is + * not SCHED_NORMAL: + */ + if (rt_task(p)) { + p->static_prio = NICE_TO_PRIO(nice); + goto out_unlock; + } + + p->static_prio = NICE_TO_PRIO(nice); + +out_unlock: + task_rq_unlock(&flags); +} + +/** + * task_prio - return the priority value of a given task. + * @p: the task in question. + * + * This is the priority value as seen by users in /proc. + * RT tasks are offset by -200. Normal tasks are all 0. + */ +static int ms_task_prio(const task_t *p) +{ + return p->u.mssched.prio - MAX_RT_PRIO; +} + +/** + * task_nice - return the nice value of a given task. + * @p: the task in question. + */ +static int ms_task_nice(const task_t *p) +{ + return TASK_NICE(p); +} + +/** + * idle_cpu - is the cpu idle currently? + */ +static int ms_idle_cpu(int cpu) +{ + return rq->curr == rq->idle; +} + +/* Actually do priority change: must hold rq lock. */ +static void __setscheduler(struct task_struct *p, int policy, int prio) +{ + BUG_ON(task_queued(p)); + p->policy = policy; + p->rt_priority = prio; + if (policy != SCHED_NORMAL) + p->u.mssched.prio = MAX_RT_PRIO - 1 - p->rt_priority; + else + p->u.mssched.prio = MAX_RT_PRIO; +} + +/* + * setscheduler - change the scheduling policy and/or RT priority of a thread. + */ +static int ms_setscheduler(pid_t pid, int policy, struct sched_param __user *param) +{ + struct sched_param lp; + int retval = -EINVAL; + int queued, oldprio, oldpolicy = -1; + unsigned long flags; + task_t *p; + + if (!param || pid < 0) + goto out_nounlock; + + retval = -EFAULT; + if (copy_from_user(&lp, param, sizeof(struct sched_param))) + goto out_nounlock; + + /* + * We play safe to avoid deadlocks.
+ */ + read_lock_irq(&tasklist_lock); + + p = find_process_by_pid(pid); + + retval = -ESRCH; + if (!p) + goto out_unlock; +recheck: + /* double check policy once rq lock held */ + if (policy < 0) + policy = oldpolicy = p->policy; + else { + retval = -EINVAL; + if (policy != SCHED_FIFO && policy != SCHED_RR && + policy != SCHED_NORMAL) + goto out_unlock; + } + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. + */ + retval = -EINVAL; + if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1) + goto out_unlock; + if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0)) + goto out_unlock; + + retval = -EPERM; + if ((policy == SCHED_FIFO || policy == SCHED_RR) && + !capable(CAP_SYS_NICE)) + goto out_unlock; + if ((current->euid != p->euid) && (current->euid != p->uid) && + !capable(CAP_SYS_NICE)) + goto out_unlock; + + retval = security_task_setscheduler(p, policy, &lp); + if (retval) + goto out_unlock; + /* + * To be able to change p->policy safely, the + * runqueue lock must be held. + */ + task_rq_lock(&flags); + /* recheck policy now with rq lock held */ + if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { + policy = oldpolicy = -1; + task_rq_unlock(&flags); + goto recheck; + } + if ((queued = task_queued(p))) + deactivate_task(p); + retval = 0; + oldprio = p->u.mssched.prio; + __setscheduler(p, policy, lp.sched_priority); + if (queued) { + __activate_task(p); + /* + * Reschedule if we are currently running and + * our priority decreased, or if we are not currently running + * and our priority is higher than the current's + */ + if (task_running(p)) { + if (p->u.mssched.prio > oldprio) + resched_task(rq->curr); + } else + preempt(p); + } + task_rq_unlock(&flags); +out_unlock: + read_unlock_irq(&tasklist_lock); +out_nounlock: + return retval; +} + +/** + * sys_sched_yield - yield the current processor to other threads. 
+ * + * This function yields the current CPU by dropping to end of the runqueue. + */ +static long ms_sys_sched_yield(void) +{ + task_t *p = current; + rq_lock(); + + set_tsk_need_resched(p); + requeue_task(current); + current->u.mssched.time_slice = task_timeslice(current); + + /* + * Since we are going to call schedule() anyway, there's + * no need to preempt or enable interrupts: + */ + _raw_spin_unlock(&rq->lock); + preempt_enable_no_resched(); + + schedule(); + + return 0; +} + +/* + * This task is about to go to sleep on IO. Increment rq->nr_iowait so + * that process accounting knows that this is a task in IO wait state. + * + * But don't do that if it is a deliberate, throttling IO wait (this task + * has set its backing_dev_info: the queue against which it should throttle) + */ +static void __sched ms_io_schedule(void) +{ + atomic_inc(&rq->nr_iowait); + schedule(); + atomic_dec(&rq->nr_iowait); +} + +static long __sched ms_io_schedule_timeout(long timeout) +{ + long ret; + + atomic_inc(&rq->nr_iowait); + ret = schedule_timeout(timeout); + atomic_dec(&rq->nr_iowait); + return ret; +} + +/** + * sys_sched_rr_get_interval - return the default timeslice of a process. + * @pid: pid of the process. + * @interval: userspace pointer to the timeslice value. + * + * this syscall writes the default timeslice value of a given process + * into the user-space timespec buffer. A value of '0' means infinity. + */ +static long +ms_sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) +{ + int retval = -EINVAL; + struct timespec t; + task_t *p; + + if (pid < 0) + goto out_nounlock; + + retval = -ESRCH; + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + jiffies_to_timespec(p->policy & SCHED_FIFO ? + 0 : task_timeslice(p), &t); + read_unlock(&tasklist_lock); + retval = copy_to_user(interval, &t, sizeof(t)) ? 
-EFAULT : 0; +out_nounlock: + return retval; +out_unlock: + read_unlock(&tasklist_lock); + return retval; +} + +static void __devinit ms_init_idle(task_t *idle, int cpu) +{ + unsigned long flags; + + idle->u.mssched.prio = MAX_RT_PRIO + 1; + idle->state = TASK_RUNNING; + set_task_cpu(idle, cpu); + + spin_lock_irqsave(&rq->lock, flags); + rq->curr = rq->idle = idle; + set_tsk_need_resched(idle); + spin_unlock_irqrestore(&rq->lock, flags); + + /* Set the preempt count _outside_ the spinlocks! */ +#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) + idle->thread_info->preempt_count = (idle->lock_depth >= 0); +#else + idle->thread_info->preempt_count = 0; +#endif +} + +static void __init ms_sched_init_smp(void) +{ +} + +static void __init ms_sched_init(void) +{ + int i; + + init_task.u.mssched.prio = MAX_RT_PRIO; + init_task.static_prio = MAX_RT_PRIO + 20; + INIT_LIST_HEAD(&init_task.u.mssched.run_list); + init_task.u.mssched.time_slice = HZ; + + spin_lock_init(&rq->lock); + + atomic_set(&rq->nr_iowait, 0); + for (i = 0; i <= MAX_PRIO; i++) + INIT_LIST_HEAD(&rq->queue[i]); + memset(rq->bitmap, 0, BITS_TO_LONGS(MAX_PRIO + 1)*sizeof(long)); + /* + * delimiter for bitsearch + */ + __set_bit(MAX_PRIO + 1, rq->bitmap); + + /* + * The boot idle thread does lazy MMU switching as well: + */ + atomic_inc(&init_mm.mm_count); + enter_lazy_tlb(&init_mm, current); + + /* + * Make us the idle thread. Technically, schedule() should not be + * called from this thread, however somewhere below it might be, + * but because we are the idle thread, we just pick up running again + * when this runqueue becomes "idle". 
+ */ + init_idle(current, 0); +} + +static void ms_init_sched_domain_sysctl(void) +{ +} +static void ms_destroy_sched_domain_sysctl(void) +{ +} + +#ifdef CONFIG_MAGIC_SYSRQ +void ms_normalise_rt_tasks(void) +{ + struct task_struct *p; + unsigned long flags; + int queued; + + read_lock_irq(&tasklist_lock); + for_each_process (p) { + if (!rt_task(p)) + continue; + + task_rq_lock(&flags); + + if ((queued = task_queued(p))) + deactivate_task(p); + __setscheduler(p, SCHED_NORMAL, 0); + if (queued) { + __activate_task(p); + resched_task(rq->curr); + } + + task_rq_unlock(&flags); + } + read_unlock_irq(&tasklist_lock); +} +#endif + +#ifdef CONFIG_KGDB +static struct task_struct *ms_kgdb_get_idle(int this_cpu) +{ + return rq->idle; +} +#endif + +struct sched_drv ms_sched_drv = { + .task_cpu = common_task_cpu, + .set_task_cpu = common_set_task_cpu, + .init_sched_domain_sysctl = ms_init_sched_domain_sysctl, + .destroy_sched_domain_sysctl = ms_destroy_sched_domain_sysctl, + .account_steal_time = ms_account_steal_time, + .account_system_time = ms_account_system_time, + .account_user_time = ms_account_user_time, + .cpusched_name = "minisched", + .rt_task = ms_rt_task, + .wait_for_completion = ms_wait_for_completion, + .io_schedule = ms_io_schedule, + .io_schedule_timeout = ms_io_schedule_timeout, + .set_oom_timeslice = ms_set_oom_timeslice, + .nr_running = ms_nr_running, + .nr_uninterruptible = ms_nr_uninterruptible, + .nr_context_switches = ms_nr_context_switches, + .nr_iowait = ms_nr_iowait, + .idle_cpu = ms_idle_cpu, + .init_idle = ms_init_idle, + .exit = ms_sched_exit, + .fork = ms_sched_fork, + .init = ms_sched_init, + .init_smp = ms_sched_init_smp, + .schedule = ms_schedule, + .tick = ms_scheduler_tick, + .tail = ms_schedule_tail, + .setscheduler = ms_setscheduler, + .set_user_nice = ms_set_user_nice, + .rr_get_interval = ms_sys_sched_rr_get_interval, + .yield = ms_sys_sched_yield, + .task_curr = ms_task_curr, + .task_nice = ms_task_nice, + .task_prio = ms_task_prio, + 
.try_to_wake_up = ms_try_to_wake_up, + .wake_up_new_task = ms_wake_up_new_task, +#ifdef CONFIG_SCHEDSTATS + .show_schedstat = ms_show_schedstat, +#endif +#ifdef CONFIG_MAGIC_SYSRQ + .normalize_rt_tasks = ms_normalise_rt_tasks, +#endif +#ifdef CONFIG_KGDB + .kgdb_get_idle = ms_kgdb_get_idle, +#endif +}; Index: linux-2.6.10-rc1-mm4-plugsched/kernel/sched.c =================================================================== --- linux-2.6.10-rc1-mm4-plugsched.orig/kernel/sched.c 2004-11-10 09:36:34.000000000 +1100 +++ linux-2.6.10-rc1-mm4-plugsched/kernel/sched.c 2004-11-10 10:01:20.000000000 +1100 @@ -1,7 +1,7 @@ /* * kernel/sched.c * - * Kernel scheduler and related syscalls + * This is "ingosched"; the default cpu scheduler. * * Copyright (C) 1991-2002 Linus Torvalds * @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -58,6 +59,15 @@ #endif /* + * Priority of a process goes from 0..MAX_PRIO-1, valid RT + * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL tasks are + * in the range MAX_RT_PRIO..MAX_PRIO-1. Priority values + * are inverted: lower p->prio value means higher priority. + */ + +#define MAX_PRIO (MAX_RT_PRIO + 40) + +/* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], * and back. @@ -130,7 +140,7 @@ */ #define CURRENT_BONUS(p) \ - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ + (NS_TO_JIFFIES((p)->u.ingosched.sleep_avg) * MAX_BONUS / \ MAX_SLEEP_AVG) #define GRANULARITY (10 * HZ / 1000 ? 
: 1) @@ -151,14 +161,14 @@ (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) #define TASK_INTERACTIVE(p) \ - ((p)->prio <= (p)->static_prio - DELTA(p)) + ((p)->u.ingosched.prio <= (p)->static_prio - DELTA(p)) #define INTERACTIVE_SLEEP(p) \ (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) #define TASK_PREEMPTS_CURR(p, rq) \ - ((p)->prio < (rq)->curr->prio) + ((p)->u.ingosched.prio < (rq)->curr->u.ingosched.prio) /* * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] @@ -179,7 +189,7 @@ static unsigned int task_timeslice(task_ else return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); } -#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ +#define task_hot(p, now, sd) ((long long) ((now) - (p)->u.ingosched.last_ran) \ < (long long) (sd)->cache_hot_time) /* @@ -287,6 +297,11 @@ static DEFINE_PER_CPU(struct runqueue, r #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +static int ingo_rt_task(task_t *p) +{ + return (unlikely((p)->u.ingosched.prio < MAX_RT_PRIO)); +} + /* * Default context-switch locking: */ @@ -330,7 +345,7 @@ static inline void task_rq_unlock(runque */ #define SCHEDSTAT_VERSION 10 -static int show_schedstat(struct seq_file *seq, void *v) +static int ingo_show_schedstat(struct seq_file *seq, void *v) { int cpu; enum idle_type itype; @@ -389,32 +404,6 @@ static int show_schedstat(struct seq_fil return 0; } -static int schedstat_open(struct inode *inode, struct file *file) -{ - unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); - char *buf = kmalloc(size, GFP_KERNEL); - struct seq_file *m; - int res; - - if (!buf) - return -ENOMEM; - res = single_open(file, show_schedstat, NULL); - if (!res) { - m = file->private_data; - m->buf = buf; - m->size = size; - } else - kfree(buf); - return res; -} - -struct file_operations proc_schedstat_operations = { - .open = schedstat_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - 
# define schedstat_inc(rq, field) rq->field++; # define schedstat_add(rq, field, amt) rq->field += amt; #else /* !CONFIG_SCHEDSTATS */ @@ -571,18 +560,18 @@ static inline void sched_info_switch(tas static void dequeue_task(struct task_struct *p, prio_array_t *array) { array->nr_active--; - list_del(&p->run_list); - if (list_empty(array->queue + p->prio)) - __clear_bit(p->prio, array->bitmap); + list_del(&p->u.ingosched.run_list); + if (list_empty(array->queue + p->u.ingosched.prio)) + __clear_bit(p->u.ingosched.prio, array->bitmap); } static void enqueue_task(struct task_struct *p, prio_array_t *array) { sched_info_queued(p); - list_add_tail(&p->run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); + list_add_tail(&p->u.ingosched.run_list, array->queue + p->u.ingosched.prio); + __set_bit(p->u.ingosched.prio, array->bitmap); array->nr_active++; - p->array = array; + p->u.ingosched.array = array; } /* @@ -591,7 +580,7 @@ static void enqueue_task(struct task_str */ static void requeue_task(struct task_struct *p, prio_array_t *array) { - list_move_tail(&p->run_list, array->queue + p->prio); + list_move_tail(&p->u.ingosched.run_list, array->queue + p->u.ingosched.prio); } /* @@ -601,10 +590,15 @@ static void requeue_task(struct task_str */ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) { - list_add(&p->run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); + list_add(&p->u.ingosched.run_list, array->queue + p->u.ingosched.prio); + __set_bit(p->u.ingosched.prio, array->bitmap); array->nr_active++; - p->array = array; + p->u.ingosched.array = array; +} + +static void ingo_set_oom_timeslice(task_t *p) +{ + p->u.ingosched.time_slice = HZ; } /* @@ -626,7 +620,7 @@ static int effective_prio(task_t *p) int bonus, prio; if (rt_task(p)) - return p->prio; + return p->u.ingosched.prio; bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; @@ -658,7 +652,7 @@ static inline void __activate_idle_task( static void 
recalc_task_prio(task_t *p, unsigned long long now) { - unsigned long long __sleep_time = now - p->timestamp; + unsigned long long __sleep_time = now - p->u.ingosched.timestamp; unsigned long sleep_time; if (__sleep_time > NS_MAX_SLEEP_AVG) @@ -673,9 +667,9 @@ static void recalc_task_prio(task_t *p, * prevent them suddenly becoming cpu hogs and starving * other processes. */ - if (p->mm && p->activated != -1 && + if (p->mm && p->u.ingosched.activated != -1 && sleep_time > INTERACTIVE_SLEEP(p)) { - p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - + p->u.ingosched.sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - DEF_TIMESLICE); } else { /* @@ -689,12 +683,12 @@ static void recalc_task_prio(task_t *p, * limited in their sleep_avg rise as they * are likely to be waiting on I/O */ - if (p->activated == -1 && p->mm) { - if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) + if (p->u.ingosched.activated == -1 && p->mm) { + if (p->u.ingosched.sleep_avg >= INTERACTIVE_SLEEP(p)) sleep_time = 0; - else if (p->sleep_avg + sleep_time >= + else if (p->u.ingosched.sleep_avg + sleep_time >= INTERACTIVE_SLEEP(p)) { - p->sleep_avg = INTERACTIVE_SLEEP(p); + p->u.ingosched.sleep_avg = INTERACTIVE_SLEEP(p); sleep_time = 0; } } @@ -707,14 +701,14 @@ static void recalc_task_prio(task_t *p, * task spends sleeping, the higher the average gets - * and the higher the priority boost gets as well. */ - p->sleep_avg += sleep_time; + p->u.ingosched.sleep_avg += sleep_time; - if (p->sleep_avg > NS_MAX_SLEEP_AVG) - p->sleep_avg = NS_MAX_SLEEP_AVG; + if (p->u.ingosched.sleep_avg > NS_MAX_SLEEP_AVG) + p->u.ingosched.sleep_avg = NS_MAX_SLEEP_AVG; } } - p->prio = effective_prio(p); + p->u.ingosched.prio = effective_prio(p); } /* @@ -743,7 +737,7 @@ static void activate_task(task_t *p, run * This checks to make sure it's not an uninterruptible task * that is now waking up. */ - if (!p->activated) { + if (!p->u.ingosched.activated) { /* * Tasks which were woken up by interrupts (ie. 
hw events) * are most likely of interactive nature. So we give them @@ -752,16 +746,16 @@ static void activate_task(task_t *p, run * on a CPU, first time around: */ if (in_interrupt()) - p->activated = 2; + p->u.ingosched.activated = 2; else { /* * Normal first-time wakeups get a credit too for * on-runqueue time, but it will be weighted down: */ - p->activated = 1; + p->u.ingosched.activated = 1; } } - p->timestamp = now; + p->u.ingosched.timestamp = now; __activate_task(p, rq); } @@ -774,8 +768,8 @@ static void deactivate_task(struct task_ rq->nr_running--; if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; - dequeue_task(p, p->array); - p->array = NULL; + dequeue_task(p, p->u.ingosched.array); + p->u.ingosched.array = NULL; } /* @@ -811,7 +805,7 @@ static inline void resched_task(task_t * * task_curr - is this task currently executing on a CPU? * @p: the task in question. */ -inline int task_curr(const task_t *p) +static int ingo_task_curr(const task_t *p) { return cpu_curr(task_cpu(p)) == p; } @@ -848,7 +842,7 @@ static int migrate_task(task_t *p, int d * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. */ - if (!p->array && !task_running(rq, p)) { + if (!p->u.ingosched.array && !task_running(rq, p)) { set_task_cpu(p, dest_cpu); return 0; } @@ -870,7 +864,7 @@ static int migrate_task(task_t *p, int d * smp_call_function() if an IPI is sent by the same process we are * waiting to become inactive. */ -void wait_task_inactive(task_t * p) +static void ingo_wait_task_inactive(task_t * p) { unsigned long flags; runqueue_t *rq; @@ -879,7 +873,7 @@ void wait_task_inactive(task_t * p) repeat: rq = task_rq_lock(p, &flags); /* Must be off runqueue entirely, not preempted. */ - if (unlikely(p->array)) { + if (unlikely(p->u.ingosched.array)) { /* If it's preempted, we yield. It could be a while. 
*/ preempted = !task_running(rq, p); task_rq_unlock(rq, &flags); @@ -891,24 +885,6 @@ repeat: task_rq_unlock(rq, &flags); } -/*** - * kick_process - kick a running thread to enter/exit the kernel - * @p: the to-be-kicked thread - * - * Cause a process which is running on another CPU to enter - * kernel-mode, without any delay. (to get signals handled.) - */ -void kick_process(task_t *p) -{ - int cpu; - - preempt_disable(); - cpu = task_cpu(p); - if ((cpu != smp_processor_id()) && task_curr(p)) - smp_send_reschedule(cpu); - preempt_enable(); -} - /* * Return a low guess at the load of a migration-source cpu. * @@ -988,7 +964,7 @@ static inline int wake_idle(int cpu, tas * * returns failure only if the task is already active. */ -static int try_to_wake_up(task_t * p, unsigned int state, int sync) +static int ingo_try_to_wake_up(task_t * p, unsigned int state, int sync) { int cpu, this_cpu, success = 0; unsigned long flags; @@ -1006,7 +982,7 @@ static int try_to_wake_up(task_t * p, un if (!(old_state & state)) goto out; - if (p->array) + if (p->u.ingosched.array) goto out_running; cpu = task_cpu(p); @@ -1085,7 +1061,7 @@ out_set_cpu: old_state = p->state; if (!(old_state & state)) goto out; - if (p->array) + if (p->u.ingosched.array) goto out_running; this_cpu = smp_processor_id(); @@ -1100,7 +1076,7 @@ out_activate: * Tasks on involuntary sleep don't earn * sleep_avg beyond just interactive state. 
*/ - p->activated = -1; + p->u.ingosched.activated = -1; } /* @@ -1126,19 +1102,6 @@ out: return success; } -int fastcall wake_up_process(task_t * p) -{ - return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | - TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); -} - -EXPORT_SYMBOL(wake_up_process); - -int fastcall wake_up_state(task_t *p, unsigned int state) -{ - return try_to_wake_up(p, state, 0); -} - #ifdef CONFIG_SMP static int find_idlest_cpu(struct task_struct *p, int this_cpu, struct sched_domain *sd); @@ -1148,7 +1111,7 @@ static int find_idlest_cpu(struct task_s * Perform scheduler related setup for a newly forked process p. * p is forked by current. */ -void fastcall sched_fork(task_t *p) +static void ingo_sched_fork(task_t *p) { /* * We mark the process as running here, but have not actually @@ -1157,8 +1120,8 @@ void fastcall sched_fork(task_t *p) * event cannot wake it up and insert it on the runqueue either. */ p->state = TASK_RUNNING; - INIT_LIST_HEAD(&p->run_list); - p->array = NULL; + INIT_LIST_HEAD(&p->u.ingosched.run_list); + p->u.ingosched.array = NULL; spin_lock_init(&p->switch_lock); #ifdef CONFIG_SCHEDSTATS memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -1178,21 +1141,21 @@ void fastcall sched_fork(task_t *p) * resulting in more scheduling fairness. */ local_irq_disable(); - p->time_slice = (current->time_slice + 1) >> 1; + p->u.ingosched.time_slice = (current->u.ingosched.time_slice + 1) >> 1; /* * The remainder of the first timeslice might be recovered by * the parent if the child exits early enough. */ - p->first_time_slice = 1; - current->time_slice >>= 1; - p->timestamp = sched_clock(); - if (unlikely(!current->time_slice)) { + p->u.ingosched.first_time_slice = 1; + current->u.ingosched.time_slice >>= 1; + p->u.ingosched.timestamp = sched_clock(); + if (unlikely(!current->u.ingosched.time_slice)) { /* * This case is rare, it happens when the parent has only * a single jiffy left from its timeslice. 
Taking the * runqueue lock is not a problem. */ - current->time_slice = 1; + current->u.ingosched.time_slice = 1; preempt_disable(); scheduler_tick(); local_irq_enable(); @@ -1208,7 +1171,7 @@ void fastcall sched_fork(task_t *p) * that must be done for every newly created context, then puts the task * on the runqueue and wakes it. */ -void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) +static void ingo_wake_up_new_task(task_t * p, unsigned long clone_flags) { unsigned long flags; int this_cpu, cpu; @@ -1227,10 +1190,10 @@ void fastcall wake_up_new_task(task_t * * from forking tasks that are max-interactive. The parent * (current) is done further down, under its lock. */ - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * + p->u.ingosched.sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - p->prio = effective_prio(p); + p->u.ingosched.prio = effective_prio(p); if (likely(cpu == this_cpu)) { if (!(clone_flags & CLONE_VM)) { @@ -1239,13 +1202,13 @@ void fastcall wake_up_new_task(task_t * * do child-runs-first in anticipation of an exec. This * usually avoids a lot of COW overhead. */ - if (unlikely(!current->array)) + if (unlikely(!current->u.ingosched.array)) __activate_task(p, rq); else { - p->prio = current->prio; - list_add_tail(&p->run_list, &current->run_list); - p->array = current->array; - p->array->nr_active++; + p->u.ingosched.prio = current->u.ingosched.prio; + list_add_tail(&p->u.ingosched.run_list, &current->u.ingosched.run_list); + p->u.ingosched.array = current->u.ingosched.array; + p->u.ingosched.array->nr_active++; rq->nr_running++; } set_need_resched(); @@ -1266,7 +1229,7 @@ void fastcall wake_up_new_task(task_t * * Not the local CPU - must adjust timestamp. This should * get optimised away in the !CONFIG_SMP case.
*/ - p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) + p->u.ingosched.timestamp = (p->u.ingosched.timestamp - this_rq->timestamp_last_tick) + rq->timestamp_last_tick; __activate_task(p, rq); if (TASK_PREEMPTS_CURR(p, rq)) @@ -1275,12 +1238,12 @@ void fastcall wake_up_new_task(task_t * schedstat_inc(rq, wunt_moved); /* * Parent and child are on different CPUs, now get the - * parent runqueue to update the parent's ->sleep_avg: + * parent runqueue to update the parent's ->u.ingosched.sleep_avg: */ task_rq_unlock(rq, &flags); this_rq = task_rq_lock(current, &flags); } - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * + current->u.ingosched.sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); task_rq_unlock(this_rq, &flags); } @@ -1294,7 +1257,7 @@ void fastcall wake_up_new_task(task_t * * artificially, because any timeslice recovered here * was given away by the parent in the first place.) */ -void fastcall sched_exit(task_t * p) +static void ingo_sched_exit(task_t * p) { unsigned long flags; runqueue_t *rq; @@ -1304,14 +1267,14 @@ void fastcall sched_exit(task_t * p) * the sleep_avg of the parent as well. 
*/ rq = task_rq_lock(p->parent, &flags); - if (p->first_time_slice) { - p->parent->time_slice += p->time_slice; - if (unlikely(p->parent->time_slice > task_timeslice(p))) - p->parent->time_slice = task_timeslice(p); - } - if (p->sleep_avg < p->parent->sleep_avg) - p->parent->sleep_avg = p->parent->sleep_avg / - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / + if (p->u.ingosched.first_time_slice) { + p->parent->u.ingosched.time_slice += p->u.ingosched.time_slice; + if (unlikely(p->parent->u.ingosched.time_slice > task_timeslice(p))) + p->parent->u.ingosched.time_slice = task_timeslice(p); + } + if (p->u.ingosched.sleep_avg < p->parent->u.ingosched.sleep_avg) + p->parent->u.ingosched.sleep_avg = p->parent->u.ingosched.sleep_avg / + (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->u.ingosched.sleep_avg / (EXIT_WEIGHT + 1); task_rq_unlock(rq, &flags); } @@ -1361,7 +1324,7 @@ static void finish_task_switch(task_t *p * schedule_tail - first thing a freshly forked thread must call. * @prev: the thread we just switched away from. */ -asmlinkage void schedule_tail(task_t *prev) +static void ingo_schedule_tail(task_t *prev) __releases(rq->lock) { finish_task_switch(prev); @@ -1406,7 +1369,7 @@ task_t * context_switch(runqueue_t *rq, * threads, current number of uninterruptible-sleeping threads, total * number of context switches performed since bootup. 
*/ -unsigned long nr_running(void) +static unsigned long ingo_nr_running(void) { unsigned long i, sum = 0; @@ -1416,7 +1379,7 @@ unsigned long nr_running(void) return sum; } -unsigned long nr_uninterruptible(void) +static unsigned long ingo_nr_uninterruptible(void) { unsigned long i, sum = 0; @@ -1426,7 +1389,7 @@ unsigned long nr_uninterruptible(void) return sum; } -unsigned long long nr_context_switches(void) +static unsigned long long ingo_nr_context_switches(void) { unsigned long long i, sum = 0; @@ -1436,7 +1399,7 @@ unsigned long long nr_context_switches(v return sum; } -unsigned long nr_iowait(void) +static unsigned long ingo_nr_iowait(void) { unsigned long i, sum = 0; @@ -1592,7 +1555,7 @@ out: * execve() is a valuable balancing opportunity, because at this point * the task has the smallest effective memory and cache footprint. */ -void sched_exec(void) +static void ingo_sched_exec(void) { struct sched_domain *tmp, *sd = NULL; int new_cpu, this_cpu = get_cpu(); @@ -1633,7 +1596,7 @@ void pull_task(runqueue_t *src_rq, prio_ set_task_cpu(p, this_cpu); this_rq->nr_running++; enqueue_task(p, this_array); - p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + p->u.ingosched.timestamp = (p->u.ingosched.timestamp - src_rq->timestamp_last_tick) + this_rq->timestamp_last_tick; /* * Note that idle threads have a prio of MAX_PRIO, for this test @@ -1729,7 +1692,7 @@ skip_bitmap: head = array->queue + idx; curr = head->prev; skip_queue: - tmp = list_entry(curr, task_t, run_list); + tmp = list_entry(curr, task_t, u.ingosched.run_list); curr = curr->prev; @@ -2235,10 +2198,6 @@ static inline int wake_priority_sleeper( return ret; } -DEFINE_PER_CPU(struct kernel_stat, kstat); - -EXPORT_PER_CPU_SYMBOL(kstat); - /* * We place interactive tasks back into the active array, if possible. 
* @@ -2325,7 +2284,7 @@ static void check_rlimit(struct task_str * @hardirq_offset: the offset to subtract from hardirq_count() * @cputime: the cpu time spent in user space since the last update */ -void account_user_time(struct task_struct *p, cputime_t cputime) +static void ingo_account_user_time(struct task_struct *p, cputime_t cputime) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; cputime64_t tmp; @@ -2352,7 +2311,7 @@ void account_user_time(struct task_struc * @hardirq_offset: the offset to subtract from hardirq_count() * @cputime: the cpu time spent in kernel space since the last update */ -void account_system_time(struct task_struct *p, int hardirq_offset, +static void ingo_account_system_time(struct task_struct *p, int hardirq_offset, cputime_t cputime) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; @@ -2385,7 +2344,7 @@ void account_system_time(struct task_str * @p: the process from which the cpu time has been stolen * @steal: the cpu time spent in involuntary wait */ -void account_steal_time(struct task_struct *p, cputime_t steal) +static void ingo_account_steal_time(struct task_struct *p, cputime_t steal) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; cputime64_t steal64 = cputime_to_cputime64(steal); @@ -2404,7 +2363,7 @@ void account_steal_time(struct task_stru * It also gets called by the fork code, when changing the parent's * timeslices. */ -void scheduler_tick(void) +static void ingo_scheduler_tick(void) { int cpu = smp_processor_id(); runqueue_t *rq = this_rq(); @@ -2420,7 +2379,7 @@ void scheduler_tick(void) } /* Task might have expired already, but not scheduled off yet */ - if (p->array != rq->active) { + if (p->u.ingosched.array != rq->active) { set_tsk_need_resched(p); goto out; } @@ -2437,9 +2396,9 @@ void scheduler_tick(void) * RR tasks need a special form of timeslice management. * FIFO tasks have no timeslices. 
*/ - if ((p->policy == SCHED_RR) && !--p->time_slice) { - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; + if ((p->policy == SCHED_RR) && !--p->u.ingosched.time_slice) { + p->u.ingosched.time_slice = task_timeslice(p); + p->u.ingosched.first_time_slice = 0; set_tsk_need_resched(p); /* put it at the end of the queue: */ @@ -2447,12 +2406,12 @@ void scheduler_tick(void) } goto out_unlock; } - if (!--p->time_slice) { + if (!--p->u.ingosched.time_slice) { dequeue_task(p, rq->active); set_tsk_need_resched(p); - p->prio = effective_prio(p); - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; + p->u.ingosched.prio = effective_prio(p); + p->u.ingosched.time_slice = task_timeslice(p); + p->u.ingosched.first_time_slice = 0; if (!rq->expired_timestamp) rq->expired_timestamp = jiffies; @@ -2480,9 +2439,9 @@ void scheduler_tick(void) * delta range with at least TIMESLICE_GRANULARITY to requeue. */ if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - - p->time_slice) % TIMESLICE_GRANULARITY(p)) && - (p->time_slice >= TIMESLICE_GRANULARITY(p)) && - (p->array == rq->active)) { + p->u.ingosched.time_slice) % TIMESLICE_GRANULARITY(p)) && + (p->u.ingosched.time_slice >= TIMESLICE_GRANULARITY(p)) && + (p->u.ingosched.array == rq->active)) { requeue_task(p, rq->active); set_tsk_need_resched(p); @@ -2573,7 +2532,7 @@ static inline int dependent_sleeper(int BUG_ON(!array->nr_active); p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, - task_t, run_list); + task_t, u.ingosched.run_list); for_each_cpu_mask(i, sibling_map) { runqueue_t *smt_rq = cpu_rq(i); @@ -2587,7 +2546,7 @@ static inline int dependent_sleeper(int * task from using an unfair proportion of the * physical cpu's resources. 
-ck */ - if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > + if (((smt_curr->u.ingosched.time_slice * (100 - sd->per_cpu_gain) / 100) > task_timeslice(p) || rt_task(smt_curr)) && p->mm && smt_curr->mm && !rt_task(p)) ret = 1; @@ -2597,7 +2556,7 @@ static inline int dependent_sleeper(int * or wake it up if it has been put to sleep for priority * reasons. */ - if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > + if ((((p->u.ingosched.time_slice * (100 - sd->per_cpu_gain) / 100) > task_timeslice(smt_curr) || rt_task(p)) && smt_curr->mm && p->mm && !rt_task(smt_curr)) || (smt_curr == smt_rq->idle && smt_rq->nr_running)) @@ -2619,42 +2578,10 @@ static inline int dependent_sleeper(int } #endif -#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) - -void fastcall add_preempt_count(int val) -{ - /* - * Underflow? - */ - BUG_ON(((int)preempt_count() < 0)); - preempt_count() += val; - /* - * Spinlock count overflowing soon? - */ - BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); -} -EXPORT_SYMBOL(add_preempt_count); - -void fastcall sub_preempt_count(int val) -{ - /* - * Underflow? - */ - BUG_ON(val > preempt_count()); - /* - * Is the spinlock portion underflowing? - */ - BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); - preempt_count() -= val; -} -EXPORT_SYMBOL(sub_preempt_count); - -#endif - /* * schedule() is the main scheduler function. 
*/ -asmlinkage void __sched schedule(void) +static void __sched ingo_schedule(void) { long *switch_count; task_t *prev, *next; @@ -2698,8 +2625,8 @@ need_resched_nonpreemptible: schedstat_inc(rq, sched_cnt); now = sched_clock(); - if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG)) - run_time = now - prev->timestamp; + if (likely(now - prev->u.ingosched.timestamp < NS_MAX_SLEEP_AVG)) + run_time = now - prev->u.ingosched.timestamp; else run_time = NS_MAX_SLEEP_AVG; @@ -2773,20 +2700,20 @@ go_idle: idx = sched_find_first_bit(array->bitmap); queue = array->queue + idx; - next = list_entry(queue->next, task_t, run_list); + next = list_entry(queue->next, task_t, u.ingosched.run_list); - if (!rt_task(next) && next->activated > 0) { - unsigned long long delta = now - next->timestamp; + if (!rt_task(next) && next->u.ingosched.activated > 0) { + unsigned long long delta = now - next->u.ingosched.timestamp; - if (next->activated == 1) + if (next->u.ingosched.activated == 1) delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; - array = next->array; + array = next->u.ingosched.array; dequeue_task(next, array); - recalc_task_prio(next, next->timestamp + delta); + recalc_task_prio(next, next->u.ingosched.timestamp + delta); enqueue_task(next, array); } - next->activated = 0; + next->u.ingosched.activated = 0; switch_tasks: if (next == rq->idle) schedstat_inc(rq, sched_goidle); @@ -2794,14 +2721,14 @@ switch_tasks: clear_tsk_need_resched(prev); rcu_qsctr_inc(task_cpu(prev)); - prev->sleep_avg -= run_time; - if ((long)prev->sleep_avg <= 0) - prev->sleep_avg = 0; - prev->timestamp = prev->last_ran = now; + prev->u.ingosched.sleep_avg -= run_time; + if ((long)prev->u.ingosched.sleep_avg <= 0) + prev->u.ingosched.sleep_avg = 0; + prev->u.ingosched.timestamp = prev->u.ingosched.last_ran = now; sched_info_switch(prev, next); if (likely(prev != next)) { - next->timestamp = now; + next->u.ingosched.timestamp = now; rq->nr_switches++; rq->curr = next; ++*switch_count; @@ -2822,169 
+2749,7 @@ switch_tasks: goto need_resched; } -EXPORT_SYMBOL(schedule); - -#ifdef CONFIG_PREEMPT -/* - * this is is the entry point to schedule() from in-kernel preemption - * off of preempt_enable. Kernel preemptions off return from interrupt - * occur there and call schedule directly. - */ -asmlinkage void __sched preempt_schedule(void) -{ - struct thread_info *ti = current_thread_info(); -#ifdef CONFIG_PREEMPT_BKL - struct task_struct *task = current; - int saved_lock_depth; -#endif - /* - * If there is a non-zero preempt_count or interrupts are disabled, - * we do not want to preempt the current task. Just return.. - */ - if (unlikely(ti->preempt_count || irqs_disabled())) - return; - -need_resched: - add_preempt_count(PREEMPT_ACTIVE); - /* - * We keep the big kernel semaphore locked, but we - * clear ->lock_depth so that schedule() doesnt - * auto-release the semaphore: - */ -#ifdef CONFIG_PREEMPT_BKL - saved_lock_depth = task->lock_depth; - task->lock_depth = -1; -#endif - schedule(); -#ifdef CONFIG_PREEMPT_BKL - task->lock_depth = saved_lock_depth; -#endif - sub_preempt_count(PREEMPT_ACTIVE); - - /* we could miss a preemption opportunity between schedule and now */ - barrier(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) - goto need_resched; -} - -EXPORT_SYMBOL(preempt_schedule); -#endif /* CONFIG_PREEMPT */ - -int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) -{ - task_t *p = curr->task; - return try_to_wake_up(p, mode, sync); -} - -EXPORT_SYMBOL(default_wake_function); - -/* - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just - * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve - * number) then we wake all the non-exclusive tasks and one exclusive task. - * - * There are circumstances in which we can try to wake a task which has already - * started to run but is not in state TASK_RUNNING. 
try_to_wake_up() returns - * zero in this (rare) case, and we handle it by continuing to scan the queue. - */ -static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int sync, void *key) -{ - struct list_head *tmp, *next; - - list_for_each_safe(tmp, next, &q->task_list) { - wait_queue_t *curr; - unsigned flags; - curr = list_entry(tmp, wait_queue_t, task_list); - flags = curr->flags; - if (curr->func(curr, mode, sync, key) && - (flags & WQ_FLAG_EXCLUSIVE) && - !--nr_exclusive) - break; - } -} - -/** - * __wake_up - wake up threads blocked on a waitqueue. - * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - */ -void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, void *key) -{ - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, 0, key); - spin_unlock_irqrestore(&q->lock, flags); -} - -EXPORT_SYMBOL(__wake_up); - -/* - * Same as __wake_up but called with the spinlock in wait_queue_head_t held. - */ -void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) -{ - __wake_up_common(q, mode, 1, 0, NULL); -} - -/** - * __wake_up - sync- wake up threads blocked on a waitqueue. - * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - * - * The sync wakeup differs that the waker knows that it will schedule - * away soon, so while the target thread will be woken up, it will not - * be migrated to another CPU - ie. the two threads are 'synchronized' - * with each other. This can prevent needless bouncing between CPUs. - * - * On UP it can prevent extra preemption. 
- */ -void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) -{ - unsigned long flags; - int sync = 1; - - if (unlikely(!q)) - return; - - if (unlikely(!nr_exclusive)) - sync = 0; - - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, sync, NULL); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ - -void fastcall complete(struct completion *x) -{ - unsigned long flags; - - spin_lock_irqsave(&x->wait.lock, flags); - x->done++; - __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, - 1, 0, NULL); - spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete); - -void fastcall complete_all(struct completion *x) -{ - unsigned long flags; - - spin_lock_irqsave(&x->wait.lock, flags); - x->done += UINT_MAX/2; - __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, - 0, 0, NULL); - spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete_all); - -void fastcall __sched wait_for_completion(struct completion *x) +static void __sched ingo_wait_for_completion(struct completion *x) { might_sleep(); spin_lock_irq(&x->wait.lock); @@ -3004,80 +2769,8 @@ void fastcall __sched wait_for_completio x->done--; spin_unlock_irq(&x->wait.lock); } -EXPORT_SYMBOL(wait_for_completion); - -#define SLEEP_ON_VAR \ - unsigned long flags; \ - wait_queue_t wait; \ - init_waitqueue_entry(&wait, current); - -#define SLEEP_ON_HEAD \ - spin_lock_irqsave(&q->lock,flags); \ - __add_wait_queue(q, &wait); \ - spin_unlock(&q->lock); -#define SLEEP_ON_TAIL \ - spin_lock_irq(&q->lock); \ - __remove_wait_queue(q, &wait); \ - spin_unlock_irqrestore(&q->lock, flags); - -void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) -{ - SLEEP_ON_VAR - - current->state = TASK_INTERRUPTIBLE; - - SLEEP_ON_HEAD - schedule(); - SLEEP_ON_TAIL -} - -EXPORT_SYMBOL(interruptible_sleep_on); - -long fastcall __sched 
interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ - SLEEP_ON_VAR - - current->state = TASK_INTERRUPTIBLE; - - SLEEP_ON_HEAD - timeout = schedule_timeout(timeout); - SLEEP_ON_TAIL - - return timeout; -} - -EXPORT_SYMBOL(interruptible_sleep_on_timeout); - -void fastcall __sched sleep_on(wait_queue_head_t *q) -{ - SLEEP_ON_VAR - - current->state = TASK_UNINTERRUPTIBLE; - - SLEEP_ON_HEAD - schedule(); - SLEEP_ON_TAIL -} - -EXPORT_SYMBOL(sleep_on); - -long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ - SLEEP_ON_VAR - - current->state = TASK_UNINTERRUPTIBLE; - - SLEEP_ON_HEAD - timeout = schedule_timeout(timeout); - SLEEP_ON_TAIL - - return timeout; -} - -EXPORT_SYMBOL(sleep_on_timeout); - -void set_user_nice(task_t *p, long nice) +static void ingo_set_user_nice(task_t *p, long nice) { unsigned long flags; prio_array_t *array; @@ -3101,15 +2794,15 @@ void set_user_nice(task_t *p, long nice) p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } - array = p->array; + array = p->u.ingosched.array; if (array) dequeue_task(p, array); - old_prio = p->prio; + old_prio = p->u.ingosched.prio; new_prio = NICE_TO_PRIO(nice); delta = new_prio - old_prio; p->static_prio = NICE_TO_PRIO(nice); - p->prio += delta; + p->u.ingosched.prio += delta; if (array) { enqueue_task(p, array); @@ -3124,59 +2817,13 @@ out_unlock: task_rq_unlock(rq, &flags); } -EXPORT_SYMBOL(set_user_nice); - #ifdef CONFIG_KGDB -struct task_struct *kgdb_get_idle(int this_cpu) +static struct task_struct *ingo_kgdb_get_idle(int this_cpu) { return cpu_rq(this_cpu)->idle; } #endif -#ifdef __ARCH_WANT_SYS_NICE - -/* - * sys_nice - change the priority of the current process. - * @increment: priority increment - * - * sys_setpriority is a more generic, but much slower function that - * does similar things. - */ -asmlinkage long sys_nice(int increment) -{ - int retval; - long nice; - - /* - * Setpriority might change our priority at the same moment. 
- * We don't have to worry. Conceptually one call occurs first - * and we have a single winner. - */ - if (increment < 0) { - if (!capable(CAP_SYS_NICE)) - return -EPERM; - if (increment < -40) - increment = -40; - } - if (increment > 40) - increment = 40; - - nice = PRIO_TO_NICE(current->static_prio) + increment; - if (nice < -20) - nice = -20; - if (nice > 19) - nice = 19; - - retval = security_task_setnice(current, nice); - if (retval) - return retval; - - set_user_nice(current, nice); - return 0; -} - -#endif - /** * task_prio - return the priority value of a given task. * @p: the task in question. @@ -3185,16 +2832,16 @@ asmlinkage long sys_nice(int increment) * RT tasks are offset by -200. Normal tasks are centered * around 0, value goes from -16 to +15. */ -int task_prio(const task_t *p) +static int ingo_task_prio(const task_t *p) { - return p->prio - MAX_RT_PRIO; + return p->u.ingosched.prio - MAX_RT_PRIO; } /** * task_nice - return the nice value of a given task. * @p: the task in question. */ -int task_nice(const task_t *p) +static int ingo_task_nice(const task_t *p) { return TASK_NICE(p); } @@ -3203,38 +2850,27 @@ int task_nice(const task_t *p) * idle_cpu - is a given cpu idle currently? * @cpu: the processor in question. */ -int idle_cpu(int cpu) +static int ingo_idle_cpu(int cpu) { return cpu_curr(cpu) == cpu_rq(cpu)->idle; } -EXPORT_SYMBOL_GPL(idle_cpu); - -/** - * find_process_by_pid - find a process with a matching PID value. - * @pid: the pid in question. - */ -static inline task_t *find_process_by_pid(pid_t pid) -{ - return pid ? find_task_by_pid(pid) : current; -} - /* Actually do priority change: must hold rq lock. 
*/ static void __setscheduler(struct task_struct *p, int policy, int prio) { - BUG_ON(p->array); + BUG_ON(p->u.ingosched.array); p->policy = policy; p->rt_priority = prio; if (policy != SCHED_NORMAL) - p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority; + p->u.ingosched.prio = MAX_USER_RT_PRIO-1 - p->rt_priority; else - p->prio = p->static_prio; + p->u.ingosched.prio = p->static_prio; } /* * setscheduler - change the scheduling policy and/or RT priority of a thread. */ -static int setscheduler(pid_t pid, int policy, struct sched_param __user *param) +static int ingo_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { struct sched_param lp; int retval = -EINVAL; @@ -3303,11 +2939,11 @@ recheck: task_rq_unlock(rq, &flags); goto recheck; } - array = p->array; + array = p->u.ingosched.array; if (array) deactivate_task(p, task_rq(p)); retval = 0; - oldprio = p->prio; + oldprio = p->u.ingosched.prio; __setscheduler(p, policy, lp.sched_priority); if (array) { __activate_task(p, task_rq(p)); @@ -3317,7 +2953,7 @@ recheck: * this runqueue and our priority is higher than the current's */ if (task_running(rq, p)) { - if (p->prio > oldprio) + if (p->u.ingosched.prio > oldprio) resched_task(rq->curr); } else if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); @@ -3330,238 +2966,16 @@ out_nounlock: } /** - * sys_sched_setscheduler - set/change the scheduler policy and RT priority - * @pid: the pid in question. - * @policy: new policy - * @param: structure containing the new RT priority. - */ -asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, - struct sched_param __user *param) -{ - return setscheduler(pid, policy, param); -} - -/** - * sys_sched_setparam - set/change the RT priority of a thread - * @pid: the pid in question. - * @param: structure containing the new RT priority. 
- */ -asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) -{ - return setscheduler(pid, -1, param); -} - -/** - * sys_sched_getscheduler - get the policy (scheduling class) of a thread - * @pid: the pid in question. - */ -asmlinkage long sys_sched_getscheduler(pid_t pid) -{ - int retval = -EINVAL; - task_t *p; - - if (pid < 0) - goto out_nounlock; - - retval = -ESRCH; - read_lock(&tasklist_lock); - p = find_process_by_pid(pid); - if (p) { - retval = security_task_getscheduler(p); - if (!retval) - retval = p->policy; - } - read_unlock(&tasklist_lock); - -out_nounlock: - return retval; -} - -/** - * sys_sched_getscheduler - get the RT priority of a thread - * @pid: the pid in question. - * @param: structure containing the RT priority. - */ -asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) -{ - struct sched_param lp; - int retval = -EINVAL; - task_t *p; - - if (!param || pid < 0) - goto out_nounlock; - - read_lock(&tasklist_lock); - p = find_process_by_pid(pid); - retval = -ESRCH; - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - lp.sched_priority = p->rt_priority; - read_unlock(&tasklist_lock); - - /* - * This one might sleep, we cannot do it with a spinlock held ... - */ - retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; - -out_nounlock: - return retval; - -out_unlock: - read_unlock(&tasklist_lock); - return retval; -} - -long sched_setaffinity(pid_t pid, cpumask_t new_mask) -{ - task_t *p; - int retval; - cpumask_t cpus_allowed; - - lock_cpu_hotplug(); - read_lock(&tasklist_lock); - - p = find_process_by_pid(pid); - if (!p) { - read_unlock(&tasklist_lock); - unlock_cpu_hotplug(); - return -ESRCH; - } - - /* - * It is not safe to call set_cpus_allowed with the - * tasklist_lock held. We will bump the task_struct's - * usage count and then drop tasklist_lock. 
- */ - get_task_struct(p); - read_unlock(&tasklist_lock); - - retval = -EPERM; - if ((current->euid != p->euid) && (current->euid != p->uid) && - !capable(CAP_SYS_NICE)) - goto out_unlock; - - cpus_allowed = cpuset_cpus_allowed(p); - cpus_and(new_mask, new_mask, cpus_allowed); - retval = set_cpus_allowed(p, new_mask); - -out_unlock: - put_task_struct(p); - unlock_cpu_hotplug(); - return retval; -} - -static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, - cpumask_t *new_mask) -{ - if (len < sizeof(cpumask_t)) { - memset(new_mask, 0, sizeof(cpumask_t)); - } else if (len > sizeof(cpumask_t)) { - len = sizeof(cpumask_t); - } - return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; -} - -/** - * sys_sched_setaffinity - set the cpu affinity of a process - * @pid: pid of the process - * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to the new cpu mask - */ -asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, - unsigned long __user *user_mask_ptr) -{ - cpumask_t new_mask; - int retval; - - retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); - if (retval) - return retval; - - return sched_setaffinity(pid, new_mask); -} - -/* - * Represents all cpu's present in the system - * In systems capable of hotplug, this map could dynamically grow - * as new cpu's are detected in the system via any platform specific - * method, such as ACPI for e.g. 
- */ - -cpumask_t cpu_present_map; -EXPORT_SYMBOL(cpu_present_map); - -#ifndef CONFIG_SMP -cpumask_t cpu_online_map = CPU_MASK_ALL; -cpumask_t cpu_possible_map = CPU_MASK_ALL; -#endif - -long sched_getaffinity(pid_t pid, cpumask_t *mask) -{ - int retval; - task_t *p; - - lock_cpu_hotplug(); - read_lock(&tasklist_lock); - - retval = -ESRCH; - p = find_process_by_pid(pid); - if (!p) - goto out_unlock; - - retval = 0; - cpus_and(*mask, p->cpus_allowed, cpu_possible_map); - -out_unlock: - read_unlock(&tasklist_lock); - unlock_cpu_hotplug(); - if (retval) - return retval; - - return 0; -} - -/** - * sys_sched_getaffinity - get the cpu affinity of a process - * @pid: pid of the process - * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to hold the current cpu mask - */ -asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, - unsigned long __user *user_mask_ptr) -{ - int ret; - cpumask_t mask; - - if (len < sizeof(cpumask_t)) - return -EINVAL; - - ret = sched_getaffinity(pid, &mask); - if (ret < 0) - return ret; - - if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) - return -EFAULT; - - return sizeof(cpumask_t); -} - -/** * sys_sched_yield - yield the current processor to other threads. * * this function yields the current CPU by moving the calling thread * to the expired array. If there are no other threads running on this * CPU then this function will return. 
*/ -asmlinkage long sys_sched_yield(void) +static long ingo_sys_sched_yield(void) { runqueue_t *rq = this_rq_lock(); - prio_array_t *array = current->array; + prio_array_t *array = current->u.ingosched.array; prio_array_t *target = rq->expired; schedstat_inc(rq, yld_cnt); @@ -3575,7 +2989,7 @@ asmlinkage long sys_sched_yield(void) if (rt_task(current)) target = rq->active; - if (current->array->nr_active == 1) { + if (current->u.ingosched.array->nr_active == 1) { schedstat_inc(rq, yld_act_empty); if (!rq->expired->nr_active) schedstat_inc(rq, yld_both_empty); @@ -3604,86 +3018,6 @@ asmlinkage long sys_sched_yield(void) return 0; } -static inline void __cond_resched(void) -{ - do { - add_preempt_count(PREEMPT_ACTIVE); - schedule(); - sub_preempt_count(PREEMPT_ACTIVE); - } while (need_resched()); -} - -int __sched cond_resched(void) -{ - if (need_resched()) { - __cond_resched(); - return 1; - } - return 0; -} - -EXPORT_SYMBOL(cond_resched); - -/* - * cond_resched_lock() - if a reschedule is pending, drop the given lock, - * call schedule, and on return reacquire the lock. - * - * This works OK both with and without CONFIG_PREEMPT. We do strange low-level - * operations here to prevent schedule() from being called twice (once via - * spin_unlock(), once by hand). - */ -int cond_resched_lock(spinlock_t * lock) -{ -#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) - if (lock->break_lock) { - lock->break_lock = 0; - spin_unlock(lock); - cpu_relax(); - spin_lock(lock); - } -#endif - if (need_resched()) { - _raw_spin_unlock(lock); - preempt_enable_no_resched(); - __cond_resched(); - spin_lock(lock); - return 1; - } - return 0; -} - -EXPORT_SYMBOL(cond_resched_lock); - -int __sched cond_resched_softirq(void) -{ - BUG_ON(!in_softirq()); - - if (need_resched()) { - __local_bh_enable(); - __cond_resched(); - local_bh_disable(); - return 1; - } - return 0; -} - -EXPORT_SYMBOL(cond_resched_softirq); - - -/** - * yield - yield the current processor to other threads. 
- * - * this is a shortcut for kernel-space yielding - it marks the - * thread runnable and calls sys_sched_yield(). - */ -void __sched yield(void) -{ - set_current_state(TASK_RUNNING); - sys_sched_yield(); -} - -EXPORT_SYMBOL(yield); - /* * This task is about to go to sleep on IO. Increment rq->nr_iowait so * that process accounting knows that this is a task in IO wait state. @@ -3691,7 +3025,7 @@ EXPORT_SYMBOL(yield); * But don't do that if it is a deliberate, throttling IO wait (this task * has set its backing_dev_info: the queue against which it should throttle) */ -void __sched io_schedule(void) +static void __sched ingo_io_schedule(void) { struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); @@ -3700,9 +3034,7 @@ void __sched io_schedule(void) atomic_dec(&rq->nr_iowait); } -EXPORT_SYMBOL(io_schedule); - -long __sched io_schedule_timeout(long timeout) +static long __sched ingo_io_schedule_timeout(long timeout) { struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); long ret; @@ -3714,51 +3046,6 @@ long __sched io_schedule_timeout(long ti } /** - * sys_sched_get_priority_max - return maximum RT priority. - * @policy: scheduling class. - * - * this syscall returns the maximum rt_priority that can be used - * by a given scheduling class. - */ -asmlinkage long sys_sched_get_priority_max(int policy) -{ - int ret = -EINVAL; - - switch (policy) { - case SCHED_FIFO: - case SCHED_RR: - ret = MAX_USER_RT_PRIO-1; - break; - case SCHED_NORMAL: - ret = 0; - break; - } - return ret; -} - -/** - * sys_sched_get_priority_min - return minimum RT priority. - * @policy: scheduling class. - * - * this syscall returns the minimum rt_priority that can be used - * by a given scheduling class. 
- */ -asmlinkage long sys_sched_get_priority_min(int policy) -{ - int ret = -EINVAL; - - switch (policy) { - case SCHED_FIFO: - case SCHED_RR: - ret = 1; - break; - case SCHED_NORMAL: - ret = 0; - } - return ret; -} - -/** * sys_sched_rr_get_interval - return the default timeslice of a process. * @pid: pid of the process. * @interval: userspace pointer to the timeslice value. @@ -3766,8 +3053,8 @@ asmlinkage long sys_sched_get_priority_m * this syscall writes the default timeslice value of a given process * into the user-space timespec buffer. A value of '0' means infinity. */ -asmlinkage -long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) +static long +ingo_sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) { int retval = -EINVAL; struct timespec t; @@ -3797,112 +3084,14 @@ out_unlock: return retval; } -static inline struct task_struct *eldest_child(struct task_struct *p) -{ - if (list_empty(&p->children)) return NULL; - return list_entry(p->children.next,struct task_struct,sibling); -} - -static inline struct task_struct *older_sibling(struct task_struct *p) -{ - if (p->sibling.prev==&p->parent->children) return NULL; - return list_entry(p->sibling.prev,struct task_struct,sibling); -} - -static inline struct task_struct *younger_sibling(struct task_struct *p) -{ - if (p->sibling.next==&p->parent->children) return NULL; - return list_entry(p->sibling.next,struct task_struct,sibling); -} - -static void show_task(task_t * p) -{ - task_t *relative; - unsigned state; - unsigned long free = 0; - static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; - - printk("%-13.13s ", p->comm); - state = p->state ? 
__ffs(p->state) + 1 : 0; - if (state < ARRAY_SIZE(stat_nam)) - printk(stat_nam[state]); - else - printk("?"); -#if (BITS_PER_LONG == 32) - if (state == TASK_RUNNING) - printk(" running "); - else - printk(" %08lX ", thread_saved_pc(p)); -#else - if (state == TASK_RUNNING) - printk(" running task "); - else - printk(" %016lx ", thread_saved_pc(p)); -#endif -#ifdef CONFIG_DEBUG_STACK_USAGE - { - unsigned long * n = (unsigned long *) (p->thread_info+1); - while (!*n) - n++; - free = (unsigned long) n - (unsigned long)(p->thread_info+1); - } -#endif - printk("%5lu %5d %6d ", free, p->pid, p->parent->pid); - if ((relative = eldest_child(p))) - printk("%5d ", relative->pid); - else - printk(" "); - if ((relative = younger_sibling(p))) - printk("%7d", relative->pid); - else - printk(" "); - if ((relative = older_sibling(p))) - printk(" %5d", relative->pid); - else - printk(" "); - if (!p->mm) - printk(" (L-TLB)\n"); - else - printk(" (NOTLB)\n"); - - if (state != TASK_RUNNING) - show_stack(p, NULL); -} - -void show_state(void) -{ - task_t *g, *p; - -#if (BITS_PER_LONG == 32) - printk("\n" - " sibling\n"); - printk(" task PC pid father child younger older\n"); -#else - printk("\n" - " sibling\n"); - printk(" task PC pid father child younger older\n"); -#endif - read_lock(&tasklist_lock); - do_each_thread(g, p) { - /* - * reset the NMI-timeout, listing all files on a slow - * console might take alot of time: - */ - touch_nmi_watchdog(); - show_task(p); - } while_each_thread(g, p); - - read_unlock(&tasklist_lock); -} - -void __devinit init_idle(task_t *idle, int cpu) +static void __devinit ingo_init_idle(task_t *idle, int cpu) { runqueue_t *rq = cpu_rq(cpu); unsigned long flags; - idle->sleep_avg = 0; - idle->array = NULL; - idle->prio = MAX_PRIO; + idle->u.ingosched.sleep_avg = 0; + idle->u.ingosched.array = NULL; + idle->u.ingosched.prio = MAX_PRIO; idle->state = TASK_RUNNING; set_task_cpu(idle, cpu); @@ -3919,15 +3108,6 @@ void __devinit init_idle(task_t *idle, i #endif } 
-/* - * In a system that switches off the HZ timer nohz_cpu_mask - * indicates which cpus entered this state. This is used - * in the rcu update to wait only for active cpus. For system - * which do not switch off the HZ timer nohz_cpu_mask should - * always be CPU_MASK_NONE. - */ -cpumask_t nohz_cpu_mask = CPU_MASK_NONE; - #ifdef CONFIG_SMP /* * This is how migration works: @@ -3954,7 +3134,7 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. */ -int set_cpus_allowed(task_t *p, cpumask_t new_mask) +static int ingo_set_cpus_allowed(task_t *p, cpumask_t new_mask) { unsigned long flags; int ret = 0; @@ -3987,8 +3167,6 @@ out: return ret; } -EXPORT_SYMBOL_GPL(set_cpus_allowed); - /* * Move (not current) task off this cpu, onto dest cpu. We're doing * this because either it can't run here any more (set_cpus_allowed() @@ -4017,15 +3195,16 @@ static void __migrate_task(struct task_s goto out; set_task_cpu(p, dest_cpu); - if (p->array) { + if (p->u.ingosched.array) { /* * Sync timestamp with rq_dest's before activating. * The same thing could be achieved by doing this step * afterwards, and pretending it was a local activate. * This way is cleaner and logically correct. */ - p->timestamp = p->timestamp - rq_src->timestamp_last_tick - + rq_dest->timestamp_last_tick; + p->u.ingosched.timestamp = p->u.ingosched.timestamp - + rq_src->timestamp_last_tick + + rq_dest->timestamp_last_tick; deactivate_task(p, rq_src); activate_task(p, rq_dest, 0); if (TASK_PREEMPTS_CURR(p, rq_dest)) @@ -4165,7 +3344,7 @@ static void migrate_live_tasks(int src_c * It does so by boosting its priority to highest possible and adding it to * the _front_ of runqueue. Used by CPU offline code. 
*/ -void sched_idle_next(void) +static void ingo_sched_idle_next(void) { int cpu = smp_processor_id(); runqueue_t *rq = this_rq(); @@ -4223,7 +3402,7 @@ static void migrate_dead_tasks(unsigned while (!list_empty(list)) migrate_dead(dead_cpu, list_entry(list->next, task_t, - run_list)); + u.ingosched.run_list)); } } } @@ -4306,7 +3485,7 @@ static struct notifier_block __devinitda .priority = 10 }; -int __init migration_init(void) +static int __init ingo_migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); /* Start one for boot CPU. */ @@ -4322,7 +3501,7 @@ int __init migration_init(void) * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. */ -void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) +static void __devinit ingo_cpu_attach_domain(struct sched_domain *sd, int cpu) { migration_req_t req; unsigned long flags; @@ -4349,9 +3528,6 @@ void __devinit cpu_attach_domain(struct } } -/* cpus with isolated domains */ -cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; - /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { @@ -4366,52 +3542,6 @@ static int __init isolated_cpu_setup(cha __setup ("isolcpus=", isolated_cpu_setup); -/* - * init_sched_build_groups takes an array of groups, the cpumask we wish - * to span, and a pointer to a function which identifies what group a CPU - * belongs to. The return value of group_fn must be a valid index into the - * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we - * keep track of groups covered with a cpumask_t). - * - * init_sched_build_groups will build a circular linked list of the groups - * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_power to 0. 
- */ -void __devinit init_sched_build_groups(struct sched_group groups[], - cpumask_t span, int (*group_fn)(int cpu)) -{ - struct sched_group *first = NULL, *last = NULL; - cpumask_t covered = CPU_MASK_NONE; - int i; - - for_each_cpu_mask(i, span) { - int group = group_fn(i); - struct sched_group *sg = &groups[group]; - int j; - - if (cpu_isset(i, covered)) - continue; - - sg->cpumask = CPU_MASK_NONE; - sg->cpu_power = 0; - - for_each_cpu_mask(j, span) { - if (group_fn(j) != group) - continue; - - cpu_set(j, covered); - cpu_set(j, sg->cpumask); - } - if (!first) - first = sg; - if (last) - last->next = sg; - last = sg; - } - last->next = first; -} - - #ifdef ARCH_HAS_SCHED_DOMAIN extern void __devinit arch_init_sched_domains(void); extern void __devinit arch_destroy_sched_domains(void); @@ -4740,7 +3870,7 @@ static int update_sched_domains(struct n } #endif -void __init sched_init_smp(void) +static void __init ingo_sched_init_smp(void) { lock_cpu_hotplug(); arch_init_sched_domains(); @@ -4750,25 +3880,21 @@ void __init sched_init_smp(void) hotcpu_notifier(update_sched_domains, 0); } #else -void __init sched_init_smp(void) +static void __init ingo_sched_init_smp(void) { } #endif /* CONFIG_SMP */ -int in_sched_functions(unsigned long addr) -{ - /* Linker adds these: start and end of __sched functions */ - extern char __sched_text_start[], __sched_text_end[]; - return in_lock_functions(addr) || - (addr >= (unsigned long)__sched_text_start - && addr < (unsigned long)__sched_text_end); -} - -void __init sched_init(void) +static void __init ingo_sched_init(void) { runqueue_t *rq; int i, j, k; + init_task.u.ingosched.prio = MAX_PRIO - 20; + init_task.static_prio = MAX_PRIO - 20; + INIT_LIST_HEAD(&init_task.u.ingosched.run_list); + init_task.u.ingosched.time_slice = HZ; + for (i = 0; i < NR_CPUS; i++) { prio_array_t *array; @@ -4814,28 +3940,6 @@ void __init sched_init(void) init_idle(current, smp_processor_id()); } -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP -void 
__might_sleep(char *file, int line) -{ -#if defined(in_atomic) - static unsigned long prev_jiffy; /* ratelimiting */ - - if ((in_atomic() || irqs_disabled()) && - system_state == SYSTEM_RUNNING) { - if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) - return; - prev_jiffy = jiffies; - printk(KERN_ERR "Debug: sleeping function called from invalid" - " context at %s:%d\n", file, line); - printk("in_atomic():%d, irqs_disabled():%d\n", - in_atomic(), irqs_disabled()); - dump_stack(); - } -#endif -} -EXPORT_SYMBOL(__might_sleep); -#endif - #if defined(CONFIG_DEBUG_KERNEL)&&defined(CONFIG_SYSCTL)&&defined(CONFIG_SMP) static struct ctl_table sd_ctl_dir[] = { {1, "sched_domain", NULL, 0, 0755, NULL, }, @@ -4925,7 +4029,7 @@ static ctl_table *sd_alloc_ctl_cpu_table } static struct ctl_table_header *sd_sysctl_header; -void init_sched_domain_sysctl() +void ingo_init_sched_domain_sysctl(void) { int i, cpu_num = num_online_cpus(); char buf[32]; @@ -4943,7 +4047,7 @@ void init_sched_domain_sysctl() sd_sysctl_header = register_sysctl_table(sd_ctl_root, 0); } -void destroy_sched_domain_sysctl() +static void ingo_destroy_sched_domain_sysctl(void) { int cpu, cpu_num = num_online_cpus(); struct sched_domain *sd; @@ -4965,16 +4069,16 @@ void destroy_sched_domain_sysctl() kfree(root); } #else -void init_sched_domain_sysctl() +static void ingo_init_sched_domain_sysctl(void) { } -void destroy_sched_domain_sysctl() +static void ingo_destroy_sched_domain_sysctl(void) { } #endif #ifdef CONFIG_MAGIC_SYSRQ -void normalize_rt_tasks(void) +void ingo_normalize_rt_tasks(void) { struct task_struct *p; prio_array_t *array; @@ -4988,7 +4092,7 @@ void normalize_rt_tasks(void) rq = task_rq_lock(p, &flags); - array = p->array; + array = p->u.ingosched.array; if (array) deactivate_task(p, task_rq(p)); __setscheduler(p, SCHED_NORMAL, 0); @@ -5003,3 +4107,60 @@ void normalize_rt_tasks(void) } #endif /* CONFIG_MAGIC_SYSRQ */ + +struct sched_drv ingo_sched_drv = { + .task_cpu = common_task_cpu, + 
.set_task_cpu = common_set_task_cpu, + .init_sched_domain_sysctl = ingo_init_sched_domain_sysctl, + .destroy_sched_domain_sysctl = ingo_destroy_sched_domain_sysctl, + .account_steal_time = ingo_account_steal_time, + .account_system_time = ingo_account_system_time, + .account_user_time = ingo_account_user_time, + .cpusched_name = "ingosched", + .rt_task = ingo_rt_task, + .wait_for_completion = ingo_wait_for_completion, + .io_schedule = ingo_io_schedule, + .io_schedule_timeout = ingo_io_schedule_timeout, + .set_oom_timeslice = ingo_set_oom_timeslice, + .nr_running = ingo_nr_running, + .nr_uninterruptible = ingo_nr_uninterruptible, + .nr_context_switches = ingo_nr_context_switches, + .nr_iowait = ingo_nr_iowait, + .idle_cpu = ingo_idle_cpu, + .init_idle = ingo_init_idle, + .exit = ingo_sched_exit, + .fork = ingo_sched_fork, + .init = ingo_sched_init, + .init_smp = ingo_sched_init_smp, + .schedule = ingo_schedule, + .tick = ingo_scheduler_tick, + .tail = ingo_schedule_tail, + .setscheduler = ingo_setscheduler, + .set_user_nice = ingo_set_user_nice, + .rr_get_interval = ingo_sys_sched_rr_get_interval, + .yield = ingo_sys_sched_yield, + .task_curr = ingo_task_curr, + .task_nice = ingo_task_nice, + .task_prio = ingo_task_prio, + .try_to_wake_up = ingo_try_to_wake_up, + .wake_up_new_task = ingo_wake_up_new_task, +#ifdef CONFIG_SMP + .migration_init = ingo_migration_init, + .exec = ingo_sched_exec, + .set_cpus_allowed = ingo_set_cpus_allowed, + .wait_task_inactive = ingo_wait_task_inactive, + .cpu_attach_domain = ingo_cpu_attach_domain, +#ifdef CONFIG_HOTPLUG_CPU + .sched_idle_next = ingo_sched_idle_next, +#endif +#endif +#ifdef CONFIG_SCHEDSTATS + .show_schedstat = ingo_show_schedstat, +#endif +#ifdef CONFIG_MAGIC_SYSRQ + .normalize_rt_tasks = ingo_normalize_rt_tasks, +#endif +#ifdef CONFIG_KGDB + .kgdb_get_idle = ingo_kgdb_get_idle, +#endif +}; Index: linux-2.6.10-rc1-mm4-plugsched/kernel/scheduler.c =================================================================== --- 
linux-2.6.10-rc1-mm4-plugsched.orig/kernel/scheduler.c 2003-03-27 19:01:40.000000000 +1100 +++ linux-2.6.10-rc1-mm4-plugsched/kernel/scheduler.c 2004-11-10 10:01:22.000000000 +1100 @@ -0,0 +1,1267 @@ +/* + * kernel/scheduler.c + * + * Kernel scheduler and related syscalls + * + * Copyright (C) 1991-2002 Linus Torvalds + * + * Modular cpu scheduler infrastructure by Con Kolivas based on + * work by William Lee Irwin III. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +DEFINE_PER_CPU(struct kernel_stat, kstat); +EXPORT_PER_CPU_SYMBOL(kstat); + +unsigned int task_cpu(const struct task_struct *p); + +void set_task_cpu(struct task_struct *p, unsigned int cpu); + +#ifdef CONFIG_SMP +/*** + * kick_process - kick a running thread to enter/exit the kernel + * @p: the to-be-kicked thread + * + * Cause a process which is running on another CPU to enter + * kernel-mode, without any delay. (to get signals handled.) + */ +void kick_process(task_t *p) +{ + int cpu; + + preempt_disable(); + cpu = task_cpu(p); + if ((cpu != smp_processor_id()) && task_curr(p)) + smp_send_reschedule(cpu); + preempt_enable(); +} + +/* + * Wrappers for p->thread_info->cpu access. No-op on UP. + */ +unsigned int common_task_cpu(const struct task_struct *p) +{ + return p->thread_info->cpu; +} + +void common_set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + p->thread_info->cpu = cpu; +} + +#else + +unsigned int common_task_cpu(const struct task_struct *p) +{ + return 0; +} + +void common_set_task_cpu(struct task_struct *p, unsigned int cpu) +{ +} +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_PREEMPT +#ifdef CONFIG_DEBUG_PREEMPT + +void fastcall add_preempt_count(int val) +{ + /* + * Underflow? 
+ */ + BUG_ON(((int)preempt_count() < 0)); + preempt_count() += val; + /* + * Spinlock count overflowing soon? + */ + BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); +} +EXPORT_SYMBOL(add_preempt_count); + +void fastcall sub_preempt_count(int val) +{ + /* + * Underflow? + */ + BUG_ON(val > preempt_count()); + /* + * Is the spinlock portion underflowing? + */ + BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); + preempt_count() -= val; +} +EXPORT_SYMBOL(sub_preempt_count); + +#endif + +/* + * this is the entry point to schedule() from in-kernel preemption + * off of preempt_enable. Kernel preemptions off return from interrupt + * occur there and call schedule directly. + */ +asmlinkage void __sched preempt_schedule(void) +{ + struct thread_info *ti = current_thread_info(); +#ifdef CONFIG_PREEMPT_BKL + struct task_struct *task = current; + int saved_lock_depth; +#endif + /* + * If there is a non-zero preempt_count or interrupts are disabled, + * we do not want to preempt the current task. Just return.. 
+ */ + if (unlikely(ti->preempt_count || irqs_disabled())) + return; + +need_resched: + add_preempt_count(PREEMPT_ACTIVE); + /* + * We keep the big kernel semaphore locked, but we + * clear ->lock_depth so that schedule() doesnt + * auto-release the semaphore: + */ +#ifdef CONFIG_PREEMPT_BKL + saved_lock_depth = task->lock_depth; + task->lock_depth = -1; +#endif + schedule(); +#ifdef CONFIG_PREEMPT_BKL + task->lock_depth = saved_lock_depth; +#endif + sub_preempt_count(PREEMPT_ACTIVE); + + /* we could miss a preemption opportunity between schedule and now */ + barrier(); + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + goto need_resched; +} + +EXPORT_SYMBOL(preempt_schedule); +#endif /* CONFIG_PREEMPT */ + +#define SLEEP_ON_VAR \ + unsigned long flags; \ + wait_queue_t wait; \ + init_waitqueue_entry(&wait, current); + +#define SLEEP_ON_HEAD \ + spin_lock_irqsave(&q->lock,flags); \ + __add_wait_queue(q, &wait); \ + spin_unlock(&q->lock); + +#define SLEEP_ON_TAIL \ + spin_lock_irq(&q->lock); \ + __remove_wait_queue(q, &wait); \ + spin_unlock_irqrestore(&q->lock, flags); + +void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) +{ + SLEEP_ON_VAR + + current->state = TASK_INTERRUPTIBLE; + + SLEEP_ON_HEAD + schedule(); + SLEEP_ON_TAIL +} + +EXPORT_SYMBOL(interruptible_sleep_on); + +long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) +{ + SLEEP_ON_VAR + + current->state = TASK_INTERRUPTIBLE; + + SLEEP_ON_HEAD + timeout = schedule_timeout(timeout); + SLEEP_ON_TAIL + + return timeout; +} + +EXPORT_SYMBOL(interruptible_sleep_on_timeout); + +void fastcall __sched sleep_on(wait_queue_head_t *q) +{ + SLEEP_ON_VAR + + current->state = TASK_UNINTERRUPTIBLE; + + SLEEP_ON_HEAD + schedule(); + SLEEP_ON_TAIL +} + +EXPORT_SYMBOL(sleep_on); + +long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) +{ + SLEEP_ON_VAR + + current->state = TASK_UNINTERRUPTIBLE; + + SLEEP_ON_HEAD + timeout = schedule_timeout(timeout); 
+ SLEEP_ON_TAIL + + return timeout; +} + +EXPORT_SYMBOL(sleep_on_timeout); + +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +void __might_sleep(char *file, int line) +{ +#if defined(in_atomic) + static unsigned long prev_jiffy; /* ratelimiting */ + + if ((in_atomic() || irqs_disabled()) && + system_state == SYSTEM_RUNNING) { + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + printk(KERN_ERR "Debug: sleeping function called from invalid" + " context at %s:%d\n", file, line); + printk("in_atomic():%d, irqs_disabled():%d\n", + in_atomic(), irqs_disabled()); + dump_stack(); + } +#endif +} +EXPORT_SYMBOL(__might_sleep); +#endif + +#ifdef __ARCH_WANT_SYS_NICE + +/* + * sys_nice - change the priority of the current process. + * @increment: priority increment + * + * sys_setpriority is a more generic, but much slower function that + * does similar things. + */ +asmlinkage long sys_nice(int increment) +{ + int retval; + long nice; + + /* + * Setpriority might change our priority at the same moment. + * We don't have to worry. Conceptually one call occurs first + * and we have a single winner. + */ + if (increment < 0) { + if (!capable(CAP_SYS_NICE)) + return -EPERM; + if (increment < -40) + increment = -40; + } + if (increment > 40) + increment = 40; + + nice = task_nice(current) + increment; + if (nice < -20) + nice = -20; + if (nice > 19) + nice = 19; + + retval = security_task_setnice(current, nice); + if (retval) + return retval; + + set_user_nice(current, nice); + return 0; +} + +#endif + +/** + * find_process_by_pid - find a process with a matching PID value. + * @pid: the pid in question. + */ +task_t *find_process_by_pid(pid_t pid) +{ + return pid ? find_task_by_pid(pid) : current; +} + +int setscheduler(pid_t pid, int policy, struct sched_param __user *param); + +/** + * sys_sched_setscheduler - set/change the scheduler policy and RT priority + * @pid: the pid in question. 
+ * @policy: new policy + * @param: structure containing the new RT priority. + */ +asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, + struct sched_param __user *param) +{ + return setscheduler(pid, policy, param); +} + +/** + * sys_sched_setparam - set/change the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the new RT priority. + */ +asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) +{ + return setscheduler(pid, -1, param); +} + +/** + * sys_sched_getscheduler - get the policy (scheduling class) of a thread + * @pid: the pid in question. + */ +asmlinkage long sys_sched_getscheduler(pid_t pid) +{ + int retval = -EINVAL; + task_t *p; + + if (pid < 0) + goto out_nounlock; + + retval = -ESRCH; + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + if (p) { + retval = security_task_getscheduler(p); + if (!retval) + retval = p->policy; + } + read_unlock(&tasklist_lock); + +out_nounlock: + return retval; +} + +/** + * sys_sched_getparam - get the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the RT priority. + */ +asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) +{ + struct sched_param lp; + int retval = -EINVAL; + task_t *p; + + if (!param || pid < 0) + goto out_nounlock; + + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + retval = -ESRCH; + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + lp.sched_priority = p->rt_priority; + read_unlock(&tasklist_lock); + + /* + * This one might sleep, we cannot do it with a spinlock held ... + */ + retval = copy_to_user(param, &lp, sizeof(*param)) ? 
-EFAULT : 0; + +out_nounlock: + return retval; + +out_unlock: + read_unlock(&tasklist_lock); + return retval; +} + +long sched_setaffinity(pid_t pid, cpumask_t new_mask) +{ + task_t *p; + int retval; + cpumask_t cpus_allowed; + + lock_cpu_hotplug(); + read_lock(&tasklist_lock); + + p = find_process_by_pid(pid); + if (!p) { + read_unlock(&tasklist_lock); + unlock_cpu_hotplug(); + return -ESRCH; + } + + /* + * It is not safe to call set_cpus_allowed with the + * tasklist_lock held. We will bump the task_struct's + * usage count and then drop tasklist_lock. + */ + get_task_struct(p); + read_unlock(&tasklist_lock); + + retval = -EPERM; + if ((current->euid != p->euid) && (current->euid != p->uid) && + !capable(CAP_SYS_NICE)) + goto out_unlock; + + cpus_allowed = cpuset_cpus_allowed(p); + cpus_and(new_mask, new_mask, cpus_allowed); + retval = set_cpus_allowed(p, new_mask); + +out_unlock: + put_task_struct(p); + unlock_cpu_hotplug(); + return retval; +} + +static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, + cpumask_t *new_mask) +{ + if (len < sizeof(cpumask_t)) { + memset(new_mask, 0, sizeof(cpumask_t)); + } else if (len > sizeof(cpumask_t)) { + len = sizeof(cpumask_t); + } + return copy_from_user(new_mask, user_mask_ptr, len) ? 
-EFAULT : 0; +} + +/** + * sys_sched_setaffinity - set the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to the new cpu mask + */ +asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, + unsigned long __user *user_mask_ptr) +{ + cpumask_t new_mask; + int retval; + + retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); + if (retval) + return retval; + + return sched_setaffinity(pid, new_mask); +} + +/* + * Represents all cpu's present in the system + * In systems capable of hotplug, this map could dynamically grow + * as new cpu's are detected in the system via any platform specific + * method, such as ACPI for e.g. + */ + +cpumask_t cpu_present_map; +EXPORT_SYMBOL(cpu_present_map); + +#ifndef CONFIG_SMP +cpumask_t cpu_online_map = CPU_MASK_ALL; +cpumask_t cpu_possible_map = CPU_MASK_ALL; +#endif + +long sched_getaffinity(pid_t pid, cpumask_t *mask) +{ + int retval; + task_t *p; + + lock_cpu_hotplug(); + read_lock(&tasklist_lock); + + retval = -ESRCH; + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = 0; + cpus_and(*mask, p->cpus_allowed, cpu_possible_map); + +out_unlock: + read_unlock(&tasklist_lock); + unlock_cpu_hotplug(); + if (retval) + return retval; + + return 0; +} + +/** + * sys_sched_getaffinity - get the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to hold the current cpu mask + */ +asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, + unsigned long __user *user_mask_ptr) +{ + int ret; + cpumask_t mask; + + if (len < sizeof(cpumask_t)) + return -EINVAL; + + ret = sched_getaffinity(pid, &mask); + if (ret < 0) + return ret; + + if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) + return -EFAULT; + + return sizeof(cpumask_t); +} + +/** + * 
sys_sched_get_priority_max - return maximum RT priority. + * @policy: scheduling class. + * + * this syscall returns the maximum rt_priority that can be used + * by a given scheduling class. + */ +asmlinkage long sys_sched_get_priority_max(int policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = MAX_USER_RT_PRIO-1; + break; + case SCHED_NORMAL: + ret = 0; + break; + } + return ret; +} + +/** + * sys_sched_get_priority_min - return minimum RT priority. + * @policy: scheduling class. + * + * this syscall returns the minimum rt_priority that can be used + * by a given scheduling class. + */ +asmlinkage long sys_sched_get_priority_min(int policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = 1; + break; + case SCHED_NORMAL: + ret = 0; + } + return ret; +} + +static inline void __cond_resched(void) +{ + do { + add_preempt_count(PREEMPT_ACTIVE); + schedule(); + sub_preempt_count(PREEMPT_ACTIVE); + } while (need_resched()); +} + +int __sched cond_resched(void) +{ + if (need_resched()) { + __cond_resched(); + return 1; + } + return 0; +} + +EXPORT_SYMBOL(cond_resched); + +/* + * cond_resched_lock() - if a reschedule is pending, drop the given lock, + * call schedule, and on return reacquire the lock. + * + * This works OK both with and without CONFIG_PREEMPT. We do strange low-level + * operations here to prevent schedule() from being called twice (once via + * spin_unlock(), once by hand). 
+ */ +int cond_resched_lock(spinlock_t * lock) +{ +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) + if (lock->break_lock) { + lock->break_lock = 0; + spin_unlock(lock); + cpu_relax(); + spin_lock(lock); + } +#endif + if (need_resched()) { + _raw_spin_unlock(lock); + preempt_enable_no_resched(); + __cond_resched(); + spin_lock(lock); + return 1; + } + return 0; +} + +EXPORT_SYMBOL(cond_resched_lock); + +int __sched cond_resched_softirq(void) +{ + BUG_ON(!in_softirq()); + + if (need_resched()) { + __local_bh_enable(); + __cond_resched(); + local_bh_disable(); + return 1; + } + return 0; +} + +EXPORT_SYMBOL(cond_resched_softirq); + +/** + * yield - yield the current processor to other threads. + * + * this is a shortcut for kernel-space yielding - it marks the + * thread runnable and calls sys_sched_yield(). + */ +void __sched yield(void) +{ + set_current_state(TASK_RUNNING); + sys_sched_yield(); +} + +EXPORT_SYMBOL(yield); + +static inline struct task_struct *eldest_child(struct task_struct *p) +{ + if (list_empty(&p->children)) return NULL; + return list_entry(p->children.next,struct task_struct,sibling); +} + +static inline struct task_struct *older_sibling(struct task_struct *p) +{ + if (p->sibling.prev==&p->parent->children) return NULL; + return list_entry(p->sibling.prev,struct task_struct,sibling); +} + +static inline struct task_struct *younger_sibling(struct task_struct *p) +{ + if (p->sibling.next==&p->parent->children) return NULL; + return list_entry(p->sibling.next,struct task_struct,sibling); +} + +static void show_task(task_t * p) +{ + task_t *relative; + unsigned state; + unsigned long free = 0; + static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; + + printk("%-13.13s ", p->comm); + state = p->state ? 
__ffs(p->state) + 1 : 0; + if (state < ARRAY_SIZE(stat_nam)) + printk(stat_nam[state]); + else + printk("?"); +#if (BITS_PER_LONG == 32) + if (state == TASK_RUNNING) + printk(" running "); + else + printk(" %08lX ", thread_saved_pc(p)); +#else + if (state == TASK_RUNNING) + printk(" running task "); + else + printk(" %016lx ", thread_saved_pc(p)); +#endif +#ifdef CONFIG_DEBUG_STACK_USAGE + { + unsigned long * n = (unsigned long *) (p->thread_info+1); + while (!*n) + n++; + free = (unsigned long) n - (unsigned long)(p->thread_info+1); + } +#endif + printk("%5lu %5d %6d ", free, p->pid, p->parent->pid); + if ((relative = eldest_child(p))) + printk("%5d ", relative->pid); + else + printk(" "); + if ((relative = younger_sibling(p))) + printk("%7d", relative->pid); + else + printk(" "); + if ((relative = older_sibling(p))) + printk(" %5d", relative->pid); + else + printk(" "); + if (!p->mm) + printk(" (L-TLB)\n"); + else + printk(" (NOTLB)\n"); + + if (state != TASK_RUNNING) + show_stack(p, NULL); +} + +void show_state(void) +{ + task_t *g, *p; + +#if (BITS_PER_LONG == 32) + printk("\n" + " sibling\n"); + printk(" task PC pid father child younger older\n"); +#else + printk("\n" + " sibling\n"); + printk(" task PC pid father child younger older\n"); +#endif + read_lock(&tasklist_lock); + do_each_thread(g, p) { + /* + * reset the NMI-timeout, listing all files on a slow + * console might take a lot of time: + */ + touch_nmi_watchdog(); + show_task(p); + } while_each_thread(g, p); + + read_unlock(&tasklist_lock); +} + +/* + * In a system that switches off the HZ timer nohz_cpu_mask + * indicates which cpus entered this state. This is used + * in the rcu update to wait only for active cpus. For systems + * which do not switch off the HZ timer nohz_cpu_mask should + * always be CPU_MASK_NONE. 
+ */ +cpumask_t nohz_cpu_mask = CPU_MASK_NONE; + +int in_sched_functions(unsigned long addr) +{ + /* Linker adds these: start and end of __sched functions */ + extern char __sched_text_start[], __sched_text_end[]; + return in_lock_functions(addr) || + (addr >= (unsigned long)__sched_text_start + && addr < (unsigned long)__sched_text_end); +} + +int try_to_wake_up(task_t *task, unsigned state, int sync); + +int fastcall wake_up_state(task_t *p, unsigned int state) +{ + return try_to_wake_up(p, state, 0); +} + +int fastcall wake_up_process(task_t * p) +{ + return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | + TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); +} +EXPORT_SYMBOL(wake_up_process); + +/* + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve + * number) then we wake all the non-exclusive tasks and one exclusive task. + * + * There are circumstances in which we can try to wake a task which has already + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by continuing to scan the queue. + */ +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, int sync, void *key) +{ + struct list_head *tmp, *next; + + list_for_each_safe(tmp, next, &q->task_list) { + wait_queue_t *curr; + unsigned flags; + curr = list_entry(tmp, wait_queue_t, task_list); + flags = curr->flags; + if (curr->func(curr, mode, sync, key) && + (flags & WQ_FLAG_EXCLUSIVE) && + !--nr_exclusive) + break; + } +} + +/** + * __wake_up - wake up threads blocked on a waitqueue. 
+ * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + */ +void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, void *key) +{ + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, 0, key); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(__wake_up); + +int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) +{ + task_t *p = curr->task; + return try_to_wake_up(p, mode, sync); +} +EXPORT_SYMBOL(default_wake_function); + +/* + * Same as __wake_up but called with the spinlock in wait_queue_head_t held. + */ +void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) +{ + __wake_up_common(q, mode, 1, 0, NULL); +} + +/** + * __wake_up_sync - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * + * The sync wakeup differs in that the waker knows that it will schedule + * away soon, so while the target thread will be woken up, it will not + * be migrated to another CPU - ie. the two threads are 'synchronized' + * with each other. This can prevent needless bouncing between CPUs. + * + * On UP it can prevent extra preemption. 
+ */ +void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ + unsigned long flags; + int sync = 1; + + if (unlikely(!q)) + return; + + if (unlikely(!nr_exclusive)) + sync = 0; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, sync, NULL); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ + +void fastcall complete(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done++; + __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, + 1, 0, NULL); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete); + +void fastcall complete_all(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done += UINT_MAX/2; + __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, + 0, 0, NULL); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete_all); + +#ifdef CONFIG_SMP +/* cpus with isolated domains */ +cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; + +/* + * init_sched_build_groups takes an array of groups, the cpumask we wish + * to span, and a pointer to a function which identifies what group a CPU + * belongs to. The return value of group_fn must be a valid index into the + * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we + * keep track of groups covered with a cpumask_t). + * + * init_sched_build_groups will build a circular linked list of the groups + * covered by the given span, and will set each group's ->cpumask correctly, + * and ->cpu_power to 0. 
+ */ +void __devinit init_sched_build_groups(struct sched_group groups[], + cpumask_t span, int (*group_fn)(int cpu)) +{ + struct sched_group *first = NULL, *last = NULL; + cpumask_t covered = CPU_MASK_NONE; + int i; + + for_each_cpu_mask(i, span) { + int group = group_fn(i); + struct sched_group *sg = &groups[group]; + int j; + + if (cpu_isset(i, covered)) + continue; + + sg->cpumask = CPU_MASK_NONE; + sg->cpu_power = 0; + + for_each_cpu_mask(j, span) { + if (group_fn(j) != group) + continue; + + cpu_set(j, covered); + cpu_set(j, sg->cpumask); + } + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; + } + last->next = first; +} +#endif + +#ifdef CONFIG_SCHEDSTATS +int show_schedstat(struct seq_file *seq, void *v); +