Index: linux-2.6.10-rc1-mm5/fs/proc/array.c =================================================================== --- linux-2.6.10-rc1-mm5.orig/fs/proc/array.c 2004-11-11 21:42:16.000000000 +1100 +++ linux-2.6.10-rc1-mm5/fs/proc/array.c 2004-11-11 21:50:23.000000000 +1100 @@ -163,7 +163,6 @@ static inline char * task_state(struct t read_lock(&tasklist_lock); buffer += sprintf(buffer, "State:\t%s\n" - "SleepAVG:\t%lu%%\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" @@ -171,7 +170,6 @@ static inline char * task_state(struct t "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), - (p->sleep_avg/1024)*100/(1020000000/1024), p->tgid, p->pid, p->pid ? p->group_leader->real_parent->tgid : 0, p->pid && p->ptrace ? p->parent->pid : 0, Index: linux-2.6.10-rc1-mm5/fs/proc/proc_misc.c =================================================================== --- linux-2.6.10-rc1-mm5.orig/fs/proc/proc_misc.c 2004-11-11 22:08:03.000000000 +1100 +++ linux-2.6.10-rc1-mm5/fs/proc/proc_misc.c 2004-11-11 22:08:30.000000000 +1100 @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -226,6 +227,18 @@ static int version_read_proc(char *page, return proc_calc_metrics(page, start, off, count, eof, len); } +static int scheduler_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + char *sched_name = scheduler->cpusched_name; + int len; + + strcpy(page, sched_name); + strcat(page, "\n"); + len = strlen(page); + return proc_calc_metrics(page, start, off, count, eof, len); +} + extern struct seq_operations cpuinfo_op; static int cpuinfo_open(struct inode *inode, struct file *file) { @@ -569,6 +582,7 @@ void __init proc_misc_init(void) {"cmdline", cmdline_read_proc}, {"locks", locks_read_proc}, {"execdomains", execdomains_read_proc}, + {"scheduler", scheduler_read_proc}, {NULL,} }; for (p = simple_ones; p->name; p++) Index: linux-2.6.10-rc1-mm5/include/linux/init_task.h 
=================================================================== --- linux-2.6.10-rc1-mm5.orig/include/linux/init_task.h 2004-11-11 21:42:16.000000000 +1100 +++ linux-2.6.10-rc1-mm5/include/linux/init_task.h 2004-11-11 21:50:20.000000000 +1100 @@ -72,14 +72,10 @@ extern struct group_info init_groups; .usage = ATOMIC_INIT(2), \ .flags = 0, \ .lock_depth = -1, \ - .prio = MAX_PRIO-20, \ - .static_prio = MAX_PRIO-20, \ .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ .mm = NULL, \ .active_mm = &init_mm, \ - .run_list = LIST_HEAD_INIT(tsk.run_list), \ - .time_slice = HZ, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ @@ -115,5 +111,4 @@ extern struct group_info init_groups; .private_pages = LIST_HEAD_INIT(tsk.private_pages), \ .private_pages_count = 0, \ } - #endif Index: linux-2.6.10-rc1-mm5/include/linux/sched.h =================================================================== --- linux-2.6.10-rc1-mm5.orig/include/linux/sched.h 2004-11-11 21:42:18.000000000 +1100 +++ linux-2.6.10-rc1-mm5/include/linux/sched.h 2004-11-11 22:08:34.000000000 +1100 @@ -32,6 +32,7 @@ #include #include #include +#include struct exec_domain; @@ -165,9 +166,6 @@ extern void show_regs(struct pt_regs *); */ extern void show_stack(struct task_struct *task, unsigned long *sp); -void io_schedule(void); -long io_schedule_timeout(long timeout); - extern void cpu_init (void); extern void trap_init(void); extern void update_process_times(int user); @@ -179,6 +177,9 @@ extern unsigned long cache_decay_ticks; /* Is this address in the __sched functions? 
*/ extern int in_sched_functions(unsigned long addr); +void __sched io_schedule(void); +long __sched io_schedule_timeout(long timeout); + #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern signed long FASTCALL(schedule_timeout(signed long timeout)); asmlinkage void schedule(void); @@ -332,11 +333,6 @@ struct signal_struct { }; /* - * Priority of a process goes from 0..MAX_PRIO-1, valid RT - * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL tasks are - * in the range MAX_RT_PRIO..MAX_PRIO-1. Priority values - * are inverted: lower p->prio value means higher priority. - * * The MAX_USER_RT_PRIO value allows the actual maximum * RT priority to be separate from the value exported to * user-space. This allows kernel threads to set their @@ -347,9 +343,7 @@ struct signal_struct { #define MAX_USER_RT_PRIO 100 #define MAX_RT_PRIO MAX_USER_RT_PRIO -#define MAX_PRIO (MAX_RT_PRIO + 40) - -#define rt_task(p) (unlikely((p)->prio < MAX_RT_PRIO)) +extern int rt_task(task_t *p); /* * Some day this will be a full-fledged user tracking system.. @@ -382,21 +376,6 @@ typedef struct prio_array prio_array_t; struct backing_dev_info; struct reclaim_state; -#ifdef CONFIG_SCHEDSTATS -struct sched_info { - /* cumulative counters */ - unsigned long cpu_time, /* time spent on the cpu */ - run_delay, /* time spent waiting on a runqueue */ - pcnt; /* # of timeslices run on this cpu */ - - /* timestamps */ - unsigned long last_arrival, /* when we last ran on a cpu */ - last_queued; /* when we were last queued to run */ -}; - -extern struct file_operations proc_schedstat_operations; -#endif - enum idle_type { SCHED_IDLE, @@ -405,6 +384,8 @@ enum idle_type MAX_IDLE_TYPES }; +#include + /* * sched-domains (multiprocessor balancing) declarations: */ @@ -467,13 +448,11 @@ struct sched_domain { #endif }; -#ifdef ARCH_HAS_SCHED_DOMAIN /* Useful helpers that arch setup code may use. 
Defined in kernel/sched.c */ -extern cpumask_t cpu_isolated_map; +extern void cpu_attach_domain(struct sched_domain *sd, int cpu); extern void init_sched_build_groups(struct sched_group groups[], cpumask_t span, int (*group_fn)(int cpu)); -extern void cpu_attach_domain(struct sched_domain *sd, int cpu); -#endif /* ARCH_HAS_SCHED_DOMAIN */ +extern cpumask_t cpu_isolated_map; #endif /* CONFIG_SMP */ @@ -517,6 +496,10 @@ int set_current_groups(struct group_info struct audit_context; /* See audit.c */ struct mempolicy; +#include + +extern struct sched_drv *scheduler; + struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ struct thread_info *thread_info; @@ -526,17 +509,11 @@ struct task_struct { int lock_depth; /* Lock depth */ - int prio, static_prio; - struct list_head run_list; - prio_array_t *array; - - unsigned long sleep_avg; - unsigned long long timestamp, last_ran; - int activated; + int static_prio; /* A commonality between cpu schedulers */ + union cpusched u; unsigned long policy; cpumask_t cpus_allowed; - unsigned int time_slice, first_time_slice; #ifdef CONFIG_SCHEDSTATS struct sched_info sched_info; @@ -747,8 +724,11 @@ extern void sched_idle_next(void); extern void set_user_nice(task_t *p, long nice); extern int task_prio(const task_t *p); extern int task_nice(const task_t *p); +extern int is_idle_task(const task_t *p); extern int task_curr(const task_t *p); extern int idle_cpu(int cpu); +extern void set_oom_timeslice(task_t *p); +extern task_t *find_process_by_pid(pid_t pid); void yield(void); @@ -774,6 +754,7 @@ static inline int kstack_end(void *addr) extern union thread_union init_thread_union; extern struct task_struct init_task; +extern struct task_struct base_init_task; extern struct mm_struct init_mm; @@ -1095,33 +1076,8 @@ extern void recalc_sigpending(void); extern void signal_wake_up(struct task_struct *t, int resume_stopped); -/* - * Wrappers for p->thread_info->cpu access. No-op on UP. 
- */ -#ifdef CONFIG_SMP - -static inline unsigned int task_cpu(const struct task_struct *p) -{ - return p->thread_info->cpu; -} - -static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) -{ - p->thread_info->cpu = cpu; -} - -#else - -static inline unsigned int task_cpu(const struct task_struct *p) -{ - return 0; -} - -static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) -{ -} - -#endif /* CONFIG_SMP */ +extern unsigned int task_cpu(const struct task_struct *p); +extern void set_task_cpu(struct task_struct *p, unsigned int cpu); #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT extern void arch_pick_mmap_layout(struct mm_struct *mm); Index: linux-2.6.10-rc1-mm5/include/linux/schedstats.h =================================================================== --- linux-2.6.10-rc1-mm5.orig/include/linux/schedstats.h 2003-03-27 19:01:40.000000000 +1100 +++ linux-2.6.10-rc1-mm5/include/linux/schedstats.h 2004-11-11 21:53:42.000000000 +1100 @@ -0,0 +1,78 @@ +#ifndef _LINUX_SCHEDSTATS_H +#define _LINUX_SCHEDSTATS_H + +#ifdef CONFIG_SCHEDSTATS +struct sched_info { + /* cumulative counters */ + unsigned long cpu_time, /* time spent on the cpu */ + run_delay, /* time spent waiting on a runqueue */ + pcnt; /* # of timeslices run on this cpu */ + + /* timestamps */ + unsigned long last_arrival, /* when we last ran on a cpu */ + last_queued; /* when we were last queued to run */ +}; + +typedef struct schedstat_per_cpu_data schedstat_pcd_t; + +struct schedstat_per_cpu_data { + /* latency stats */ + struct sched_info rq_sched_info; + + /* sys_sched_yield() stats */ + unsigned long yld_exp_empty; + unsigned long yld_act_empty; + unsigned long yld_both_empty; + unsigned long yld_cnt; + + /* schedule() stats */ + unsigned long sched_noswitch; + unsigned long sched_switch; + unsigned long sched_cnt; + unsigned long sched_goidle; + + /* pull_task() stats */ + unsigned long pt_gained[MAX_IDLE_TYPES]; + unsigned long pt_lost[MAX_IDLE_TYPES]; + + /* active_load_balance() 
stats */ + unsigned long alb_cnt; + unsigned long alb_lost; + unsigned long alb_gained; + unsigned long alb_failed; + + /* try_to_wake_up() stats */ + unsigned long ttwu_cnt; + unsigned long ttwu_attempts; + unsigned long ttwu_moved; + + /* wake_up_new_task() stats */ + unsigned long wunt_cnt; + unsigned long wunt_moved; + + /* sched_migrate_task() stats */ + unsigned long smt_cnt; + + /* sched_balance_exec() stats */ + unsigned long sbe_cnt; +}; + +extern struct file_operations proc_schedstat_operations; +DECLARE_PER_CPU(struct schedstat_per_cpu_data, schedstat_pcd_data); /* declaration only; DEFINE_PER_CPU belongs in one .c file */ + +#define cpu_sspcd(cpu) (&per_cpu(schedstat_pcd_data, (cpu))) /* stats block of the given cpu */ +#define task_sspcd(p) (cpu_sspcd(task_cpu(p))) /* stats block of the cpu that task p is on */ + +extern void sched_info_switch(task_t *prev, task_t *next); +extern void sched_info_queued(task_t *t); + +# define schedstat_inc(sspcd, field) do { (sspcd)->field++; } while (0) /* no trailing ';': the caller supplies it, so if/else around a call still parses */ +# define schedstat_add(sspcd, field, amt) do { (sspcd)->field += (amt); } while (0) +#else /* !CONFIG_SCHEDSTATS */ +# define schedstat_inc(sspcd, field) do { } while (0) /* stats compiled out: expands to an empty statement */ +# define schedstat_add(sspcd, field, amt) do { } while (0) +# define sched_info_queued(t) do { } while (0) +# define sched_info_switch(t, next) do { } while (0) +#endif + +#endif /* _LINUX_SCHEDSTATS_H */ Index: linux-2.6.10-rc1-mm5/include/linux/scheduler.h =================================================================== --- linux-2.6.10-rc1-mm5.orig/include/linux/scheduler.h 2003-03-27 19:01:40.000000000 +1100 +++ linux-2.6.10-rc1-mm5/include/linux/scheduler.h 2004-11-11 22:08:35.000000000 +1100 @@ -0,0 +1,129 @@ +#ifndef _LINUX_SCHEDULER_H +#define _LINUX_SCHEDULER_H +/* + * include/linux/scheduler.h + * This contains the driver struct for all the exported per-cpu-scheduler + * functions, and the private per-scheduler data in task_struct. + */ + +#define SCHED_NAME_MAX (16) + +/* + * This is the main scheduler driver struct.
+ */ +struct sched_drv +{ + unsigned int (*task_cpu)(const struct task_struct *); + void (*set_task_cpu)(struct task_struct *, unsigned int); + void (*init_sched_domain_sysctl)(void); + void (*destroy_sched_domain_sysctl)(void); + char cpusched_name[SCHED_NAME_MAX]; + int (*rt_task)(const task_t *); + void (*wait_for_completion)(struct completion *); + void (*io_schedule)(void); + long (*io_schedule_timeout)(long); + void (*sched_idle_next)(void); + void (*set_oom_timeslice)(task_t *); + unsigned long (*nr_running)(void); + unsigned long (*nr_uninterruptible)(void); + unsigned long long (*nr_context_switches)(void); + unsigned long (*nr_iowait)(void); + unsigned long (*nr_iowait_task_cpu)(const task_t *); + int (*idle_cpu)(int); + void (*init_idle)(task_t *, int); + void (*exit)(task_t *); + void (*fork)(task_t *); + void (*init)(void); + void (*init_smp)(void); + void (*schedule)(void); + void (*tick)(void); + void (*tail)(task_t *); + int (*setscheduler)(pid_t, int, struct sched_param __user *); + void (*set_user_nice)(task_t *, long); + long (*rr_get_interval)(pid_t, struct timespec __user *); + long (*yield)(void); + int (*is_idle_task)(const task_t *); + int (*task_curr)(const task_t *); + int (*task_nice)(const task_t *); + int (*task_prio)(const task_t *); + int (*try_to_wake_up)(task_t *, unsigned, int); + void (*wake_up_new_task)(task_t *, unsigned long); +#ifdef CONFIG_SMP + int (*migration_init)(void); + void (*exec)(void); + int (*set_cpus_allowed)(task_t *, cpumask_t); + void (*wait_task_inactive)(task_t *); + void (*cpu_attach_domain)(struct sched_domain *, int); +#ifdef CONFIG_SCHEDSTATS + void (*show_schedstat_sd)(struct seq_file *, int); +#endif +#endif +#ifdef CONFIG_MAGIC_SYSRQ + void (*normalize_rt_tasks)(void); +#endif +#ifdef CONFIG_KGDB + struct task_struct * (*kgdb_get_idle)(int); +#endif +}; + +/* + * List functions that have common variants that many schedulers use. 
+ */ +extern unsigned int common_task_cpu(const struct task_struct *p); +extern void common_set_task_cpu(struct task_struct *p, unsigned int cpu); + +/* + * All private per-scheduler entries in task_struct are defined here as + * separate structs placed into the cpusched union in task_struct. + */ + +/* Ingosched */ +#ifdef CONFIG_CPUSCHED_INGO +struct cpusched_ingo { + int prio; + struct list_head run_list; + prio_array_t *array; + unsigned int time_slice; + unsigned int first_time_slice; + unsigned long sleep_avg; + unsigned long long timestamp; /* kept 64-bit like the task_struct field it replaces; unsigned long would truncate on 32-bit */ + unsigned long long last_ran; + int activated; +}; +#endif + +/* Staircase scheduler */ +#ifdef CONFIG_CPUSCHED_STAIRCASE +struct cpusched_sc { + int prio; + struct list_head run_list; + unsigned long sflags; + unsigned long long timestamp; + unsigned long runtime, totalrun, ns_debit; + unsigned int burst; + unsigned int slice, time_slice; +}; +#endif + +/* Minisched scheduler */ +#ifdef CONFIG_CPUSCHED_MINISCHED +struct cpusched_ms { + int prio; + struct list_head run_list; + unsigned int time_slice; +}; +#endif + +union cpusched { /* embedded in task_struct as member 'u'; one member per configured scheduler */ +#ifdef CONFIG_CPUSCHED_INGO + struct cpusched_ingo ingosched; +#endif +#ifdef CONFIG_CPUSCHED_STAIRCASE + struct cpusched_sc scsched; +#endif +#ifdef CONFIG_CPUSCHED_MINISCHED + struct cpusched_ms mssched; +#endif +}; + +#endif Index: linux-2.6.10-rc1-mm5/init/Kconfig =================================================================== --- linux-2.6.10-rc1-mm5.orig/init/Kconfig 2004-11-11 21:42:16.000000000 +1100 +++ linux-2.6.10-rc1-mm5/init/Kconfig 2004-11-11 22:08:35.000000000 +1100 @@ -249,6 +249,48 @@ config IKCONFIG_PROC through /proc/config.gz. +config PLUGSCHED + bool "Support for multiple cpu schedulers" + default y + help + Say Y here if you want to compile in support for multiple + cpu schedulers. The cpu scheduler may be selected at boot time + with the boot parameter "cpusched=".
The choice of which cpu + schedulers to compile into the kernel can be made by enabling + "Configure standard kernel features" otherwise all cpu schedulers + supported will be compiled in. + +choice + prompt "Default cpu scheduler" + help + This option allows you to choose which cpu scheduler shall be + booted by default at startup if you have plugsched support, or + it will choose which is the only scheduler compiled in. + +config CPUSCHED_DEFAULT_INGO + bool "Ingosched cpu scheduler" + select CPUSCHED_INGO + ---help--- + This is the default cpu scheduler which is an O(1) dual priority + array scheduler with a hybrid interactive design. + +config CPUSCHED_DEFAULT_STAIRCASE + bool "Staircase cpu scheduler" + select CPUSCHED_STAIRCASE + ---help--- + This scheduler is an O(1) single priority array with a foreground- + background interactive design. + +config CPUSCHED_DEFAULT_MINISCHED + bool "Minisched cpu scheduler" + depends on !SMP + select CPUSCHED_MINISCHED + ---help--- + This scheduler is a low overhead O(1) single priority rr scheduler + for uniprocessor only. + +endchoice + menuconfig EMBEDDED bool "Configure standard kernel features (for small systems)" help @@ -257,6 +299,36 @@ menuconfig EMBEDDED environments which can tolerate a "non-standard" kernel. Only use this if you really know what you are doing. +config CPUSCHED_INGO + bool "Ingosched cpu scheduler" if EMBEDDED + depends on PLUGSCHED + default y + ---help--- + This is the default cpu scheduler which is an O(1) dual priority + array scheduler with a hybrid interactive design. + To boot this cpu scheduler, if it is not the default, use the + bootparam "cpusched=ingosched". + +config CPUSCHED_STAIRCASE + bool "Staircase cpu scheduler" if EMBEDDED + depends on PLUGSCHED + default y + ---help--- + This scheduler is an O(1) single priority array with a foreground- + background interactive design. + To boot this cpu scheduler, if it is not the default, use the + bootparam "cpusched=staircase". 
+ +config CPUSCHED_MINISCHED + bool "Minisched cpu scheduler" if EMBEDDED + depends on PLUGSCHED && !SMP + default y + ---help--- + This scheduler is a low overhead O(1) single priority rr scheduler + for uniprocessor only. + To boot this cpu scheduler, if it is not the default, use the + bootparam "cpusched=minisched". + config KALLSYMS bool "Load all symbols for debugging/kksymoops" if EMBEDDED default y Index: linux-2.6.10-rc1-mm5/init/main.c =================================================================== --- linux-2.6.10-rc1-mm5.orig/init/main.c 2004-11-11 21:42:16.000000000 +1100 +++ linux-2.6.10-rc1-mm5/init/main.c 2004-11-11 21:52:40.000000000 +1100 @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -416,10 +417,11 @@ void __init parse_early_param(void) done = 1; } +struct task_struct base_init_task; + /* * Activate the first processor. */ - asmlinkage void __init start_kernel(void) { char * command_line; @@ -441,6 +443,11 @@ asmlinkage void __init start_kernel(void smp_prepare_boot_cpu(); /* + * Save a copy of the baseline init_task in case we need to start + * another cpu scheduler. + */ + base_init_task = init_task; + /* * Set up the scheduler prior starting any interrupts (such as the * timer interrupt). Full topology setup happens at smp_init() * time - but meanwhile we still have a functioning scheduler. @@ -519,6 +526,7 @@ asmlinkage void __init start_kernel(void acpi_early_init(); /* before LAPIC and SMP init */ + printk("Running with %s cpu scheduler.\n", scheduler->cpusched_name); /* Do the rest non-__init'ed, we're now alive */ rest_init(); } Index: linux-2.6.10-rc1-mm5/kernel/Makefile =================================================================== --- linux-2.6.10-rc1-mm5.orig/kernel/Makefile 2004-11-11 21:42:17.000000000 +1100 +++ linux-2.6.10-rc1-mm5/kernel/Makefile 2004-11-11 22:08:35.000000000 +1100 @@ -2,13 +2,16 @@ # Makefile for the linux kernel. 
# -obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ +obj-y = scheduler.o fork.o exec_domain.o panic.o printk.o profile.o \ exit.o itimer.o time.o softirq.o resource.o \ sysctl.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o intermodule.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o +obj-$(CONFIG_CPUSCHED_INGO) += sched.o +obj-$(CONFIG_CPUSCHED_STAIRCASE) += staircase.o +obj-$(CONFIG_CPUSCHED_MINISCHED) += minisched.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o Index: linux-2.6.10-rc1-mm5/kernel/minisched.c =================================================================== --- linux-2.6.10-rc1-mm5.orig/kernel/minisched.c 2003-03-27 19:01:40.000000000 +1100 +++ linux-2.6.10-rc1-mm5/kernel/minisched.c 2004-11-11 22:08:35.000000000 +1100 @@ -0,0 +1,960 @@ +/* + * kernel/minisched.c + * + * This is "minisched"; a minimalist uniprocessor scheduler. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define MAX_PRIO (MAX_RT_PRIO + 1) + +#define RR_INTERVAL (10 * HZ / 1000 ? : 1) + +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) + +static unsigned int task_timeslice(task_t *p) +{ + return NICE_TO_PRIO(p->static_prio) * RR_INTERVAL; +} + +typedef struct runqueue runqueue_t; + +/* + * This is the runqueue data structure. 
+ */ +struct runqueue { + spinlock_t lock; /* see task_rq_lock() and rq_lock() */ + + unsigned long nr_running; /* raised in __activate_task(), lowered in deactivate_task() */ + unsigned long long nr_switches; /* bumped by ms_schedule() when it switches to a different task */ + unsigned long nr_uninterruptible; /* raised when a TASK_UNINTERRUPTIBLE task is deactivated, lowered when it is woken */ + task_t *curr, *idle; /* curr: task last picked by ms_schedule(); idle: picked when nr_running is 0 */ + struct mm_struct *prev_mm; /* set in context_switch(), dropped in finish_task_switch() */ + unsigned long bitmap[BITS_TO_LONGS(MAX_PRIO+1)]; /* bit n is set while queue[n] holds a task */ + struct list_head queue[MAX_PRIO + 1]; /* per-prio lists: tasks are added at the tail, ms_schedule() takes the head */ + atomic_t nr_iowait; /* reported by ms_nr_iowait() */ +}; + +static DEFINE_PER_CPU(struct runqueue, runqueues); + +static runqueue_t *rq = &per_cpu(runqueues, 0); /* cpu 0's runqueue; the only one this file uses */ + +static int ms_rt_task(const task_t *p) +{ + return (unlikely((p)->u.mssched.prio < MAX_RT_PRIO)); /* prio below MAX_RT_PRIO marks a real time task */ +} + +/* + * Default context-switch locking: + */ +#ifndef prepare_arch_switch +# define prepare_arch_switch(next) do { } while (0) +# define finish_arch_switch(next) spin_unlock_irq(&rq->lock) +# define task_running(p) (rq->curr == (p)) +#endif + +/* + * task_rq_lock - lock the runqueue and disable + * interrupts. + */ +static void task_rq_lock(unsigned long *flags) +{ + local_irq_save(*flags); /* previous irq state is stored in *flags for task_rq_unlock() */ + spin_lock(&rq->lock); +} + +static void task_rq_unlock(unsigned long *flags) +{ + spin_unlock_irqrestore(&rq->lock, *flags); +} + +/* + * rq_lock - lock the runqueue and disable interrupts.
+ */ +static void rq_lock(void) +{ + local_irq_disable(); + spin_lock(&rq->lock); +} + +static int task_queued(task_t *task) +{ + return !list_empty(&task->u.mssched.run_list); /* run_list is linked only while the task sits on a queue */ +} + +/* + * Adding/removing a task to/from a runqueue: + */ +static void dequeue_task(struct task_struct *p) +{ + list_del_init(&p->u.mssched.run_list); + if (list_empty(rq->queue + p->u.mssched.prio)) + __clear_bit(p->u.mssched.prio, rq->bitmap); /* no task left at this prio */ +} + +static void enqueue_task(struct task_struct *p) +{ + list_add_tail(&p->u.mssched.run_list, rq->queue + p->u.mssched.prio); + __set_bit(p->u.mssched.prio, rq->bitmap); +} + +static void requeue_task(struct task_struct *p) +{ + list_move_tail(&p->u.mssched.run_list, rq->queue + p->u.mssched.prio); /* to the back of its own prio queue */ +} + +static void ms_set_oom_timeslice(task_t *p) +{ + p->u.mssched.time_slice = HZ; /* HZ ticks of ms_scheduler_tick() */ +} + +static void __activate_task(task_t *p) +{ + enqueue_task(p); + rq->nr_running++; +} + +static void activate_task(task_t *p) +{ + p->u.mssched.time_slice = task_timeslice(p); /* NOTE(review): task_timeslice() passes static_prio through NICE_TO_PRIO(), but ms_set_user_nice() already stores NICE_TO_PRIO(nice) there - confirm the double offset is intended */ + __activate_task(p); +} + +/* + * deactivate_task - remove a task from the runqueue. + */ +static void deactivate_task(struct task_struct *p) +{ + rq->nr_running--; + if (p->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible++; + dequeue_task(p); +} + +/* + * resched_task - mark a task 'to be rescheduled now'. + * + * On UP this means the setting of the need_resched flag; + */ +static inline void resched_task(task_t *p) +{ + set_tsk_need_resched(p); +} + +/** + * task_curr - is this task currently executing? + * @p: the task in question. + */ +static int ms_task_curr(const task_t *p) +{ + return (rq->curr == p); +} + +/* + * Check whether p should preempt rq->curr: a task of equal prio does so only with + * a larger time_slice, a task with a lower prio value (e.g. real time) always.
+ */ +static void preempt(task_t *p) +{ + if (likely(p->u.mssched.prio == rq->curr->u.mssched.prio)) { + /* This is true for all non rt tasks */ + if (p->u.mssched.time_slice > rq->curr->u.mssched.time_slice) + /* This selects out higher priority normal tasks */ + resched_task(rq->curr); + goto out; + } + if (p->u.mssched.prio > rq->curr->u.mssched.prio) + /* + * This is a lower priority real time task or a normal task + * While a real time task is running. + */ + goto out; + resched_task(rq->curr); +out: + return; +} + +/*** + * try_to_wake_up - wake up a thread + * @p: the to-be-woken-up thread + * @state: the mask of task states that can be woken + * @sync: do a synchronous wakeup? + * + * Put it on the run-queue if it's not already there. The "current" + * thread is always on the run-queue (except when the actual + * re-schedule is in progress), and as such you're allowed to do + * the simpler "current->state = TASK_RUNNING" to mark yourself + * runnable without the overhead of this. + * + * returns failure only if the task is already active. + */ +static int ms_try_to_wake_up(task_t * p, unsigned int state, int sync) +{ + int success = 0; + unsigned long flags; + long old_state; + + task_rq_lock(&flags); + old_state = p->state; + if (!(old_state & state)) + goto out; + + if (task_queued(p)) + goto out_running; + + if (old_state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible--; + + /* + * Sync wakeups (i.e. those types of wakeups where the waker + * has indicated that it will leave the CPU in short order) + * don't trigger a preemption, if the woken up task will run on + * this cpu. (in this case the 'I will reschedule' promise of + * the waker guarantees that the freshly woken up task is going + * to be considered on this CPU.) 
+ */ + activate_task(p); + if (!sync) + preempt(p); + success = 1; + +out_running: + p->state = TASK_RUNNING; +out: + task_rq_unlock(&flags); + + return success; +} + +/* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. + */ +static void ms_sched_fork(task_t *p) +{ + /* + * We mark the process as running here, but have not actually + * inserted it onto the runqueue yet. This guarantees that + * nobody will actually run it, and a signal or other external + * event cannot wake it up and insert it on the runqueue either. + */ + p->state = TASK_RUNNING; + INIT_LIST_HEAD(&p->u.mssched.run_list); + spin_lock_init(&p->switch_lock); +#ifdef CONFIG_SCHEDSTATS + memset(&p->sched_info, 0, sizeof(p->sched_info)); +#endif +#ifdef CONFIG_PREEMPT + /* + * During context-switch we hold precisely one spinlock, which + * schedule_tail drops. (in the common case it's rq->lock, + * but it also can be p->switch_lock.) So we compensate with a count + * of 1. Also, we want to start with kernel preemption disabled. + */ + p->thread_info->preempt_count = 1; +#endif +} + +/* + * wake_up_new_task - wake up a newly created task for the first time. + * + * This function will do some initial scheduler statistics housekeeping + * that must be done for every newly created context, then puts the task + * on the runqueue and wakes it. + */ +static void ms_wake_up_new_task(task_t * p, unsigned long clone_flags) +{ + unsigned long flags; + + task_rq_lock(&flags); + + BUG_ON(p->state != TASK_RUNNING); + + __activate_task(p); + task_rq_unlock(&flags); +} + +static void ms_sched_exit(task_t * p) +{ +} + +/** + * finish_task_switch - clean up after a task-switch + * @prev: the thread we just switched away from. + * + * We enter this with the runqueue still locked, and finish_arch_switch() + * will unlock it along with doing any other architecture-specific cleanup + * actions. + * + * Note that we may have delayed dropping an mm in context_switch(). 
If + * so, we finish that here outside of the runqueue lock. (Doing it + * with the lock held can cause deadlocks; see schedule() for + * details.) + */ +static void finish_task_switch(task_t *prev) +{ + struct mm_struct *mm = rq->prev_mm; + unsigned long prev_task_flags; + + rq->prev_mm = NULL; + + /* + * A task struct has one reference for the use as "current". + * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and + * calls schedule one last time. The schedule call will never return, + * and the scheduled task must drop that reference. + * The test for EXIT_ZOMBIE must occur while the runqueue locks are + * still held, otherwise prev could be scheduled on another cpu, die + * there before we look at prev->state, and then the reference would + * be dropped twice. + * Manfred Spraul + */ + prev_task_flags = prev->flags; + finish_arch_switch(prev); + if (mm) + mmdrop(mm); + if (unlikely(prev_task_flags & PF_DEAD)) + put_task_struct(prev); +} + +/** + * schedule_tail - first thing a freshly forked thread must call. + * @prev: the thread we just switched away from. + */ +static void ms_schedule_tail(task_t *prev) +{ + finish_task_switch(prev); + + if (current->set_child_tid) + put_user(current->pid, current->set_child_tid); +} + +/* + * context_switch - switch to the new MM and the new + * thread's register state. + */ +static inline +task_t * context_switch(task_t *prev, task_t *next) +{ + struct mm_struct *mm = next->mm; + struct mm_struct *oldmm = prev->active_mm; + + if (unlikely(!mm)) { + next->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, next); + } else + switch_mm(oldmm, mm, next); + + if (unlikely(!prev->mm)) { + prev->active_mm = NULL; + WARN_ON(rq->prev_mm); + rq->prev_mm = oldmm; + } + + /* Here we just switch the register state and the stack. 
*/ + switch_to(prev, next, prev); + + return prev; +} + +/* + * nr_running, nr_uninterruptible and nr_context_switches: + * + * externally visible scheduler statistics: current number of runnable + * threads, current number of uninterruptible-sleeping threads, total + * number of context switches performed since bootup. + */ +static unsigned long ms_nr_running(void) +{ + return rq->nr_running; +} + +static unsigned long ms_nr_uninterruptible(void) +{ + return rq->nr_uninterruptible; +} + +static unsigned long long ms_nr_context_switches(void) +{ + return rq->nr_switches; +} + +static unsigned long ms_nr_iowait(void) +{ + return atomic_read(&rq->nr_iowait); +} + +static unsigned long ms_nr_iowait_task_cpu(const task_t *p) +{ + return atomic_read(&rq->nr_iowait); +} + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + */ +static void ms_scheduler_tick(void) +{ + task_t *p = current; + + if (p == rq->idle) + return; + + /* Task might have expired already, but not scheduled off yet */ + if (unlikely(!task_queued(p))) { + set_tsk_need_resched(p); + return; + } + + /* + * SCHED_FIFO tasks never run out of timeslice. + */ + if (unlikely(p->policy == SCHED_FIFO)) + return; + + spin_lock(&rq->lock); + + if (!--p->u.mssched.time_slice) { + p->u.mssched.time_slice = task_timeslice(p); + set_tsk_need_resched(p); + requeue_task(p); + } + spin_unlock(&rq->lock); +} + +/* + * schedule() is the main scheduler function. + */ +static void __sched ms_schedule(void) +{ + long *switch_count; + task_t *prev, *next; + + struct list_head *queue; + int idx; + + /* + * Test if we are atomic. Since do_exit() needs to call into + * schedule() atomically, we ignore that path for now. + * Otherwise, whine if we are scheduling when we should not be. 
+ */ + if (likely(!(current->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)))) { + if (unlikely(in_atomic())) { + printk(KERN_ERR "scheduling while atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), current->pid); + dump_stack(); + } + } + profile_hit(SCHED_PROFILING, __builtin_return_address(0)); + +need_resched: + preempt_disable(); + prev = current; + release_kernel_lock(prev); + +need_resched_nonpreemptible: + spin_lock_irq(&rq->lock); + + if (unlikely(current->flags & PF_DEAD)) + current->state = EXIT_DEAD; + /* + * if entering off of a kernel preemption go straight + * to picking the next task. + */ + switch_count = &prev->nivcsw; + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + switch_count = &prev->nvcsw; + if (unlikely((prev->state & TASK_INTERRUPTIBLE) && + unlikely(signal_pending(prev)))) + prev->state = TASK_RUNNING; + else + deactivate_task(prev); + } + + if (unlikely(!rq->nr_running)) { + next = rq->idle; + goto switch_tasks; + } + + idx = sched_find_first_bit(rq->bitmap); + queue = rq->queue + idx; + next = list_entry(queue->next, task_t, u.mssched.run_list); + +switch_tasks: + prefetch(next); + clear_tsk_need_resched(prev); + rcu_qsctr_inc(task_cpu(prev)); + + if (likely(prev != next)) { + rq->nr_switches++; + rq->curr = next; + ++*switch_count; + + prepare_arch_switch(next); + prev = context_switch(prev, next); + barrier(); + + finish_task_switch(prev); + } else + spin_unlock_irq(&rq->lock); + + prev = current; + if (unlikely(reacquire_kernel_lock(prev) < 0)) + goto need_resched_nonpreemptible; + preempt_enable_no_resched(); + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + goto need_resched; +} + +static void __sched ms_wait_for_completion(struct completion *x) +{ + might_sleep(); + spin_lock_irq(&x->wait.lock); + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + wait.flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(&x->wait, &wait); + do { + __set_current_state(TASK_UNINTERRUPTIBLE); + 
spin_unlock_irq(&x->wait.lock); + schedule(); + spin_lock_irq(&x->wait.lock); + } while (!x->done); + __remove_wait_queue(&x->wait, &wait); + } + x->done--; + spin_unlock_irq(&x->wait.lock); +} + +static void ms_set_user_nice(task_t *p, long nice) +{ + unsigned long flags; + + if (TASK_NICE(p) == nice || nice < -20 || nice > 19) + return; + /* + * We have to be careful, if called from sys_setpriority(), + * the task might be in the middle of scheduling on another CPU. + */ + task_rq_lock(&flags); + /* + * The RT priorities are set via setscheduler(), but we still + * allow the 'normal' nice value to be set - but as expected + * it wont have any effect on scheduling until the task is + * not SCHED_NORMAL: + */ + if (rt_task(p)) { + p->static_prio = NICE_TO_PRIO(nice); + goto out_unlock; + } + + p->static_prio = NICE_TO_PRIO(nice); + +out_unlock: + task_rq_unlock(&flags); +} + +/** + * task_prio - return the priority value of a given task. + * @p: the task in question. + * + * This is the priority value as seen by users in /proc. + * RT tasks are offset by -200. Normal tasks are all 0. + */ +static int ms_task_prio(const task_t *p) +{ + return p->u.mssched.prio - MAX_RT_PRIO; +} + +/** + * task_nice - return the nice value of a given task. + * @p: the task in question. + */ +static int ms_task_nice(const task_t *p) +{ + return TASK_NICE(p); +} + +/** + * idle_cpu - is the cpu idle currently? + */ +static int ms_idle_cpu(int cpu) +{ + return rq->curr == rq->idle; +} + +/* Actually do priority change: must hold rq lock. */ +static void __setscheduler(struct task_struct *p, int policy, int prio) +{ + BUG_ON(task_queued(p)); + p->policy = policy; + p->rt_priority = prio; + if (policy != SCHED_NORMAL) + p->u.mssched.prio = MAX_RT_PRIO - 1 - p->rt_priority; + else + p->u.mssched.prio = MAX_RT_PRIO; +} + +/* + * setscheduler - change the scheduling policy and/or RT priority of a thread. 
+ */ +static int ms_setscheduler(pid_t pid, int policy, struct sched_param __user *param) +{ + struct sched_param lp; + int retval = -EINVAL; + int queued, oldprio, oldpolicy = -1; + unsigned long flags; + task_t *p; + + if (!param || pid < 0) + goto out_nounlock; + + retval = -EFAULT; + if (copy_from_user(&lp, param, sizeof(struct sched_param))) + goto out_nounlock; + + /* + * We play safe to avoid deadlocks. + */ + read_lock_irq(&tasklist_lock); + + p = find_process_by_pid(pid); + + retval = -ESRCH; + if (!p) + goto out_unlock; +recheck: + /* double check policy once rq lock held */ + if (policy < 0) + policy = oldpolicy = p->policy; + else { + retval = -EINVAL; + if (policy != SCHED_FIFO && policy != SCHED_RR && + policy != SCHED_NORMAL) + goto out_unlock; + } + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. + */ + retval = -EINVAL; + if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1) + goto out_unlock; + if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0)) + goto out_unlock; + + retval = -EPERM; + if ((policy == SCHED_FIFO || policy == SCHED_RR) && + !capable(CAP_SYS_NICE)) + goto out_unlock; + if ((current->euid != p->euid) && (current->euid != p->uid) && + !capable(CAP_SYS_NICE)) + goto out_unlock; + + retval = security_task_setscheduler(p, policy, &lp); + if (retval) + goto out_unlock; + /* + * To be able to change p->policy safely, the + * runqueue lock must be held. 
+ */ + task_rq_lock(&flags); + /* recheck policy now with rq lock held */ + if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { + policy = oldpolicy = -1; + task_rq_unlock(&flags); + goto recheck; + } + if ((queued = task_queued(p))) + deactivate_task(p); + retval = 0; + oldprio = p->u.mssched.prio; + __setscheduler(p, policy, lp.sched_priority); + if (queued) { + __activate_task(p); + /* + * Reschedule if we are currently running and + * our priority decreased, or if we are not currently running + * and our priority is higher than the current's + */ + if (task_running(p)) { + if (p->u.mssched.prio > oldprio) + resched_task(rq->curr); + } else + preempt(p); + } + task_rq_unlock(&flags); +out_unlock: + read_unlock_irq(&tasklist_lock); +out_nounlock: + return retval; +} + +/** + * sys_sched_yield - yield the current processor to other threads. + * + * This function yields the current CPU by dropping to end of the runqueue. + */ +static long ms_sys_sched_yield(void) +{ + task_t *p = current; + rq_lock(); + + set_tsk_need_resched(p); + requeue_task(current); + current->u.mssched.time_slice = task_timeslice(current); + + /* + * Since we are going to call schedule() anyway, there's + * no need to preempt or enable interrupts: + */ + _raw_spin_unlock(&rq->lock); + preempt_enable_no_resched(); + + schedule(); + + return 0; +} + +/* + * This task is about to go to sleep on IO. Increment rq->nr_iowait so + * that process accounting knows that this is a task in IO wait state. 
+ * + * But don't do that if it is a deliberate, throttling IO wait (this task + * has set its backing_dev_info: the queue against which it should throttle) + */ +static void __sched ms_io_schedule(void) +{ + atomic_inc(&rq->nr_iowait); + schedule(); + atomic_dec(&rq->nr_iowait); +} + +static long __sched ms_io_schedule_timeout(long timeout) +{ + long ret; + + atomic_inc(&rq->nr_iowait); + ret = schedule_timeout(timeout); + atomic_dec(&rq->nr_iowait); + return ret; +} + +/** + * sys_sched_rr_get_interval - return the default timeslice of a process. + * @pid: pid of the process. + * @interval: userspace pointer to the timeslice value. + * + * this syscall writes the default timeslice value of a given process + * into the user-space timespec buffer. A value of '0' means infinity. + */ +static long +ms_sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) +{ + int retval = -EINVAL; + struct timespec t; + task_t *p; + + if (pid < 0) + goto out_nounlock; + + retval = -ESRCH; + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + jiffies_to_timespec(p->policy & SCHED_FIFO ? + 0 : task_timeslice(p), &t); + read_unlock(&tasklist_lock); + retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; +out_nounlock: + return retval; +out_unlock: + read_unlock(&tasklist_lock); + return retval; +} + +static void __devinit ms_init_idle(task_t *idle, int cpu) +{ + unsigned long flags; + + idle->u.mssched.prio = MAX_RT_PRIO + 1; + idle->state = TASK_RUNNING; + set_task_cpu(idle, cpu); + + spin_lock_irqsave(&rq->lock, flags); + rq->curr = rq->idle = idle; + set_tsk_need_resched(idle); + spin_unlock_irqrestore(&rq->lock, flags); + + /* Set the preempt count _outside_ the spinlocks! 
*/ +#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) + idle->thread_info->preempt_count = (idle->lock_depth >= 0); +#else + idle->thread_info->preempt_count = 0; +#endif +} + +static void __init ms_sched_init_smp(void) +{ +} + +static void __init ms_sched_init(void) +{ + int i; + + init_task.u.mssched.prio = MAX_RT_PRIO; + init_task.static_prio = MAX_RT_PRIO + 20; + INIT_LIST_HEAD(&init_task.u.mssched.run_list); + init_task.u.mssched.time_slice = HZ; + + spin_lock_init(&rq->lock); + + atomic_set(&rq->nr_iowait, 0); + for (i = 0; i <= MAX_PRIO; i++) + INIT_LIST_HEAD(&rq->queue[i]); + memset(rq->bitmap, 0, BITS_TO_LONGS(MAX_PRIO + 1)*sizeof(long)); + /* + * delimiter for bitsearch + */ + __set_bit(MAX_PRIO + 1, rq->bitmap); + + /* + * The boot idle thread does lazy MMU switching as well: + */ + atomic_inc(&init_mm.mm_count); + enter_lazy_tlb(&init_mm, current); + + /* + * Make us the idle thread. Technically, schedule() should not be + * called from this thread, however somewhere below it might be, + * but because we are the idle thread, we just pick up running again + * when this runqueue becomes "idle". 
+ */ + init_idle(current, 0); +} + +static void ms_init_sched_domain_sysctl(void) +{ +} +static void ms_destroy_sched_domain_sysctl(void) +{ +} + +#ifdef CONFIG_MAGIC_SYSRQ +void ms_normalise_rt_tasks(void) +{ + struct task_struct *p; + unsigned long flags; + int queued; + + read_lock_irq(&tasklist_lock); + for_each_process (p) { + if (!rt_task(p)) + continue; + + task_rq_lock(&flags); + + if ((queued = task_queued(p))) + deactivate_task(p); + __setscheduler(p, SCHED_NORMAL, 0); + if (queued) { + __activate_task(p); + resched_task(rq->curr); + } + + task_rq_unlock(&flags); + } + read_unlock_irq(&tasklist_lock); +} +#endif + +#ifdef CONFIG_KGDB +static struct task_struct *ms_kgdb_get_idle(int this_cpu) +{ + return rq->idle; +} +#endif + +struct sched_drv ms_sched_drv = { + .task_cpu = common_task_cpu, + .set_task_cpu = common_set_task_cpu, + .init_sched_domain_sysctl = ms_init_sched_domain_sysctl, + .destroy_sched_domain_sysctl = ms_destroy_sched_domain_sysctl, + .cpusched_name = "minisched", + .rt_task = ms_rt_task, + .wait_for_completion = ms_wait_for_completion, + .io_schedule = ms_io_schedule, + .io_schedule_timeout = ms_io_schedule_timeout, + .set_oom_timeslice = ms_set_oom_timeslice, + .nr_running = ms_nr_running, + .nr_uninterruptible = ms_nr_uninterruptible, + .nr_context_switches = ms_nr_context_switches, + .nr_iowait = ms_nr_iowait, + .nr_iowait_task_cpu = ms_nr_iowait_task_cpu, + .idle_cpu = ms_idle_cpu, + .init_idle = ms_init_idle, + .exit = ms_sched_exit, + .fork = ms_sched_fork, + .init = ms_sched_init, + .init_smp = ms_sched_init_smp, + .schedule = ms_schedule, + .tick = ms_scheduler_tick, + .tail = ms_schedule_tail, + .setscheduler = ms_setscheduler, + .set_user_nice = ms_set_user_nice, + .rr_get_interval = ms_sys_sched_rr_get_interval, + .yield = ms_sys_sched_yield, + .task_curr = ms_task_curr, + .task_nice = ms_task_nice, + .task_prio = ms_task_prio, + .try_to_wake_up = ms_try_to_wake_up, + .wake_up_new_task = ms_wake_up_new_task, +#ifdef 
CONFIG_MAGIC_SYSRQ + .normalize_rt_tasks = ms_normalise_rt_tasks, +#endif +#ifdef CONFIG_KGDB + .kgdb_get_idle = ms_kgdb_get_idle, +#endif +}; Index: linux-2.6.10-rc1-mm5/kernel/sched.c =================================================================== --- linux-2.6.10-rc1-mm5.orig/kernel/sched.c 2004-11-11 21:42:18.000000000 +1100 +++ linux-2.6.10-rc1-mm5/kernel/sched.c 2004-11-11 22:08:34.000000000 +1100 @@ -1,7 +1,7 @@ /* * kernel/sched.c * - * Kernel scheduler and related syscalls + * This is "ingosched"; the default cpu scheduler. * * Copyright (C) 1991-2002 Linus Torvalds * @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -58,6 +59,15 @@ #endif /* + * Priority of a process goes from 0..MAX_PRIO-1, valid RT + * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL tasks are + * in the range MAX_RT_PRIO..MAX_PRIO-1. Priority values + * are inverted: lower p->prio value means higher priority. + */ + +#define MAX_PRIO (MAX_RT_PRIO + 40) + +/* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], * and back. @@ -130,7 +140,7 @@ */ #define CURRENT_BONUS(p) \ - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ + (NS_TO_JIFFIES((p)->u.ingosched.sleep_avg) * MAX_BONUS / \ MAX_SLEEP_AVG) #define GRANULARITY (10 * HZ / 1000 ? : 1) @@ -151,14 +161,14 @@ (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) #define TASK_INTERACTIVE(p) \ - ((p)->prio <= (p)->static_prio - DELTA(p)) + ((p)->u.ingosched.prio <= (p)->static_prio - DELTA(p)) #define INTERACTIVE_SLEEP(p) \ (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) #define TASK_PREEMPTS_CURR(p, rq) \ - ((p)->prio < (rq)->curr->prio) + ((p)->u.ingosched.prio < (rq)->curr->u.ingosched.prio) /* * task_timeslice() scales user-nice values [ -20 ... 0 ... 
19 ] @@ -179,7 +189,7 @@ static unsigned int task_timeslice(task_ else return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); } -#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ +#define task_hot(p, now, sd) ((long long) ((now) - (p)->u.ingosched.last_ran) \ < (long long) (sd)->cache_hot_time) /* @@ -235,45 +245,7 @@ struct runqueue { #endif #ifdef CONFIG_SCHEDSTATS - /* latency stats */ - struct sched_info rq_sched_info; - - /* sys_sched_yield() stats */ - unsigned long yld_exp_empty; - unsigned long yld_act_empty; - unsigned long yld_both_empty; - unsigned long yld_cnt; - - /* schedule() stats */ - unsigned long sched_noswitch; - unsigned long sched_switch; - unsigned long sched_cnt; - unsigned long sched_goidle; - - /* pull_task() stats */ - unsigned long pt_gained[MAX_IDLE_TYPES]; - unsigned long pt_lost[MAX_IDLE_TYPES]; - - /* active_load_balance() stats */ - unsigned long alb_cnt; - unsigned long alb_lost; - unsigned long alb_gained; - unsigned long alb_failed; - - /* try_to_wake_up() stats */ - unsigned long ttwu_cnt; - unsigned long ttwu_attempts; - unsigned long ttwu_moved; - - /* wake_up_new_task() stats */ - unsigned long wunt_cnt; - unsigned long wunt_moved; - - /* sched_migrate_task() stats */ - unsigned long smt_cnt; - - /* sched_balance_exec() stats */ - unsigned long sbe_cnt; + schedstat_pcd_t *sspcd; #endif }; @@ -287,6 +259,11 @@ static DEFINE_PER_CPU(struct runqueue, r #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +static int ingo_rt_task(const task_t *p) +{ + return (unlikely((p)->u.ingosched.prio < MAX_RT_PRIO)); +} + /* * Default context-switch locking: */ @@ -323,105 +300,6 @@ static inline void task_rq_unlock(runque spin_unlock_irqrestore(&rq->lock, *flags); } -#ifdef CONFIG_SCHEDSTATS -/* - * bump this up when changing the output format or the meaning of an existing - * format, so that tools can adapt (or abort) - */ -#define SCHEDSTAT_VERSION 10 - -static int show_schedstat(struct seq_file 
*seq, void *v) -{ - int cpu; - enum idle_type itype; - - seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); - seq_printf(seq, "timestamp %lu\n", jiffies); - for_each_online_cpu(cpu) { - runqueue_t *rq = cpu_rq(cpu); -#ifdef CONFIG_SMP - struct sched_domain *sd; - int dcnt = 0; -#endif - - /* runqueue-specific stats */ - seq_printf(seq, - "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu " - "%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", - cpu, rq->yld_both_empty, - rq->yld_act_empty, rq->yld_exp_empty, - rq->yld_cnt, rq->sched_noswitch, - rq->sched_switch, rq->sched_cnt, rq->sched_goidle, - rq->alb_cnt, rq->alb_gained, rq->alb_lost, - rq->alb_failed, - rq->ttwu_cnt, rq->ttwu_moved, rq->ttwu_attempts, - rq->wunt_cnt, rq->wunt_moved, - rq->smt_cnt, rq->sbe_cnt, rq->rq_sched_info.cpu_time, - rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); - - for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; itype++) - seq_printf(seq, " %lu %lu", rq->pt_gained[itype], - rq->pt_lost[itype]); - seq_printf(seq, "\n"); - -#ifdef CONFIG_SMP - /* domain-specific stats */ - for_each_domain(cpu, sd) { - char mask_str[NR_CPUS]; - - cpumask_scnprintf(mask_str, NR_CPUS, sd->span); - seq_printf(seq, "domain%d %s", dcnt++, mask_str); - for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; - itype++) { - seq_printf(seq, " %lu %lu %lu %lu %lu", - sd->lb_cnt[itype], - sd->lb_failed[itype], - sd->lb_imbalance[itype], - sd->lb_nobusyq[itype], - sd->lb_nobusyg[itype]); - } - seq_printf(seq, " %lu %lu %lu %lu\n", - sd->sbe_pushed, sd->sbe_attempts, - sd->ttwu_wake_affine, sd->ttwu_wake_balance); - } -#endif - } - return 0; -} - -static int schedstat_open(struct inode *inode, struct file *file) -{ - unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); - char *buf = kmalloc(size, GFP_KERNEL); - struct seq_file *m; - int res; - - if (!buf) - return -ENOMEM; - res = single_open(file, show_schedstat, NULL); - if (!res) { - m = file->private_data; - m->buf = buf; - m->size = size; - } else - 
kfree(buf); - return res; -} - -struct file_operations proc_schedstat_operations = { - .open = schedstat_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -# define schedstat_inc(rq, field) rq->field++; -# define schedstat_add(rq, field, amt) rq->field += amt; -#else /* !CONFIG_SCHEDSTATS */ -# define schedstat_inc(rq, field) do { } while (0); -# define schedstat_add(rq, field, amt) do { } while (0); -#endif - /* * rq_lock - lock a given runqueue and disable interrupts. */ @@ -459,130 +337,24 @@ static int cpu_and_siblings_are_idle(int #define cpu_and_siblings_are_idle(A) idle_cpu(A) #endif -#ifdef CONFIG_SCHEDSTATS -/* - * Called when a process is dequeued from the active array and given - * the cpu. We should note that with the exception of interactive - * tasks, the expired queue will become the active queue after the active - * queue is empty, without explicitly dequeuing and requeuing tasks in the - * expired queue. (Interactive tasks may be requeued directly to the - * active queue, thus delaying tasks in the expired queue from running; - * see scheduler_tick()). - * - * This function is only called from sched_info_arrive(), rather than - * dequeue_task(). Even though a task may be queued and dequeued multiple - * times as it is shuffled about, we're really interested in knowing how - * long it was from the *first* time it was queued to the time that it - * finally hit a cpu. - */ -static inline void sched_info_dequeued(task_t *t) -{ - t->sched_info.last_queued = 0; -} - -/* - * Called when a task finally hits the cpu. We can now calculate how - * long it was waiting to run. We also note when it began so that we - * can keep stats on how long its timeslice is. 
- */ -static inline void sched_info_arrive(task_t *t) -{ - unsigned long now = jiffies, diff = 0; - struct runqueue *rq = task_rq(t); - - if (t->sched_info.last_queued) - diff = now - t->sched_info.last_queued; - sched_info_dequeued(t); - t->sched_info.run_delay += diff; - t->sched_info.last_arrival = now; - t->sched_info.pcnt++; - - if (!rq) - return; - - rq->rq_sched_info.run_delay += diff; - rq->rq_sched_info.pcnt++; -} - -/* - * Called when a process is queued into either the active or expired - * array. The time is noted and later used to determine how long we - * had to wait for us to reach the cpu. Since the expired queue will - * become the active queue after active queue is empty, without dequeuing - * and requeuing any tasks, we are interested in queuing to either. It - * is unusual but not impossible for tasks to be dequeued and immediately - * requeued in the same or another array: this can happen in sched_yield(), - * set_user_nice(), and even load_balance() as it moves tasks from runqueue - * to runqueue. - * - * This function is only called from enqueue_task(), but also only updates - * the timestamp if it is already not set. It's assumed that - * sched_info_dequeued() will clear that stamp when appropriate. - */ -static inline void sched_info_queued(task_t *t) -{ - if (!t->sched_info.last_queued) - t->sched_info.last_queued = jiffies; -} - -/* - * Called when a process ceases being the active-running process, either - * voluntarily or involuntarily. Now we can calculate how long we ran. - */ -static inline void sched_info_depart(task_t *t) -{ - struct runqueue *rq = task_rq(t); - unsigned long diff = jiffies - t->sched_info.last_arrival; - - t->sched_info.cpu_time += diff; - - if (rq) - rq->rq_sched_info.cpu_time += diff; -} - -/* - * Called when tasks are switched involuntarily due, typically, to expiring - * their time slice. (This may also be called when switching to or from - * the idle task.) We are only called when prev != next. 
- */ -static inline void sched_info_switch(task_t *prev, task_t *next) -{ - struct runqueue *rq = task_rq(prev); - - /* - * prev now departs the cpu. It's not interesting to record - * stats about how efficient we were at scheduling the idle - * process, however. - */ - if (prev != rq->idle) - sched_info_depart(prev); - - if (next != rq->idle) - sched_info_arrive(next); -} -#else -#define sched_info_queued(t) do { } while (0) -#define sched_info_switch(t, next) do { } while (0) -#endif /* CONFIG_SCHEDSTATS */ - /* * Adding/removing a task to/from a priority array: */ static void dequeue_task(struct task_struct *p, prio_array_t *array) { array->nr_active--; - list_del(&p->run_list); - if (list_empty(array->queue + p->prio)) - __clear_bit(p->prio, array->bitmap); + list_del(&p->u.ingosched.run_list); + if (list_empty(array->queue + p->u.ingosched.prio)) + __clear_bit(p->u.ingosched.prio, array->bitmap); } static void enqueue_task(struct task_struct *p, prio_array_t *array) { sched_info_queued(p); - list_add_tail(&p->run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); + list_add_tail(&p->u.ingosched.run_list, array->queue + p->u.ingosched.prio); + __set_bit(p->u.ingosched.prio, array->bitmap); array->nr_active++; - p->array = array; + p->u.ingosched.array = array; } /* @@ -591,7 +363,7 @@ static void enqueue_task(struct task_str */ static void requeue_task(struct task_struct *p, prio_array_t *array) { - list_move_tail(&p->run_list, array->queue + p->prio); + list_move_tail(&p->u.ingosched.run_list, array->queue + p->u.ingosched.prio); } /* @@ -601,10 +373,15 @@ static void requeue_task(struct task_str */ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) { - list_add(&p->run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); + list_add(&p->u.ingosched.run_list, array->queue + p->u.ingosched.prio); + __set_bit(p->u.ingosched.prio, array->bitmap); array->nr_active++; - p->array = array; + 
p->u.ingosched.array = array; +} + +static void ingo_set_oom_timeslice(task_t *p) +{ + p->u.ingosched.time_slice = HZ; } /* @@ -626,7 +403,7 @@ static int effective_prio(task_t *p) int bonus, prio; if (rt_task(p)) - return p->prio; + return p->u.ingosched.prio; bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; @@ -658,7 +435,7 @@ static inline void __activate_idle_task( static void recalc_task_prio(task_t *p, unsigned long long now) { - unsigned long long __sleep_time = now - p->timestamp; + unsigned long long __sleep_time = now - p->u.ingosched.timestamp; unsigned long sleep_time; if (__sleep_time > NS_MAX_SLEEP_AVG) @@ -673,9 +450,9 @@ static void recalc_task_prio(task_t *p, * prevent them suddenly becoming cpu hogs and starving * other processes. */ - if (p->mm && p->activated != -1 && + if (p->mm && p->u.ingosched.activated != -1 && sleep_time > INTERACTIVE_SLEEP(p)) { - p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - + p->u.ingosched.sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - DEF_TIMESLICE); } else { /* @@ -689,12 +466,12 @@ static void recalc_task_prio(task_t *p, * limited in their sleep_avg rise as they * are likely to be waiting on I/O */ - if (p->activated == -1 && p->mm) { - if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) + if (p->u.ingosched.activated == -1 && p->mm) { + if (p->u.ingosched.sleep_avg >= INTERACTIVE_SLEEP(p)) sleep_time = 0; - else if (p->sleep_avg + sleep_time >= + else if (p->u.ingosched.sleep_avg + sleep_time >= INTERACTIVE_SLEEP(p)) { - p->sleep_avg = INTERACTIVE_SLEEP(p); + p->u.ingosched.sleep_avg = INTERACTIVE_SLEEP(p); sleep_time = 0; } } @@ -707,14 +484,14 @@ static void recalc_task_prio(task_t *p, * task spends sleeping, the higher the average gets - * and the higher the priority boost gets as well. 
*/ - p->sleep_avg += sleep_time; + p->u.ingosched.sleep_avg += sleep_time; - if (p->sleep_avg > NS_MAX_SLEEP_AVG) - p->sleep_avg = NS_MAX_SLEEP_AVG; + if (p->u.ingosched.sleep_avg > NS_MAX_SLEEP_AVG) + p->u.ingosched.sleep_avg = NS_MAX_SLEEP_AVG; } } - p->prio = effective_prio(p); + p->u.ingosched.prio = effective_prio(p); } /* @@ -743,7 +520,7 @@ static void activate_task(task_t *p, run * This checks to make sure it's not an uninterruptible task * that is now waking up. */ - if (!p->activated) { + if (!p->u.ingosched.activated) { /* * Tasks which were woken up by interrupts (ie. hw events) * are most likely of interactive nature. So we give them @@ -752,16 +529,16 @@ static void activate_task(task_t *p, run * on a CPU, first time around: */ if (in_interrupt()) - p->activated = 2; + p->u.ingosched.activated = 2; else { /* * Normal first-time wakeups get a credit too for * on-runqueue time, but it will be weighted down: */ - p->activated = 1; + p->u.ingosched.activated = 1; } } - p->timestamp = now; + p->u.ingosched.timestamp = now; __activate_task(p, rq); } @@ -774,8 +551,8 @@ static void deactivate_task(struct task_ rq->nr_running--; if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; - dequeue_task(p, p->array); - p->array = NULL; + dequeue_task(p, p->u.ingosched.array); + p->u.ingosched.array = NULL; } /* @@ -811,7 +588,7 @@ static inline void resched_task(task_t * * task_curr - is this task currently executing on a CPU? * @p: the task in question. */ -inline int task_curr(const task_t *p) +static int ingo_task_curr(const task_t *p) { return cpu_curr(task_cpu(p)) == p; } @@ -848,7 +625,7 @@ static int migrate_task(task_t *p, int d * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. 
*/ - if (!p->array && !task_running(rq, p)) { + if (!p->u.ingosched.array && !task_running(rq, p)) { set_task_cpu(p, dest_cpu); return 0; } @@ -870,7 +647,7 @@ static int migrate_task(task_t *p, int d * smp_call_function() if an IPI is sent by the same process we are * waiting to become inactive. */ -void wait_task_inactive(task_t * p) +static void ingo_wait_task_inactive(task_t * p) { unsigned long flags; runqueue_t *rq; @@ -879,7 +656,7 @@ void wait_task_inactive(task_t * p) repeat: rq = task_rq_lock(p, &flags); /* Must be off runqueue entirely, not preempted. */ - if (unlikely(p->array)) { + if (unlikely(p->u.ingosched.array)) { /* If it's preempted, we yield. It could be a while. */ preempted = !task_running(rq, p); task_rq_unlock(rq, &flags); @@ -891,24 +668,6 @@ repeat: task_rq_unlock(rq, &flags); } -/*** - * kick_process - kick a running thread to enter/exit the kernel - * @p: the to-be-kicked thread - * - * Cause a process which is running on another CPU to enter - * kernel-mode, without any delay. (to get signals handled.) - */ -void kick_process(task_t *p) -{ - int cpu; - - preempt_disable(); - cpu = task_cpu(p); - if ((cpu != smp_processor_id()) && task_curr(p)) - smp_send_reschedule(cpu); - preempt_enable(); -} - /* * Return a low guess at the load of a migration-source cpu. * @@ -988,7 +747,7 @@ static inline int wake_idle(int cpu, tas * * returns failure only if the task is already active. 
*/ -static int try_to_wake_up(task_t * p, unsigned int state, int sync) +static int ingo_try_to_wake_up(task_t * p, unsigned int state, int sync) { int cpu, this_cpu, success = 0; unsigned long flags; @@ -1001,12 +760,12 @@ static int try_to_wake_up(task_t * p, un #endif old_rq = rq = task_rq_lock(p, &flags); - schedstat_inc(rq, ttwu_cnt); + schedstat_inc(rq->sspcd, ttwu_cnt); old_state = p->state; if (!(old_state & state)) goto out; - if (p->array) + if (p->u.ingosched.array) goto out_running; cpu = task_cpu(p); @@ -1074,10 +833,10 @@ static int try_to_wake_up(task_t * p, un new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ out_set_cpu: - schedstat_inc(rq, ttwu_attempts); + schedstat_inc(rq->sspcd, ttwu_attempts); new_cpu = wake_idle(new_cpu, p); if (new_cpu != cpu) { - schedstat_inc(rq, ttwu_moved); + schedstat_inc(rq->sspcd, ttwu_moved); set_task_cpu(p, new_cpu); task_rq_unlock(rq, &flags); /* might preempt at this point */ @@ -1085,7 +844,7 @@ out_set_cpu: old_state = p->state; if (!(old_state & state)) goto out; - if (p->array) + if (p->u.ingosched.array) goto out_running; this_cpu = smp_processor_id(); @@ -1100,7 +859,7 @@ out_activate: * Tasks on involuntary sleep don't earn * sleep_avg beyond just interactive state. */ - p->activated = -1; + p->u.ingosched.activated = -1; } /* @@ -1126,19 +885,6 @@ out: return success; } -int fastcall wake_up_process(task_t * p) -{ - return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | - TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); -} - -EXPORT_SYMBOL(wake_up_process); - -int fastcall wake_up_state(task_t *p, unsigned int state) -{ - return try_to_wake_up(p, state, 0); -} - #ifdef CONFIG_SMP static int find_idlest_cpu(struct task_struct *p, int this_cpu, struct sched_domain *sd); @@ -1148,7 +894,7 @@ static int find_idlest_cpu(struct task_s * Perform scheduler related setup for a newly forked process p. * p is forked by current. 
*/ -void fastcall sched_fork(task_t *p) +static void ingo_sched_fork(task_t *p) { /* * We mark the process as running here, but have not actually @@ -1157,8 +903,8 @@ void fastcall sched_fork(task_t *p) * event cannot wake it up and insert it on the runqueue either. */ p->state = TASK_RUNNING; - INIT_LIST_HEAD(&p->run_list); - p->array = NULL; + INIT_LIST_HEAD(&p->u.ingosched.run_list); + p->u.ingosched.array = NULL; spin_lock_init(&p->switch_lock); #ifdef CONFIG_SCHEDSTATS memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -1178,21 +924,21 @@ void fastcall sched_fork(task_t *p) * resulting in more scheduling fairness. */ local_irq_disable(); - p->time_slice = (current->time_slice + 1) >> 1; + p->u.ingosched.time_slice = (current->u.ingosched.time_slice + 1) >> 1; /* * The remainder of the first timeslice might be recovered by * the parent if the child exits early enough. */ - p->first_time_slice = 1; - current->time_slice >>= 1; - p->timestamp = sched_clock(); - if (unlikely(!current->time_slice)) { + p->u.ingosched.first_time_slice = 1; + current->u.ingosched.time_slice >>= 1; + p->u.ingosched.timestamp = sched_clock(); + if (unlikely(!current->u.ingosched.time_slice)) { /* * This case is rare, it happens when the parent has only * a single jiffy left from its timeslice. Taking the * runqueue lock is not a problem. */ - current->time_slice = 1; + current->u.ingosched.time_slice = 1; preempt_disable(); scheduler_tick(); local_irq_enable(); @@ -1208,7 +954,7 @@ void fastcall sched_fork(task_t *p) * that must be done for every newly created context, then puts the task * on the runqueue and wakes it. 
*/ -void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) +static void ingo_wake_up_new_task(task_t * p, unsigned long clone_flags) { unsigned long flags; int this_cpu, cpu; @@ -1220,17 +966,17 @@ void fastcall wake_up_new_task(task_t * BUG_ON(p->state != TASK_RUNNING); - schedstat_inc(rq, wunt_cnt); + schedstat_inc(rq->sspcd, wunt_cnt); /* * We decrease the sleep average of forking parents * and children as well, to keep max-interactive tasks * from forking tasks that are max-interactive. The parent * (current) is done further down, under its lock. */ - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * + p->u.ingosched.sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - p->prio = effective_prio(p); + p->u.ingosched.prio = effective_prio(p); if (likely(cpu == this_cpu)) { if (!(clone_flags & CLONE_VM)) { @@ -1239,13 +985,13 @@ void fastcall wake_up_new_task(task_t * * do child-runs-first in anticipation of an exec. This * usually avoids a lot of COW overhead. */ - if (unlikely(!current->array)) + if (unlikely(!current->u.ingosched.array)) __activate_task(p, rq); else { - p->prio = current->prio; - list_add_tail(&p->run_list, &current->run_list); - p->array = current->array; - p->array->nr_active++; + p->u.ingosched.prio = current->u.ingosched.prio; + list_add_tail(&p->u.ingosched.run_list, &current->u.ingosched.run_list); + p->u.ingosched.array = current->u.ingosched.array; + p->u.ingosched.array->nr_active++; rq->nr_running++; } set_need_resched(); @@ -1266,21 +1012,21 @@ void fastcall wake_up_new_task(task_t * * Not the local CPU - must adjust timestamp. This should * get optimised away in the !CONFIG_SMP case.
*/ - p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) + p->u.ingosched.timestamp = (p->u.ingosched.timestamp - this_rq->timestamp_last_tick) + rq->timestamp_last_tick; __activate_task(p, rq); if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); - schedstat_inc(rq, wunt_moved); + schedstat_inc(rq->sspcd, wunt_moved); /* * Parent and child are on different CPUs, now get the - * parent runqueue to update the parent's ->sleep_avg: + * parent runqueue to update the parent's ->u.ingosched.sleep_avg: */ task_rq_unlock(rq, &flags); this_rq = task_rq_lock(current, &flags); } - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * + current->u.ingosched.sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); task_rq_unlock(this_rq, &flags); } @@ -1294,7 +1040,7 @@ void fastcall wake_up_new_task(task_t * * artificially, because any timeslice recovered here * was given away by the parent in the first place.) */ -void fastcall sched_exit(task_t * p) +static void ingo_sched_exit(task_t * p) { unsigned long flags; runqueue_t *rq; @@ -1304,14 +1050,14 @@ void fastcall sched_exit(task_t * p) * the sleep_avg of the parent as well. 
*/ rq = task_rq_lock(p->parent, &flags); - if (p->first_time_slice) { - p->parent->time_slice += p->time_slice; - if (unlikely(p->parent->time_slice > task_timeslice(p))) - p->parent->time_slice = task_timeslice(p); - } - if (p->sleep_avg < p->parent->sleep_avg) - p->parent->sleep_avg = p->parent->sleep_avg / - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / + if (p->u.ingosched.first_time_slice) { + p->parent->u.ingosched.time_slice += p->u.ingosched.time_slice; + if (unlikely(p->parent->u.ingosched.time_slice > task_timeslice(p))) + p->parent->u.ingosched.time_slice = task_timeslice(p); + } + if (p->u.ingosched.sleep_avg < p->parent->u.ingosched.sleep_avg) + p->parent->u.ingosched.sleep_avg = p->parent->u.ingosched.sleep_avg / + (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->u.ingosched.sleep_avg / (EXIT_WEIGHT + 1); task_rq_unlock(rq, &flags); } @@ -1361,7 +1107,7 @@ static void finish_task_switch(task_t *p * schedule_tail - first thing a freshly forked thread must call. * @prev: the thread we just switched away from. */ -asmlinkage void schedule_tail(task_t *prev) +static void ingo_schedule_tail(task_t *prev) __releases(rq->lock) { finish_task_switch(prev); @@ -1406,7 +1152,7 @@ task_t * context_switch(runqueue_t *rq, * threads, current number of uninterruptible-sleeping threads, total * number of context switches performed since bootup. 
*/ -unsigned long nr_running(void) +static unsigned long ingo_nr_running(void) { unsigned long i, sum = 0; @@ -1416,7 +1162,7 @@ unsigned long nr_running(void) return sum; } -unsigned long nr_uninterruptible(void) +static unsigned long ingo_nr_uninterruptible(void) { unsigned long i, sum = 0; @@ -1426,7 +1172,7 @@ unsigned long nr_uninterruptible(void) return sum; } -unsigned long long nr_context_switches(void) +static unsigned long long ingo_nr_context_switches(void) { unsigned long long i, sum = 0; @@ -1436,7 +1182,7 @@ unsigned long long nr_context_switches(v return sum; } -unsigned long nr_iowait(void) +static unsigned long ingo_nr_iowait(void) { unsigned long i, sum = 0; @@ -1446,6 +1192,11 @@ unsigned long nr_iowait(void) return sum; } +static unsigned long ingo_nr_iowait_task_cpu(const task_t *p) +{ + return atomic_read(&task_rq(p)->nr_iowait); +} + #ifdef CONFIG_SMP /* @@ -1569,7 +1320,7 @@ static void sched_migrate_task(task_t *p || unlikely(cpu_is_offline(dest_cpu))) goto out; - schedstat_inc(rq, smt_cnt); + schedstat_inc(rq->sspcd, smt_cnt); /* force the process onto the specified CPU */ if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread (might exit: take ref). */ @@ -1592,12 +1343,12 @@ out: * execve() is a valuable balancing opportunity, because at this point * the task has the smallest effective memory and cache footprint. 
*/ -void sched_exec(void) +static void ingo_sched_exec(void) { struct sched_domain *tmp, *sd = NULL; int new_cpu, this_cpu = get_cpu(); - schedstat_inc(this_rq(), sbe_cnt); + schedstat_inc(this_rq()->sspcd, sbe_cnt); /* Prefer the current CPU if there's only this task running */ if (this_rq()->nr_running <= 1) goto out; @@ -1633,7 +1384,7 @@ void pull_task(runqueue_t *src_rq, prio_ set_task_cpu(p, this_cpu); this_rq->nr_running++; enqueue_task(p, this_array); - p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + p->u.ingosched.timestamp = (p->u.ingosched.timestamp - src_rq->timestamp_last_tick) + this_rq->timestamp_last_tick; /* * Note that idle threads have a prio of MAX_PRIO, for this test @@ -1729,7 +1480,7 @@ skip_bitmap: head = array->queue + idx; curr = head->prev; skip_queue: - tmp = list_entry(curr, task_t, run_list); + tmp = list_entry(curr, task_t, u.ingosched.run_list); curr = curr->prev; @@ -1745,8 +1496,8 @@ skip_queue: * so we can safely collect pull_task() stats here rather than * inside pull_task(). */ - schedstat_inc(this_rq, pt_gained[idle]); - schedstat_inc(busiest, pt_lost[idle]); + schedstat_inc(this_rq->sspcd, pt_gained[idle]); + schedstat_inc(busiest->sspcd, pt_lost[idle]); pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); pulled++; @@ -2103,7 +1854,7 @@ static void active_load_balance(runqueue cpumask_t visited_cpus; int cpu; - schedstat_inc(busiest_rq, alb_cnt); + schedstat_inc(busiest_rq->sspcd, alb_cnt); /* * Search for suitable CPUs to push tasks to in successively higher * domains with SD_LOAD_BALANCE set. 
@@ -2138,10 +1889,10 @@ static void active_load_balance(runqueue double_lock_balance(busiest_rq, target_rq); if (move_tasks(target_rq, cpu, busiest_rq, 1, sd, SCHED_IDLE)) { - schedstat_inc(busiest_rq, alb_lost); - schedstat_inc(target_rq, alb_gained); + schedstat_inc(busiest_rq->sspcd, alb_lost); + schedstat_inc(target_rq->sspcd, alb_gained); } else { - schedstat_inc(busiest_rq, alb_failed); + schedstat_inc(busiest_rq->sspcd, alb_failed); } spin_unlock(&target_rq->lock); } @@ -2235,10 +1986,6 @@ static inline int wake_priority_sleeper( return ret; } -DEFINE_PER_CPU(struct kernel_stat, kstat); - -EXPORT_PER_CPU_SYMBOL(kstat); - /* * We place interactive tasks back into the active array, if possible. * @@ -2256,155 +2003,13 @@ EXPORT_PER_CPU_SYMBOL(kstat); ((rq)->curr->static_prio > (rq)->best_expired_prio)) /* - * Do the virtual cpu time signal calculations. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in user space since the last update - */ -static inline void account_it_virt(struct task_struct * p, cputime_t cputime) -{ - cputime_t it_virt = p->it_virt_value; - - if (cputime_gt(it_virt, cputime_zero) && - cputime_gt(cputime, cputime_zero)) { - if (cputime_ge(cputime, it_virt)) { - it_virt = cputime_add(it_virt, p->it_virt_incr); - send_sig(SIGVTALRM, p, 1); - } - it_virt = cputime_sub(it_virt, cputime); - p->it_virt_value = it_virt; - } -} - -/* - * Do the virtual profiling signal calculations. 
- * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in user and kernel space since the last update - */ -static void account_it_prof(struct task_struct *p, cputime_t cputime) -{ - cputime_t it_prof = p->it_prof_value; - - if (cputime_gt(it_prof, cputime_zero) && - cputime_gt(cputime, cputime_zero)) { - if (cputime_ge(cputime, it_prof)) { - it_prof = cputime_add(it_prof, p->it_prof_incr); - send_sig(SIGPROF, p, 1); - } - it_prof = cputime_sub(it_prof, cputime); - p->it_prof_value = it_prof; - } -} - -/* - * Check if the process went over its cputime resource limit after - * some cpu time got added to utime/stime. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in user and kernel space since the last update - */ -static void check_rlimit(struct task_struct *p, cputime_t cputime) -{ - cputime_t total, tmp; - - total = cputime_add(p->utime, p->stime); - tmp = jiffies_to_cputime(p->signal->rlim[RLIMIT_CPU].rlim_cur); - if (unlikely(cputime_gt(total, tmp))) { - /* Send SIGXCPU every second. */ - tmp = cputime_sub(total, cputime); - if (cputime_to_secs(tmp) < cputime_to_secs(total)) - send_sig(SIGXCPU, p, 1); - /* and SIGKILL when we go over max.. */ - tmp = jiffies_to_cputime(p->signal->rlim[RLIMIT_CPU].rlim_max); - if (cputime_gt(total, tmp)) - send_sig(SIGKILL, p, 1); - } -} - -/* - * Account user cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @hardirq_offset: the offset to subtract from hardirq_count() - * @cputime: the cpu time spent in user space since the last update - */ -void account_user_time(struct task_struct *p, cputime_t cputime) -{ - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t tmp; - - p->utime = cputime_add(p->utime, cputime); - - /* Check for signals (SIGVTALRM, SIGPROF, SIGXCPU & SIGKILL). 
*/ - if (likely(p->signal)) - check_rlimit(p, cputime); - account_it_virt(p, cputime); - account_it_prof(p, cputime); - - /* Add user time to cpustat. */ - tmp = cputime_to_cputime64(cputime); - if (TASK_NICE(p) > 0) - cpustat->nice = cputime64_add(cpustat->nice, tmp); - else - cpustat->user = cputime64_add(cpustat->user, tmp); -} - -/* - * Account system cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @hardirq_offset: the offset to subtract from hardirq_count() - * @cputime: the cpu time spent in kernel space since the last update - */ -void account_system_time(struct task_struct *p, int hardirq_offset, - cputime_t cputime) -{ - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - runqueue_t *rq = this_rq(); - cputime64_t tmp; - - p->stime = cputime_add(p->stime, cputime); - - /* Check for signals (SIGPROF, SIGXCPU & SIGKILL). */ - if (likely(p->signal)) - check_rlimit(p, cputime); - account_it_prof(p, cputime); - - /* Add system time to cpustat. */ - tmp = cputime_to_cputime64(cputime); - if (hardirq_count() - hardirq_offset) - cpustat->irq = cputime64_add(cpustat->irq, tmp); - else if (softirq_count()) - cpustat->softirq = cputime64_add(cpustat->softirq, tmp); - else if (p != rq->idle) - cpustat->system = cputime64_add(cpustat->system, tmp); - else if (atomic_read(&rq->nr_iowait) > 0) - cpustat->iowait = cputime64_add(cpustat->iowait, tmp); - else - cpustat->idle = cputime64_add(cpustat->idle, tmp); -} - -/* - * Account for involuntary wait time. 
- * @p: the process from which the cpu time has been stolen - * @steal: the cpu time spent in involuntary wait - */ -void account_steal_time(struct task_struct *p, cputime_t steal) -{ - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t steal64 = cputime_to_cputime64(steal); - runqueue_t *rq = this_rq(); - - if (p == rq->idle) - cpustat->system = cputime64_add(cpustat->system, steal64); - else - cpustat->steal = cputime64_add(cpustat->steal, steal64); -} - -/* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. * * It also gets called by the fork code, when changing the parent's * timeslices. */ -void scheduler_tick(void) +static void ingo_scheduler_tick(void) { int cpu = smp_processor_id(); runqueue_t *rq = this_rq(); @@ -2420,7 +2025,7 @@ void scheduler_tick(void) } /* Task might have expired already, but not scheduled off yet */ - if (p->array != rq->active) { + if (p->u.ingosched.array != rq->active) { set_tsk_need_resched(p); goto out; } @@ -2437,9 +2042,9 @@ void scheduler_tick(void) * RR tasks need a special form of timeslice management. * FIFO tasks have no timeslices. 
*/ - if ((p->policy == SCHED_RR) && !--p->time_slice) { - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; + if ((p->policy == SCHED_RR) && !--p->u.ingosched.time_slice) { + p->u.ingosched.time_slice = task_timeslice(p); + p->u.ingosched.first_time_slice = 0; set_tsk_need_resched(p); /* put it at the end of the queue: */ @@ -2447,12 +2052,12 @@ void scheduler_tick(void) } goto out_unlock; } - if (!--p->time_slice) { + if (!--p->u.ingosched.time_slice) { dequeue_task(p, rq->active); set_tsk_need_resched(p); - p->prio = effective_prio(p); - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; + p->u.ingosched.prio = effective_prio(p); + p->u.ingosched.time_slice = task_timeslice(p); + p->u.ingosched.first_time_slice = 0; if (!rq->expired_timestamp) rq->expired_timestamp = jiffies; @@ -2480,9 +2085,9 @@ void scheduler_tick(void) * delta range with at least TIMESLICE_GRANULARITY to requeue. */ if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - - p->time_slice) % TIMESLICE_GRANULARITY(p)) && - (p->time_slice >= TIMESLICE_GRANULARITY(p)) && - (p->array == rq->active)) { + p->u.ingosched.time_slice) % TIMESLICE_GRANULARITY(p)) && + (p->u.ingosched.time_slice >= TIMESLICE_GRANULARITY(p)) && + (p->u.ingosched.array == rq->active)) { requeue_task(p, rq->active); set_tsk_need_resched(p); @@ -2573,7 +2178,7 @@ static inline int dependent_sleeper(int BUG_ON(!array->nr_active); p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, - task_t, run_list); + task_t, u.ingosched.run_list); for_each_cpu_mask(i, sibling_map) { runqueue_t *smt_rq = cpu_rq(i); @@ -2587,7 +2192,7 @@ static inline int dependent_sleeper(int * task from using an unfair proportion of the * physical cpu's resources. 
-ck */ - if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > + if (((smt_curr->u.ingosched.time_slice * (100 - sd->per_cpu_gain) / 100) > task_timeslice(p) || rt_task(smt_curr)) && p->mm && smt_curr->mm && !rt_task(p)) ret = 1; @@ -2597,7 +2202,7 @@ static inline int dependent_sleeper(int * or wake it up if it has been put to sleep for priority * reasons. */ - if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > + if ((((p->u.ingosched.time_slice * (100 - sd->per_cpu_gain) / 100) > task_timeslice(smt_curr) || rt_task(p)) && smt_curr->mm && p->mm && !rt_task(smt_curr)) || (smt_curr == smt_rq->idle && smt_rq->nr_running)) @@ -2619,42 +2224,10 @@ static inline int dependent_sleeper(int } #endif -#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) - -void fastcall add_preempt_count(int val) -{ - /* - * Underflow? - */ - BUG_ON(((int)preempt_count() < 0)); - preempt_count() += val; - /* - * Spinlock count overflowing soon? - */ - BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); -} -EXPORT_SYMBOL(add_preempt_count); - -void fastcall sub_preempt_count(int val) -{ - /* - * Underflow? - */ - BUG_ON(val > preempt_count()); - /* - * Is the spinlock portion underflowing? - */ - BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); - preempt_count() -= val; -} -EXPORT_SYMBOL(sub_preempt_count); - -#endif - /* * schedule() is the main scheduler function. 
*/ -asmlinkage void __sched schedule(void) +static void __sched ingo_schedule(void) { long *switch_count; task_t *prev, *next; @@ -2696,10 +2269,10 @@ need_resched_nonpreemptible: dump_stack(); } - schedstat_inc(rq, sched_cnt); + schedstat_inc(rq->sspcd, sched_cnt); now = sched_clock(); - if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG)) - run_time = now - prev->timestamp; + if (likely(now - prev->u.ingosched.timestamp < NS_MAX_SLEEP_AVG)) + run_time = now - prev->u.ingosched.timestamp; else run_time = NS_MAX_SLEEP_AVG; @@ -2762,46 +2335,46 @@ go_idle: /* * Switch the active and expired arrays. */ - schedstat_inc(rq, sched_switch); + schedstat_inc(rq->sspcd, sched_switch); rq->active = rq->expired; rq->expired = array; array = rq->active; rq->expired_timestamp = 0; rq->best_expired_prio = MAX_PRIO; } else - schedstat_inc(rq, sched_noswitch); + schedstat_inc(rq->sspcd, sched_noswitch); idx = sched_find_first_bit(array->bitmap); queue = array->queue + idx; - next = list_entry(queue->next, task_t, run_list); + next = list_entry(queue->next, task_t, u.ingosched.run_list); - if (!rt_task(next) && next->activated > 0) { - unsigned long long delta = now - next->timestamp; + if (!rt_task(next) && next->u.ingosched.activated > 0) { + unsigned long long delta = now - next->u.ingosched.timestamp; - if (next->activated == 1) + if (next->u.ingosched.activated == 1) delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; - array = next->array; + array = next->u.ingosched.array; dequeue_task(next, array); - recalc_task_prio(next, next->timestamp + delta); + recalc_task_prio(next, next->u.ingosched.timestamp + delta); enqueue_task(next, array); } - next->activated = 0; + next->u.ingosched.activated = 0; switch_tasks: if (next == rq->idle) - schedstat_inc(rq, sched_goidle); + schedstat_inc(rq->sspcd, sched_goidle); prefetch(next); clear_tsk_need_resched(prev); rcu_qsctr_inc(task_cpu(prev)); - prev->sleep_avg -= run_time; - if ((long)prev->sleep_avg <= 0) - prev->sleep_avg = 0; 
- prev->timestamp = prev->last_ran = now; + prev->u.ingosched.sleep_avg -= run_time; + if ((long)prev->u.ingosched.sleep_avg <= 0) + prev->u.ingosched.sleep_avg = 0; + prev->u.ingosched.timestamp = prev->u.ingosched.last_ran = now; sched_info_switch(prev, next); if (likely(prev != next)) { - next->timestamp = now; + next->u.ingosched.timestamp = now; rq->nr_switches++; rq->curr = next; ++*switch_count; @@ -2822,169 +2395,7 @@ switch_tasks: goto need_resched; } -EXPORT_SYMBOL(schedule); - -#ifdef CONFIG_PREEMPT -/* - * this is is the entry point to schedule() from in-kernel preemption - * off of preempt_enable. Kernel preemptions off return from interrupt - * occur there and call schedule directly. - */ -asmlinkage void __sched preempt_schedule(void) -{ - struct thread_info *ti = current_thread_info(); -#ifdef CONFIG_PREEMPT_BKL - struct task_struct *task = current; - int saved_lock_depth; -#endif - /* - * If there is a non-zero preempt_count or interrupts are disabled, - * we do not want to preempt the current task. Just return.. - */ - if (unlikely(ti->preempt_count || irqs_disabled())) - return; - -need_resched: - add_preempt_count(PREEMPT_ACTIVE); - /* - * We keep the big kernel semaphore locked, but we - * clear ->lock_depth so that schedule() doesnt - * auto-release the semaphore: - */ -#ifdef CONFIG_PREEMPT_BKL - saved_lock_depth = task->lock_depth; - task->lock_depth = -1; -#endif - schedule(); -#ifdef CONFIG_PREEMPT_BKL - task->lock_depth = saved_lock_depth; -#endif - sub_preempt_count(PREEMPT_ACTIVE); - - /* we could miss a preemption opportunity between schedule and now */ - barrier(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) - goto need_resched; -} - -EXPORT_SYMBOL(preempt_schedule); -#endif /* CONFIG_PREEMPT */ - -int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) -{ - task_t *p = curr->task; - return try_to_wake_up(p, mode, sync); -} - -EXPORT_SYMBOL(default_wake_function); - -/* - * The core wakeup function. 
Non-exclusive wakeups (nr_exclusive == 0) just - * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve - * number) then we wake all the non-exclusive tasks and one exclusive task. - * - * There are circumstances in which we can try to wake a task which has already - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns - * zero in this (rare) case, and we handle it by continuing to scan the queue. - */ -static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int sync, void *key) -{ - struct list_head *tmp, *next; - - list_for_each_safe(tmp, next, &q->task_list) { - wait_queue_t *curr; - unsigned flags; - curr = list_entry(tmp, wait_queue_t, task_list); - flags = curr->flags; - if (curr->func(curr, mode, sync, key) && - (flags & WQ_FLAG_EXCLUSIVE) && - !--nr_exclusive) - break; - } -} - -/** - * __wake_up - wake up threads blocked on a waitqueue. - * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - */ -void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, void *key) -{ - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, 0, key); - spin_unlock_irqrestore(&q->lock, flags); -} - -EXPORT_SYMBOL(__wake_up); - -/* - * Same as __wake_up but called with the spinlock in wait_queue_head_t held. - */ -void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) -{ - __wake_up_common(q, mode, 1, 0, NULL); -} - -/** - * __wake_up - sync- wake up threads blocked on a waitqueue. - * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - * - * The sync wakeup differs that the waker knows that it will schedule - * away soon, so while the target thread will be woken up, it will not - * be migrated to another CPU - ie. the two threads are 'synchronized' - * with each other. 
This can prevent needless bouncing between CPUs. - * - * On UP it can prevent extra preemption. - */ -void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) -{ - unsigned long flags; - int sync = 1; - - if (unlikely(!q)) - return; - - if (unlikely(!nr_exclusive)) - sync = 0; - - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, sync, NULL); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ - -void fastcall complete(struct completion *x) -{ - unsigned long flags; - - spin_lock_irqsave(&x->wait.lock, flags); - x->done++; - __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, - 1, 0, NULL); - spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete); - -void fastcall complete_all(struct completion *x) -{ - unsigned long flags; - - spin_lock_irqsave(&x->wait.lock, flags); - x->done += UINT_MAX/2; - __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, - 0, 0, NULL); - spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete_all); - -void fastcall __sched wait_for_completion(struct completion *x) +static void __sched ingo_wait_for_completion(struct completion *x) { might_sleep(); spin_lock_irq(&x->wait.lock); @@ -3004,85 +2415,13 @@ void fastcall __sched wait_for_completio x->done--; spin_unlock_irq(&x->wait.lock); } -EXPORT_SYMBOL(wait_for_completion); - -#define SLEEP_ON_VAR \ - unsigned long flags; \ - wait_queue_t wait; \ - init_waitqueue_entry(&wait, current); - -#define SLEEP_ON_HEAD \ - spin_lock_irqsave(&q->lock,flags); \ - __add_wait_queue(q, &wait); \ - spin_unlock(&q->lock); - -#define SLEEP_ON_TAIL \ - spin_lock_irq(&q->lock); \ - __remove_wait_queue(q, &wait); \ - spin_unlock_irqrestore(&q->lock, flags); - -void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) -{ - SLEEP_ON_VAR - - current->state = TASK_INTERRUPTIBLE; - - SLEEP_ON_HEAD - schedule(); - 
SLEEP_ON_TAIL -} - -EXPORT_SYMBOL(interruptible_sleep_on); -long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) +static void ingo_set_user_nice(task_t *p, long nice) { - SLEEP_ON_VAR - - current->state = TASK_INTERRUPTIBLE; - - SLEEP_ON_HEAD - timeout = schedule_timeout(timeout); - SLEEP_ON_TAIL - - return timeout; -} - -EXPORT_SYMBOL(interruptible_sleep_on_timeout); - -void fastcall __sched sleep_on(wait_queue_head_t *q) -{ - SLEEP_ON_VAR - - current->state = TASK_UNINTERRUPTIBLE; - - SLEEP_ON_HEAD - schedule(); - SLEEP_ON_TAIL -} - -EXPORT_SYMBOL(sleep_on); - -long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ - SLEEP_ON_VAR - - current->state = TASK_UNINTERRUPTIBLE; - - SLEEP_ON_HEAD - timeout = schedule_timeout(timeout); - SLEEP_ON_TAIL - - return timeout; -} - -EXPORT_SYMBOL(sleep_on_timeout); - -void set_user_nice(task_t *p, long nice) -{ - unsigned long flags; - prio_array_t *array; - runqueue_t *rq; - int old_prio, new_prio, delta; + unsigned long flags; + prio_array_t *array; + runqueue_t *rq; + int old_prio, new_prio, delta; if (TASK_NICE(p) == nice || nice < -20 || nice > 19) return; @@ -3101,15 +2440,15 @@ void set_user_nice(task_t *p, long nice) p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } - array = p->array; + array = p->u.ingosched.array; if (array) dequeue_task(p, array); - old_prio = p->prio; + old_prio = p->u.ingosched.prio; new_prio = NICE_TO_PRIO(nice); delta = new_prio - old_prio; p->static_prio = NICE_TO_PRIO(nice); - p->prio += delta; + p->u.ingosched.prio += delta; if (array) { enqueue_task(p, array); @@ -3124,59 +2463,13 @@ out_unlock: task_rq_unlock(rq, &flags); } -EXPORT_SYMBOL(set_user_nice); - #ifdef CONFIG_KGDB -struct task_struct *kgdb_get_idle(int this_cpu) +static struct task_struct *ingo_kgdb_get_idle(int this_cpu) { return cpu_rq(this_cpu)->idle; } #endif -#ifdef __ARCH_WANT_SYS_NICE - -/* - * sys_nice - change the priority of the current process. 
- * @increment: priority increment - * - * sys_setpriority is a more generic, but much slower function that - * does similar things. - */ -asmlinkage long sys_nice(int increment) -{ - int retval; - long nice; - - /* - * Setpriority might change our priority at the same moment. - * We don't have to worry. Conceptually one call occurs first - * and we have a single winner. - */ - if (increment < 0) { - if (!capable(CAP_SYS_NICE)) - return -EPERM; - if (increment < -40) - increment = -40; - } - if (increment > 40) - increment = 40; - - nice = PRIO_TO_NICE(current->static_prio) + increment; - if (nice < -20) - nice = -20; - if (nice > 19) - nice = 19; - - retval = security_task_setnice(current, nice); - if (retval) - return retval; - - set_user_nice(current, nice); - return 0; -} - -#endif - /** * task_prio - return the priority value of a given task. * @p: the task in question. @@ -3185,16 +2478,16 @@ asmlinkage long sys_nice(int increment) * RT tasks are offset by -200. Normal tasks are centered * around 0, value goes from -16 to +15. */ -int task_prio(const task_t *p) +static int ingo_task_prio(const task_t *p) { - return p->prio - MAX_RT_PRIO; + return p->u.ingosched.prio - MAX_RT_PRIO; } /** * task_nice - return the nice value of a given task. * @p: the task in question. */ -int task_nice(const task_t *p) +static int ingo_task_nice(const task_t *p) { return TASK_NICE(p); } @@ -3203,38 +2496,27 @@ int task_nice(const task_t *p) * idle_cpu - is a given cpu idle currently? * @cpu: the processor in question. */ -int idle_cpu(int cpu) +static int ingo_idle_cpu(int cpu) { return cpu_curr(cpu) == cpu_rq(cpu)->idle; } -EXPORT_SYMBOL_GPL(idle_cpu); - -/** - * find_process_by_pid - find a process with a matching PID value. - * @pid: the pid in question. - */ -static inline task_t *find_process_by_pid(pid_t pid) -{ - return pid ? find_task_by_pid(pid) : current; -} - /* Actually do priority change: must hold rq lock. 
*/ static void __setscheduler(struct task_struct *p, int policy, int prio) { - BUG_ON(p->array); + BUG_ON(p->u.ingosched.array); p->policy = policy; p->rt_priority = prio; if (policy != SCHED_NORMAL) - p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority; + p->u.ingosched.prio = MAX_USER_RT_PRIO-1 - p->rt_priority; else - p->prio = p->static_prio; + p->u.ingosched.prio = p->static_prio; } /* * setscheduler - change the scheduling policy and/or RT priority of a thread. */ -static int setscheduler(pid_t pid, int policy, struct sched_param __user *param) +static int ingo_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { struct sched_param lp; int retval = -EINVAL; @@ -3303,11 +2585,11 @@ recheck: task_rq_unlock(rq, &flags); goto recheck; } - array = p->array; + array = p->u.ingosched.array; if (array) deactivate_task(p, task_rq(p)); retval = 0; - oldprio = p->prio; + oldprio = p->u.ingosched.prio; __setscheduler(p, policy, lp.sched_priority); if (array) { __activate_task(p, task_rq(p)); @@ -3317,7 +2599,7 @@ recheck: * this runqueue and our priority is higher than the current's */ if (task_running(rq, p)) { - if (p->prio > oldprio) + if (p->u.ingosched.prio > oldprio) resched_task(rq->curr); } else if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); @@ -3330,241 +2612,19 @@ out_nounlock: } /** - * sys_sched_setscheduler - set/change the scheduler policy and RT priority - * @pid: the pid in question. - * @policy: new policy - * @param: structure containing the new RT priority. - */ -asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, - struct sched_param __user *param) -{ - return setscheduler(pid, policy, param); -} - -/** - * sys_sched_setparam - set/change the RT priority of a thread - * @pid: the pid in question. - * @param: structure containing the new RT priority. 
- */ -asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) -{ - return setscheduler(pid, -1, param); -} - -/** - * sys_sched_getscheduler - get the policy (scheduling class) of a thread - * @pid: the pid in question. - */ -asmlinkage long sys_sched_getscheduler(pid_t pid) -{ - int retval = -EINVAL; - task_t *p; - - if (pid < 0) - goto out_nounlock; - - retval = -ESRCH; - read_lock(&tasklist_lock); - p = find_process_by_pid(pid); - if (p) { - retval = security_task_getscheduler(p); - if (!retval) - retval = p->policy; - } - read_unlock(&tasklist_lock); - -out_nounlock: - return retval; -} - -/** - * sys_sched_getscheduler - get the RT priority of a thread - * @pid: the pid in question. - * @param: structure containing the RT priority. - */ -asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) -{ - struct sched_param lp; - int retval = -EINVAL; - task_t *p; - - if (!param || pid < 0) - goto out_nounlock; - - read_lock(&tasklist_lock); - p = find_process_by_pid(pid); - retval = -ESRCH; - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - lp.sched_priority = p->rt_priority; - read_unlock(&tasklist_lock); - - /* - * This one might sleep, we cannot do it with a spinlock held ... - */ - retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; - -out_nounlock: - return retval; - -out_unlock: - read_unlock(&tasklist_lock); - return retval; -} - -long sched_setaffinity(pid_t pid, cpumask_t new_mask) -{ - task_t *p; - int retval; - cpumask_t cpus_allowed; - - lock_cpu_hotplug(); - read_lock(&tasklist_lock); - - p = find_process_by_pid(pid); - if (!p) { - read_unlock(&tasklist_lock); - unlock_cpu_hotplug(); - return -ESRCH; - } - - /* - * It is not safe to call set_cpus_allowed with the - * tasklist_lock held. We will bump the task_struct's - * usage count and then drop tasklist_lock. 
- */ - get_task_struct(p); - read_unlock(&tasklist_lock); - - retval = -EPERM; - if ((current->euid != p->euid) && (current->euid != p->uid) && - !capable(CAP_SYS_NICE)) - goto out_unlock; - - cpus_allowed = cpuset_cpus_allowed(p); - cpus_and(new_mask, new_mask, cpus_allowed); - retval = set_cpus_allowed(p, new_mask); - -out_unlock: - put_task_struct(p); - unlock_cpu_hotplug(); - return retval; -} - -static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, - cpumask_t *new_mask) -{ - if (len < sizeof(cpumask_t)) { - memset(new_mask, 0, sizeof(cpumask_t)); - } else if (len > sizeof(cpumask_t)) { - len = sizeof(cpumask_t); - } - return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; -} - -/** - * sys_sched_setaffinity - set the cpu affinity of a process - * @pid: pid of the process - * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to the new cpu mask - */ -asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, - unsigned long __user *user_mask_ptr) -{ - cpumask_t new_mask; - int retval; - - retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); - if (retval) - return retval; - - return sched_setaffinity(pid, new_mask); -} - -/* - * Represents all cpu's present in the system - * In systems capable of hotplug, this map could dynamically grow - * as new cpu's are detected in the system via any platform specific - * method, such as ACPI for e.g. 
- */ - -cpumask_t cpu_present_map; -EXPORT_SYMBOL(cpu_present_map); - -#ifndef CONFIG_SMP -cpumask_t cpu_online_map = CPU_MASK_ALL; -cpumask_t cpu_possible_map = CPU_MASK_ALL; -#endif - -long sched_getaffinity(pid_t pid, cpumask_t *mask) -{ - int retval; - task_t *p; - - lock_cpu_hotplug(); - read_lock(&tasklist_lock); - - retval = -ESRCH; - p = find_process_by_pid(pid); - if (!p) - goto out_unlock; - - retval = 0; - cpus_and(*mask, p->cpus_allowed, cpu_possible_map); - -out_unlock: - read_unlock(&tasklist_lock); - unlock_cpu_hotplug(); - if (retval) - return retval; - - return 0; -} - -/** - * sys_sched_getaffinity - get the cpu affinity of a process - * @pid: pid of the process - * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to hold the current cpu mask - */ -asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, - unsigned long __user *user_mask_ptr) -{ - int ret; - cpumask_t mask; - - if (len < sizeof(cpumask_t)) - return -EINVAL; - - ret = sched_getaffinity(pid, &mask); - if (ret < 0) - return ret; - - if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) - return -EFAULT; - - return sizeof(cpumask_t); -} - -/** * sys_sched_yield - yield the current processor to other threads. * * this function yields the current CPU by moving the calling thread * to the expired array. If there are no other threads running on this * CPU then this function will return. */ -asmlinkage long sys_sched_yield(void) +static long ingo_sys_sched_yield(void) { runqueue_t *rq = this_rq_lock(); - prio_array_t *array = current->array; + prio_array_t *array = current->u.ingosched.array; prio_array_t *target = rq->expired; - schedstat_inc(rq, yld_cnt); + schedstat_inc(rq->sspcd, yld_cnt); /* * We implement yielding by moving the task into the expired * queue. 
@@ -3575,12 +2635,12 @@ asmlinkage long sys_sched_yield(void) if (rt_task(current)) target = rq->active; - if (current->array->nr_active == 1) { - schedstat_inc(rq, yld_act_empty); + if (current->u.ingosched.array->nr_active == 1) { + schedstat_inc(rq->sspcd, yld_act_empty); if (!rq->expired->nr_active) - schedstat_inc(rq, yld_both_empty); + schedstat_inc(rq->sspcd, yld_both_empty); } else if (!rq->expired->nr_active) - schedstat_inc(rq, yld_exp_empty); + schedstat_inc(rq->sspcd, yld_exp_empty); if (array != target) { dequeue_task(current, array); @@ -3604,86 +2664,6 @@ asmlinkage long sys_sched_yield(void) return 0; } -static inline void __cond_resched(void) -{ - do { - add_preempt_count(PREEMPT_ACTIVE); - schedule(); - sub_preempt_count(PREEMPT_ACTIVE); - } while (need_resched()); -} - -int __sched cond_resched(void) -{ - if (need_resched()) { - __cond_resched(); - return 1; - } - return 0; -} - -EXPORT_SYMBOL(cond_resched); - -/* - * cond_resched_lock() - if a reschedule is pending, drop the given lock, - * call schedule, and on return reacquire the lock. - * - * This works OK both with and without CONFIG_PREEMPT. We do strange low-level - * operations here to prevent schedule() from being called twice (once via - * spin_unlock(), once by hand). - */ -int cond_resched_lock(spinlock_t * lock) -{ -#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) - if (lock->break_lock) { - lock->break_lock = 0; - spin_unlock(lock); - cpu_relax(); - spin_lock(lock); - } -#endif - if (need_resched()) { - _raw_spin_unlock(lock); - preempt_enable_no_resched(); - __cond_resched(); - spin_lock(lock); - return 1; - } - return 0; -} - -EXPORT_SYMBOL(cond_resched_lock); - -int __sched cond_resched_softirq(void) -{ - BUG_ON(!in_softirq()); - - if (need_resched()) { - __local_bh_enable(); - __cond_resched(); - local_bh_disable(); - return 1; - } - return 0; -} - -EXPORT_SYMBOL(cond_resched_softirq); - - -/** - * yield - yield the current processor to other threads. 
- * - * this is a shortcut for kernel-space yielding - it marks the - * thread runnable and calls sys_sched_yield(). - */ -void __sched yield(void) -{ - set_current_state(TASK_RUNNING); - sys_sched_yield(); -} - -EXPORT_SYMBOL(yield); - /* * This task is about to go to sleep on IO. Increment rq->nr_iowait so * that process accounting knows that this is a task in IO wait state. @@ -3691,7 +2671,7 @@ EXPORT_SYMBOL(yield); * But don't do that if it is a deliberate, throttling IO wait (this task * has set its backing_dev_info: the queue against which it should throttle) */ -void __sched io_schedule(void) +static void __sched ingo_io_schedule(void) { struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); @@ -3700,9 +2680,7 @@ void __sched io_schedule(void) atomic_dec(&rq->nr_iowait); } -EXPORT_SYMBOL(io_schedule); - -long __sched io_schedule_timeout(long timeout) +static long __sched ingo_io_schedule_timeout(long timeout) { struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); long ret; @@ -3714,51 +2692,6 @@ long __sched io_schedule_timeout(long ti } /** - * sys_sched_get_priority_max - return maximum RT priority. - * @policy: scheduling class. - * - * this syscall returns the maximum rt_priority that can be used - * by a given scheduling class. - */ -asmlinkage long sys_sched_get_priority_max(int policy) -{ - int ret = -EINVAL; - - switch (policy) { - case SCHED_FIFO: - case SCHED_RR: - ret = MAX_USER_RT_PRIO-1; - break; - case SCHED_NORMAL: - ret = 0; - break; - } - return ret; -} - -/** - * sys_sched_get_priority_min - return minimum RT priority. - * @policy: scheduling class. - * - * this syscall returns the minimum rt_priority that can be used - * by a given scheduling class. 
- */ -asmlinkage long sys_sched_get_priority_min(int policy) -{ - int ret = -EINVAL; - - switch (policy) { - case SCHED_FIFO: - case SCHED_RR: - ret = 1; - break; - case SCHED_NORMAL: - ret = 0; - } - return ret; -} - -/** * sys_sched_rr_get_interval - return the default timeslice of a process. * @pid: pid of the process. * @interval: userspace pointer to the timeslice value. @@ -3766,8 +2699,8 @@ asmlinkage long sys_sched_get_priority_m * this syscall writes the default timeslice value of a given process * into the user-space timespec buffer. A value of '0' means infinity. */ -asmlinkage -long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) +static long +ingo_sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) { int retval = -EINVAL; struct timespec t; @@ -3797,112 +2730,14 @@ out_unlock: return retval; } -static inline struct task_struct *eldest_child(struct task_struct *p) -{ - if (list_empty(&p->children)) return NULL; - return list_entry(p->children.next,struct task_struct,sibling); -} - -static inline struct task_struct *older_sibling(struct task_struct *p) -{ - if (p->sibling.prev==&p->parent->children) return NULL; - return list_entry(p->sibling.prev,struct task_struct,sibling); -} - -static inline struct task_struct *younger_sibling(struct task_struct *p) -{ - if (p->sibling.next==&p->parent->children) return NULL; - return list_entry(p->sibling.next,struct task_struct,sibling); -} - -static void show_task(task_t * p) -{ - task_t *relative; - unsigned state; - unsigned long free = 0; - static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; - - printk("%-13.13s ", p->comm); - state = p->state ? 
__ffs(p->state) + 1 : 0; - if (state < ARRAY_SIZE(stat_nam)) - printk(stat_nam[state]); - else - printk("?"); -#if (BITS_PER_LONG == 32) - if (state == TASK_RUNNING) - printk(" running "); - else - printk(" %08lX ", thread_saved_pc(p)); -#else - if (state == TASK_RUNNING) - printk(" running task "); - else - printk(" %016lx ", thread_saved_pc(p)); -#endif -#ifdef CONFIG_DEBUG_STACK_USAGE - { - unsigned long * n = (unsigned long *) (p->thread_info+1); - while (!*n) - n++; - free = (unsigned long) n - (unsigned long)(p->thread_info+1); - } -#endif - printk("%5lu %5d %6d ", free, p->pid, p->parent->pid); - if ((relative = eldest_child(p))) - printk("%5d ", relative->pid); - else - printk(" "); - if ((relative = younger_sibling(p))) - printk("%7d", relative->pid); - else - printk(" "); - if ((relative = older_sibling(p))) - printk(" %5d", relative->pid); - else - printk(" "); - if (!p->mm) - printk(" (L-TLB)\n"); - else - printk(" (NOTLB)\n"); - - if (state != TASK_RUNNING) - show_stack(p, NULL); -} - -void show_state(void) -{ - task_t *g, *p; - -#if (BITS_PER_LONG == 32) - printk("\n" - " sibling\n"); - printk(" task PC pid father child younger older\n"); -#else - printk("\n" - " sibling\n"); - printk(" task PC pid father child younger older\n"); -#endif - read_lock(&tasklist_lock); - do_each_thread(g, p) { - /* - * reset the NMI-timeout, listing all files on a slow - * console might take alot of time: - */ - touch_nmi_watchdog(); - show_task(p); - } while_each_thread(g, p); - - read_unlock(&tasklist_lock); -} - -void __devinit init_idle(task_t *idle, int cpu) +static void __devinit ingo_init_idle(task_t *idle, int cpu) { runqueue_t *rq = cpu_rq(cpu); unsigned long flags; - idle->sleep_avg = 0; - idle->array = NULL; - idle->prio = MAX_PRIO; + idle->u.ingosched.sleep_avg = 0; + idle->u.ingosched.array = NULL; + idle->u.ingosched.prio = MAX_PRIO; idle->state = TASK_RUNNING; set_task_cpu(idle, cpu); @@ -3919,15 +2754,6 @@ void __devinit init_idle(task_t *idle, i #endif } 
-/* - * In a system that switches off the HZ timer nohz_cpu_mask - * indicates which cpus entered this state. This is used - * in the rcu update to wait only for active cpus. For system - * which do not switch off the HZ timer nohz_cpu_mask should - * always be CPU_MASK_NONE. - */ -cpumask_t nohz_cpu_mask = CPU_MASK_NONE; - #ifdef CONFIG_SMP /* * This is how migration works: @@ -3954,7 +2780,7 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. */ -int set_cpus_allowed(task_t *p, cpumask_t new_mask) +static int ingo_set_cpus_allowed(task_t *p, cpumask_t new_mask) { unsigned long flags; int ret = 0; @@ -3987,8 +2813,6 @@ out: return ret; } -EXPORT_SYMBOL_GPL(set_cpus_allowed); - /* * Move (not current) task off this cpu, onto dest cpu. We're doing * this because either it can't run here any more (set_cpus_allowed() @@ -4017,15 +2841,16 @@ static void __migrate_task(struct task_s goto out; set_task_cpu(p, dest_cpu); - if (p->array) { + if (p->u.ingosched.array) { /* * Sync timestamp with rq_dest's before activating. * The same thing could be achieved by doing this step * afterwards, and pretending it was a local activate. * This way is cleaner and logically correct. */ - p->timestamp = p->timestamp - rq_src->timestamp_last_tick - + rq_dest->timestamp_last_tick; + p->u.ingosched.timestamp = p->u.ingosched.timestamp - + rq_src->timestamp_last_tick + + rq_dest->timestamp_last_tick; deactivate_task(p, rq_src); activate_task(p, rq_dest, 0); if (TASK_PREEMPTS_CURR(p, rq_dest)) @@ -4165,7 +2990,7 @@ static void migrate_live_tasks(int src_c * It does so by boosting its priority to highest possible and adding it to * the _front_ of runqueue. Used by CPU offline code. 
*/ -void sched_idle_next(void) +static void ingo_sched_idle_next(void) { int cpu = smp_processor_id(); runqueue_t *rq = this_rq(); @@ -4223,7 +3048,7 @@ static void migrate_dead_tasks(unsigned while (!list_empty(list)) migrate_dead(dead_cpu, list_entry(list->next, task_t, - run_list)); + u.ingosched.run_list)); } } } @@ -4306,7 +3131,7 @@ static struct notifier_block __devinitda .priority = 10 }; -int __init migration_init(void) +static int __init ingo_migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); /* Start one for boot CPU. */ @@ -4318,11 +3143,38 @@ int __init migration_init(void) #endif #ifdef CONFIG_SMP +#ifdef CONFIG_SCHEDSTATS +static void ingo_show_schedstat_sd(struct seq_file *seq, int cpu) +{ + enum idle_type itype; + struct sched_domain *sd; + int dcnt = 0; + + for_each_domain(cpu, sd) { + char mask_str[NR_CPUS]; + + cpumask_scnprintf(mask_str, NR_CPUS, sd->span); + seq_printf(seq, "domain%d %s", dcnt++, mask_str); + for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; itype++) { + seq_printf(seq, " %lu %lu %lu %lu %lu", + sd->lb_cnt[itype], + sd->lb_failed[itype], + sd->lb_imbalance[itype], + sd->lb_nobusyq[itype], + sd->lb_nobusyg[itype]); + } + seq_printf(seq, " %lu %lu %lu %lu\n", + sd->sbe_pushed, sd->sbe_attempts, + sd->ttwu_wake_affine, sd->ttwu_wake_balance); + } +} +#endif + /* * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. 
*/ -void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) +static void __devinit ingo_cpu_attach_domain(struct sched_domain *sd, int cpu) { migration_req_t req; unsigned long flags; @@ -4349,9 +3201,6 @@ void __devinit cpu_attach_domain(struct } } -/* cpus with isolated domains */ -cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; - /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { @@ -4366,52 +3215,6 @@ static int __init isolated_cpu_setup(cha __setup ("isolcpus=", isolated_cpu_setup); -/* - * init_sched_build_groups takes an array of groups, the cpumask we wish - * to span, and a pointer to a function which identifies what group a CPU - * belongs to. The return value of group_fn must be a valid index into the - * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we - * keep track of groups covered with a cpumask_t). - * - * init_sched_build_groups will build a circular linked list of the groups - * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_power to 0. 
- */ -void __devinit init_sched_build_groups(struct sched_group groups[], - cpumask_t span, int (*group_fn)(int cpu)) -{ - struct sched_group *first = NULL, *last = NULL; - cpumask_t covered = CPU_MASK_NONE; - int i; - - for_each_cpu_mask(i, span) { - int group = group_fn(i); - struct sched_group *sg = &groups[group]; - int j; - - if (cpu_isset(i, covered)) - continue; - - sg->cpumask = CPU_MASK_NONE; - sg->cpu_power = 0; - - for_each_cpu_mask(j, span) { - if (group_fn(j) != group) - continue; - - cpu_set(j, covered); - cpu_set(j, sg->cpumask); - } - if (!first) - first = sg; - if (last) - last->next = sg; - last = sg; - } - last->next = first; -} - - #ifdef ARCH_HAS_SCHED_DOMAIN extern void __devinit arch_init_sched_domains(void); extern void __devinit arch_destroy_sched_domains(void); @@ -4740,7 +3543,7 @@ static int update_sched_domains(struct n } #endif -void __init sched_init_smp(void) +static void __init ingo_sched_init_smp(void) { lock_cpu_hotplug(); arch_init_sched_domains(); @@ -4750,25 +3553,21 @@ void __init sched_init_smp(void) hotcpu_notifier(update_sched_domains, 0); } #else -void __init sched_init_smp(void) +static void __init ingo_sched_init_smp(void) { } #endif /* CONFIG_SMP */ -int in_sched_functions(unsigned long addr) -{ - /* Linker adds these: start and end of __sched functions */ - extern char __sched_text_start[], __sched_text_end[]; - return in_lock_functions(addr) || - (addr >= (unsigned long)__sched_text_start - && addr < (unsigned long)__sched_text_end); -} - -void __init sched_init(void) +static void __init ingo_sched_init(void) { runqueue_t *rq; int i, j, k; + init_task.u.ingosched.prio = MAX_PRIO - 20; + init_task.static_prio = MAX_PRIO - 20; + INIT_LIST_HEAD(&init_task.u.ingosched.run_list); + init_task.u.ingosched.time_slice = HZ; + for (i = 0; i < NR_CPUS; i++) { prio_array_t *array; @@ -4797,6 +3596,9 @@ void __init sched_init(void) // delimiter for bitsearch __set_bit(MAX_PRIO, array->bitmap); } +#ifdef CONFIG_SCHEDSTATS + 
rq->sspcd = cpu_sspcd(i); +#endif } /* @@ -4814,28 +3616,6 @@ void __init sched_init(void) init_idle(current, smp_processor_id()); } -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP -void __might_sleep(char *file, int line) -{ -#if defined(in_atomic) - static unsigned long prev_jiffy; /* ratelimiting */ - - if ((in_atomic() || irqs_disabled()) && - system_state == SYSTEM_RUNNING) { - if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) - return; - prev_jiffy = jiffies; - printk(KERN_ERR "Debug: sleeping function called from invalid" - " context at %s:%d\n", file, line); - printk("in_atomic():%d, irqs_disabled():%d\n", - in_atomic(), irqs_disabled()); - dump_stack(); - } -#endif -} -EXPORT_SYMBOL(__might_sleep); -#endif - #if defined(CONFIG_DEBUG_KERNEL)&&defined(CONFIG_SYSCTL)&&defined(CONFIG_SMP) static struct ctl_table sd_ctl_dir[] = { {1, "sched_domain", NULL, 0, 0755, NULL, }, @@ -4925,7 +3705,7 @@ static ctl_table *sd_alloc_ctl_cpu_table } static struct ctl_table_header *sd_sysctl_header; -void init_sched_domain_sysctl() +void ingo_init_sched_domain_sysctl(void) { int i, cpu_num = num_online_cpus(); char buf[32]; @@ -4943,7 +3723,7 @@ void init_sched_domain_sysctl() sd_sysctl_header = register_sysctl_table(sd_ctl_root, 0); } -void destroy_sched_domain_sysctl() +static void ingo_destroy_sched_domain_sysctl(void) { int cpu, cpu_num = num_online_cpus(); struct sched_domain *sd; @@ -4965,16 +3745,21 @@ void destroy_sched_domain_sysctl() kfree(root); } #else -void init_sched_domain_sysctl() +static void ingo_init_sched_domain_sysctl(void) { } -void destroy_sched_domain_sysctl() +static void ingo_destroy_sched_domain_sysctl(void) { } #endif +static int ingo_is_idle_task(const task_t *p) +{ + return p == task_rq(p)->idle; +} + #ifdef CONFIG_MAGIC_SYSRQ -void normalize_rt_tasks(void) +void ingo_normalize_rt_tasks(void) { struct task_struct *p; prio_array_t *array; @@ -4988,7 +3773,7 @@ void normalize_rt_tasks(void) rq = task_rq_lock(p, &flags); - array = p->array; + array = 
p->u.ingosched.array; if (array) deactivate_task(p, task_rq(p)); __setscheduler(p, SCHED_NORMAL, 0); @@ -5003,3 +3788,59 @@ void normalize_rt_tasks(void) } #endif /* CONFIG_MAGIC_SYSRQ */ + +struct sched_drv ingo_sched_drv = { + .task_cpu = common_task_cpu, + .set_task_cpu = common_set_task_cpu, + .init_sched_domain_sysctl = ingo_init_sched_domain_sysctl, + .destroy_sched_domain_sysctl = ingo_destroy_sched_domain_sysctl, + .cpusched_name = "ingosched", + .rt_task = ingo_rt_task, + .wait_for_completion = ingo_wait_for_completion, + .io_schedule = ingo_io_schedule, + .io_schedule_timeout = ingo_io_schedule_timeout, + .set_oom_timeslice = ingo_set_oom_timeslice, + .nr_running = ingo_nr_running, + .nr_uninterruptible = ingo_nr_uninterruptible, + .nr_context_switches = ingo_nr_context_switches, + .nr_iowait = ingo_nr_iowait, + .nr_iowait_task_cpu = ingo_nr_iowait_task_cpu, + .idle_cpu = ingo_idle_cpu, + .init_idle = ingo_init_idle, + .exit = ingo_sched_exit, + .fork = ingo_sched_fork, + .init = ingo_sched_init, + .init_smp = ingo_sched_init_smp, + .schedule = ingo_schedule, + .tick = ingo_scheduler_tick, + .tail = ingo_schedule_tail, + .setscheduler = ingo_setscheduler, + .set_user_nice = ingo_set_user_nice, + .rr_get_interval = ingo_sys_sched_rr_get_interval, + .yield = ingo_sys_sched_yield, + .is_idle_task = ingo_is_idle_task, + .task_curr = ingo_task_curr, + .task_nice = ingo_task_nice, + .task_prio = ingo_task_prio, + .try_to_wake_up = ingo_try_to_wake_up, + .wake_up_new_task = ingo_wake_up_new_task, +#ifdef CONFIG_SMP + .migration_init = ingo_migration_init, + .exec = ingo_sched_exec, + .set_cpus_allowed = ingo_set_cpus_allowed, + .wait_task_inactive = ingo_wait_task_inactive, + .cpu_attach_domain = ingo_cpu_attach_domain, +#ifdef CONFIG_HOTPLUG_CPU + .sched_idle_next = ingo_sched_idle_next, +#endif +#ifdef CONFIG_SCHEDSTATS + .show_schedstat_sd = ingo_show_schedstat_sd, +#endif +#endif +#ifdef CONFIG_MAGIC_SYSRQ + .normalize_rt_tasks = ingo_normalize_rt_tasks, 
+#endif +#ifdef CONFIG_KGDB + .kgdb_get_idle = ingo_kgdb_get_idle, +#endif +}; Index: linux-2.6.10-rc1-mm5/kernel/scheduler.c =================================================================== --- linux-2.6.10-rc1-mm5.orig/kernel/scheduler.c 2003-03-27 19:01:40.000000000 +1100 +++ linux-2.6.10-rc1-mm5/kernel/scheduler.c 2004-11-11 22:08:35.000000000 +1100 @@ -0,0 +1,1552 @@ +/* + * kernel/scheduler.c + * + * Kernel scheduler and related syscalls + * + * Copyright (C) 1991-2002 Linus Torvalds + * + * Modular cpu scheduler infrastructure by Con Kolivas based on + * work by William Lee Irwin III. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +DEFINE_PER_CPU(struct kernel_stat, kstat); +EXPORT_PER_CPU_SYMBOL(kstat); + +unsigned long nr_iowait_task_cpu(const task_t *p) +{ + return scheduler->nr_iowait_task_cpu(p); +} + +/* + * Do the virtual cpu time signal calculations. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in user space since the last update + */ +static inline void account_it_virt(struct task_struct * p, cputime_t cputime) +{ + cputime_t it_virt = p->it_virt_value; + + if (cputime_gt(it_virt, cputime_zero) && + cputime_gt(cputime, cputime_zero)) { + if (cputime_ge(cputime, it_virt)) { + it_virt = cputime_add(it_virt, p->it_virt_incr); + send_sig(SIGVTALRM, p, 1); + } + it_virt = cputime_sub(it_virt, cputime); + p->it_virt_value = it_virt; + } +} + +/* + * Do the virtual profiling signal calculations. 
+ * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in user and kernel space since the last update + */ +static void account_it_prof(struct task_struct *p, cputime_t cputime) +{ + cputime_t it_prof = p->it_prof_value; + + if (cputime_gt(it_prof, cputime_zero) && + cputime_gt(cputime, cputime_zero)) { + if (cputime_ge(cputime, it_prof)) { + it_prof = cputime_add(it_prof, p->it_prof_incr); + send_sig(SIGPROF, p, 1); + } + it_prof = cputime_sub(it_prof, cputime); + p->it_prof_value = it_prof; + } +} + +/* + * Check if the process went over its cputime resource limit after + * some cpu time got added to utime/stime. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in user and kernel space since the last update + */ +static void check_rlimit(struct task_struct *p, cputime_t cputime) +{ + cputime_t total, tmp; + + total = cputime_add(p->utime, p->stime); + tmp = jiffies_to_cputime(p->signal->rlim[RLIMIT_CPU].rlim_cur); + if (unlikely(cputime_gt(total, tmp))) { + /* Send SIGXCPU every second. */ + tmp = cputime_sub(total, cputime); + if (cputime_to_secs(tmp) < cputime_to_secs(total)) + send_sig(SIGXCPU, p, 1); + /* and SIGKILL when we go over max.. */ + tmp = jiffies_to_cputime(p->signal->rlim[RLIMIT_CPU].rlim_max); + if (cputime_gt(total, tmp)) + send_sig(SIGKILL, p, 1); + } +} + +/* + * Account user cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() + * @cputime: the cpu time spent in user space since the last update + */ +void account_user_time(struct task_struct *p, cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t tmp; + + p->utime = cputime_add(p->utime, cputime); + + /* Check for signals (SIGVTALRM, SIGPROF, SIGXCPU & SIGKILL). 
*/ + if (likely(p->signal)) + check_rlimit(p, cputime); + account_it_virt(p, cputime); + account_it_prof(p, cputime); + + /* Add user time to cpustat. */ + tmp = cputime_to_cputime64(cputime); + if (task_nice(p) > 0) + cpustat->nice = cputime64_add(cpustat->nice, tmp); + else + cpustat->user = cputime64_add(cpustat->user, tmp); +} + +/* + * Account system cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() + * @cputime: the cpu time spent in kernel space since the last update + */ +void account_system_time(struct task_struct *p, int hardirq_offset, + cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t tmp; + + p->stime = cputime_add(p->stime, cputime); + + /* Check for signals (SIGPROF, SIGXCPU & SIGKILL). */ + if (likely(p->signal)) + check_rlimit(p, cputime); + account_it_prof(p, cputime); + + /* Add system time to cpustat. */ + tmp = cputime_to_cputime64(cputime); + if (hardirq_count() - hardirq_offset) + cpustat->irq = cputime64_add(cpustat->irq, tmp); + else if (softirq_count()) + cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + else if (!is_idle_task(p)) + cpustat->system = cputime64_add(cpustat->system, tmp); + else if (nr_iowait_task_cpu(p) > 0) + cpustat->iowait = cputime64_add(cpustat->iowait, tmp); + else + cpustat->idle = cputime64_add(cpustat->idle, tmp); +} + +/* + * Account for involuntary wait time. 
+ * @p: the process from which the cpu time has been stolen + * @steal: the cpu time spent in involuntary wait + */ +void account_steal_time(struct task_struct *p, cputime_t steal) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t steal64 = cputime_to_cputime64(steal); + + if (is_idle_task(p)) + cpustat->system = cputime64_add(cpustat->system, steal64); + else + cpustat->steal = cputime64_add(cpustat->steal, steal64); +} + +unsigned int task_cpu(const struct task_struct *p); + +void set_task_cpu(struct task_struct *p, unsigned int cpu); + +#ifdef CONFIG_SMP +/*** + * kick_process - kick a running thread to enter/exit the kernel + * @p: the to-be-kicked thread + * + * Cause a process which is running on another CPU to enter + * kernel-mode, without any delay. (to get signals handled.) + */ +void kick_process(task_t *p) +{ + int cpu; + + preempt_disable(); + cpu = task_cpu(p); + if ((cpu != smp_processor_id()) && task_curr(p)) + smp_send_reschedule(cpu); + preempt_enable(); +} + +/* + * Wrappers for p->thread_info->cpu access. No-op on UP. + */ +unsigned int common_task_cpu(const struct task_struct *p) +{ + return p->thread_info->cpu; +} + +void common_set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + p->thread_info->cpu = cpu; +} + +#else + +unsigned int common_task_cpu(const struct task_struct *p) +{ + return 0; +} + +void common_set_task_cpu(struct task_struct *p, unsigned int cpu) +{ +} +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_PREEMPT +#ifdef CONFIG_DEBUG_PREEMPT + +void fastcall add_preempt_count(int val) +{ + /* + * Underflow? + */ + BUG_ON(((int)preempt_count() < 0)); + preempt_count() += val; + /* + * Spinlock count overflowing soon? + */ + BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); +} +EXPORT_SYMBOL(add_preempt_count); + +void fastcall sub_preempt_count(int val) +{ + /* + * Underflow? + */ + BUG_ON(val > preempt_count()); + /* + * Is the spinlock portion underflowing? 
+ */ + BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); + preempt_count() -= val; +} +EXPORT_SYMBOL(sub_preempt_count); + +#endif + +/* + * this is the entry point to schedule() from in-kernel preemption + * off of preempt_enable. Kernel preemptions off return from interrupt + * occur there and call schedule directly. + */ +asmlinkage void __sched preempt_schedule(void) +{ + struct thread_info *ti = current_thread_info(); +#ifdef CONFIG_PREEMPT_BKL + struct task_struct *task = current; + int saved_lock_depth; +#endif + /* + * If there is a non-zero preempt_count or interrupts are disabled, + * we do not want to preempt the current task. Just return.. + */ + if (unlikely(ti->preempt_count || irqs_disabled())) + return; + +need_resched: + add_preempt_count(PREEMPT_ACTIVE); + /* + * We keep the big kernel semaphore locked, but we + * clear ->lock_depth so that schedule() does not + * auto-release the semaphore: + */ +#ifdef CONFIG_PREEMPT_BKL + saved_lock_depth = task->lock_depth; + task->lock_depth = -1; +#endif + schedule(); +#ifdef CONFIG_PREEMPT_BKL + task->lock_depth = saved_lock_depth; +#endif + sub_preempt_count(PREEMPT_ACTIVE); + + /* we could miss a preemption opportunity between schedule and now */ + barrier(); + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + goto need_resched; +} + +EXPORT_SYMBOL(preempt_schedule); +#endif /* CONFIG_PREEMPT */ + +#define SLEEP_ON_VAR \ + unsigned long flags; \ + wait_queue_t wait; \ + init_waitqueue_entry(&wait, current); + +#define SLEEP_ON_HEAD \ + spin_lock_irqsave(&q->lock,flags); \ + __add_wait_queue(q, &wait); \ + spin_unlock(&q->lock); + +#define SLEEP_ON_TAIL \ + spin_lock_irq(&q->lock); \ + __remove_wait_queue(q, &wait); \ + spin_unlock_irqrestore(&q->lock, flags); + +void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) +{ + SLEEP_ON_VAR + + current->state = TASK_INTERRUPTIBLE; + + SLEEP_ON_HEAD + schedule(); + SLEEP_ON_TAIL +} + +EXPORT_SYMBOL(interruptible_sleep_on); + +long 
fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) +{ + SLEEP_ON_VAR + + current->state = TASK_INTERRUPTIBLE; + + SLEEP_ON_HEAD + timeout = schedule_timeout(timeout); + SLEEP_ON_TAIL + + return timeout; +} + +EXPORT_SYMBOL(interruptible_sleep_on_timeout); + +void fastcall __sched sleep_on(wait_queue_head_t *q) +{ + SLEEP_ON_VAR + + current->state = TASK_UNINTERRUPTIBLE; + + SLEEP_ON_HEAD + schedule(); + SLEEP_ON_TAIL +} + +EXPORT_SYMBOL(sleep_on); + +long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) +{ + SLEEP_ON_VAR + + current->state = TASK_UNINTERRUPTIBLE; + + SLEEP_ON_HEAD + timeout = schedule_timeout(timeout); + SLEEP_ON_TAIL + + return timeout; +} + +EXPORT_SYMBOL(sleep_on_timeout); + +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +void __might_sleep(char *file, int line) +{ +#if defined(in_atomic) + static unsigned long prev_jiffy; /* ratelimiting */ + + if ((in_atomic() || irqs_disabled()) && + system_state == SYSTEM_RUNNING) { + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + printk(KERN_ERR "Debug: sleeping function called from invalid" + " context at %s:%d\n", file, line); + printk("in_atomic():%d, irqs_disabled():%d\n", + in_atomic(), irqs_disabled()); + dump_stack(); + } +#endif +} +EXPORT_SYMBOL(__might_sleep); +#endif + +#ifdef __ARCH_WANT_SYS_NICE + +/* + * sys_nice - change the priority of the current process. + * @increment: priority increment + * + * sys_setpriority is a more generic, but much slower function that + * does similar things. + */ +asmlinkage long sys_nice(int increment) +{ + int retval; + long nice; + + /* + * Setpriority might change our priority at the same moment. + * We don't have to worry. Conceptually one call occurs first + * and we have a single winner. 
+ */ + if (increment < 0) { + if (!capable(CAP_SYS_NICE)) + return -EPERM; + if (increment < -40) + increment = -40; + } + if (increment > 40) + increment = 40; + + nice = task_nice(current) + increment; + if (nice < -20) + nice = -20; + if (nice > 19) + nice = 19; + + retval = security_task_setnice(current, nice); + if (retval) + return retval; + + set_user_nice(current, nice); + return 0; +} + +#endif + +/** + * find_process_by_pid - find a process with a matching PID value. + * @pid: the pid in question. + */ +task_t *find_process_by_pid(pid_t pid) +{ + return pid ? find_task_by_pid(pid) : current; +} + +int setscheduler(pid_t pid, int policy, struct sched_param __user *param); + +/** + * sys_sched_setscheduler - set/change the scheduler policy and RT priority + * @pid: the pid in question. + * @policy: new policy + * @param: structure containing the new RT priority. + */ +asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, + struct sched_param __user *param) +{ + return setscheduler(pid, policy, param); +} + +/** + * sys_sched_setparam - set/change the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the new RT priority. + */ +asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) +{ + return setscheduler(pid, -1, param); +} + +/** + * sys_sched_getscheduler - get the policy (scheduling class) of a thread + * @pid: the pid in question. + */ +asmlinkage long sys_sched_getscheduler(pid_t pid) +{ + int retval = -EINVAL; + task_t *p; + + if (pid < 0) + goto out_nounlock; + + retval = -ESRCH; + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + if (p) { + retval = security_task_getscheduler(p); + if (!retval) + retval = p->policy; + } + read_unlock(&tasklist_lock); + +out_nounlock: + return retval; +} + +/** + * sys_sched_getparam - get the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the RT priority.
+ */ +asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) +{ + struct sched_param lp; + int retval = -EINVAL; + task_t *p; + + if (!param || pid < 0) + goto out_nounlock; + + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + retval = -ESRCH; + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + lp.sched_priority = p->rt_priority; + read_unlock(&tasklist_lock); + + /* + * This one might sleep, we cannot do it with a spinlock held ... + */ + retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; + +out_nounlock: + return retval; + +out_unlock: + read_unlock(&tasklist_lock); + return retval; +} + +long sched_setaffinity(pid_t pid, cpumask_t new_mask) +{ + task_t *p; + int retval; + cpumask_t cpus_allowed; + + lock_cpu_hotplug(); + read_lock(&tasklist_lock); + + p = find_process_by_pid(pid); + if (!p) { + read_unlock(&tasklist_lock); + unlock_cpu_hotplug(); + return -ESRCH; + } + + /* + * It is not safe to call set_cpus_allowed with the + * tasklist_lock held. We will bump the task_struct's + * usage count and then drop tasklist_lock. + */ + get_task_struct(p); + read_unlock(&tasklist_lock); + + retval = -EPERM; + if ((current->euid != p->euid) && (current->euid != p->uid) && + !capable(CAP_SYS_NICE)) + goto out_unlock; + + cpus_allowed = cpuset_cpus_allowed(p); + cpus_and(new_mask, new_mask, cpus_allowed); + retval = set_cpus_allowed(p, new_mask); + +out_unlock: + put_task_struct(p); + unlock_cpu_hotplug(); + return retval; +} + +static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, + cpumask_t *new_mask) +{ + if (len < sizeof(cpumask_t)) { + memset(new_mask, 0, sizeof(cpumask_t)); + } else if (len > sizeof(cpumask_t)) { + len = sizeof(cpumask_t); + } + return copy_from_user(new_mask, user_mask_ptr, len) ? 
-EFAULT : 0; +} + +/** + * sys_sched_setaffinity - set the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to the new cpu mask + */ +asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, + unsigned long __user *user_mask_ptr) +{ + cpumask_t new_mask; + int retval; + + retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); + if (retval) + return retval; + + return sched_setaffinity(pid, new_mask); +} + +/* + * Represents all cpu's present in the system + * In systems capable of hotplug, this map could dynamically grow + * as new cpu's are detected in the system via any platform specific + * method, such as ACPI for e.g. + */ + +cpumask_t cpu_present_map; +EXPORT_SYMBOL(cpu_present_map); + +#ifndef CONFIG_SMP +cpumask_t cpu_online_map = CPU_MASK_ALL; +cpumask_t cpu_possible_map = CPU_MASK_ALL; +#endif + +long sched_getaffinity(pid_t pid, cpumask_t *mask) +{ + int retval; + task_t *p; + + lock_cpu_hotplug(); + read_lock(&tasklist_lock); + + retval = -ESRCH; + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = 0; + cpus_and(*mask, p->cpus_allowed, cpu_possible_map); + +out_unlock: + read_unlock(&tasklist_lock); + unlock_cpu_hotplug(); + if (retval) + return retval; + + return 0; +} + +/** + * sys_sched_getaffinity - get the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to hold the current cpu mask + */ +asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, + unsigned long __user *user_mask_ptr) +{ + int ret; + cpumask_t mask; + + if (len < sizeof(cpumask_t)) + return -EINVAL; + + ret = sched_getaffinity(pid, &mask); + if (ret < 0) + return ret; + + if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) + return -EFAULT; + + return sizeof(cpumask_t); +} + +/** + * 
sys_sched_get_priority_max - return maximum RT priority. + * @policy: scheduling class. + * + * this syscall returns the maximum rt_priority that can be used + * by a given scheduling class. + */ +asmlinkage long sys_sched_get_priority_max(int policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = MAX_USER_RT_PRIO-1; + break; + case SCHED_NORMAL: + ret = 0; + break; + } + return ret; +} + +/** + * sys_sched_get_priority_min - return minimum RT priority. + * @policy: scheduling class. + * + * this syscall returns the minimum rt_priority that can be used + * by a given scheduling class. + */ +asmlinkage long sys_sched_get_priority_min(int policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = 1; + break; + case SCHED_NORMAL: + ret = 0; + } + return ret; +} + +static inline void __cond_resched(void) +{ + do { + add_preempt_count(PREEMPT_ACTIVE); + schedule(); + sub_preempt_count(PREEMPT_ACTIVE); + } while (need_resched()); +} + +int __sched cond_resched(void) +{ + if (need_resched()) { + __cond_resched(); + return 1; + } + return 0; +} + +EXPORT_SYMBOL(cond_resched); + +/* + * cond_resched_lock() - if a reschedule is pending, drop the given lock, + * call schedule, and on return reacquire the lock. + * + * This works OK both with and without CONFIG_PREEMPT. We do strange low-level + * operations here to prevent schedule() from being called twice (once via + * spin_unlock(), once by hand). 
+ */ +int cond_resched_lock(spinlock_t * lock) +{ +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) + if (lock->break_lock) { + lock->break_lock = 0; + spin_unlock(lock); + cpu_relax(); + spin_lock(lock); + } +#endif + if (need_resched()) { + _raw_spin_unlock(lock); + preempt_enable_no_resched(); + __cond_resched(); + spin_lock(lock); + return 1; + } + return 0; +} + +EXPORT_SYMBOL(cond_resched_lock); + +int __sched cond_resched_softirq(void) +{ + BUG_ON(!in_softirq()); + + if (need_resched()) { + __local_bh_enable(); + __cond_resched(); + local_bh_disable(); + return 1; + } + return 0; +} + +EXPORT_SYMBOL(cond_resched_softirq); + +/** + * yield - yield the current processor to other threads. + * + * this is a shortcut for kernel-space yielding - it marks the + * thread runnable and calls sys_sched_yield(). + */ +void __sched yield(void) +{ + set_current_state(TASK_RUNNING); + sys_sched_yield(); +} + +EXPORT_SYMBOL(yield); + +static inline struct task_struct *eldest_child(struct task_struct *p) +{ + if (list_empty(&p->children)) return NULL; + return list_entry(p->children.next,struct task_struct,sibling); +} + +static inline struct task_struct *older_sibling(struct task_struct *p) +{ + if (p->sibling.prev==&p->parent->children) return NULL; + return list_entry(p->sibling.prev,struct task_struct,sibling); +} + +static inline struct task_struct *younger_sibling(struct task_struct *p) +{ + if (p->sibling.next==&p->parent->children) return NULL; + return list_entry(p->sibling.next,struct task_struct,sibling); +} + +static void show_task(task_t * p) +{ + task_t *relative; + unsigned state; + unsigned long free = 0; + static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; + + printk("%-13.13s ", p->comm); + state = p->state ? 
__ffs(p->state) + 1 : 0; + if (state < ARRAY_SIZE(stat_nam)) + printk(stat_nam[state]); + else + printk("?"); +#if (BITS_PER_LONG == 32) + if (state == TASK_RUNNING) + printk(" running "); + else + printk(" %08lX ", thread_saved_pc(p)); +#else + if (state == TASK_RUNNING) + printk(" running task "); + else + printk(" %016lx ", thread_saved_pc(p)); +#endif +#ifdef CONFIG_DEBUG_STACK_USAGE + { + unsigned long * n = (unsigned long *) (p->thread_info+1); + while (!*n) + n++; + free = (unsigned long) n - (unsigned long)(p->thread_info+1); + } +#endif + printk("%5lu %5d %6d ", free, p->pid, p->parent->pid); + if ((relative = eldest_child(p))) + printk("%5d ", relative->pid); + else + printk(" "); + if ((relative = younger_sibling(p))) + printk("%7d", relative->pid); + else + printk(" "); + if ((relative = older_sibling(p))) + printk(" %5d", relative->pid); + else + printk(" "); + if (!p->mm) + printk(" (L-TLB)\n"); + else + printk(" (NOTLB)\n"); + + if (state != TASK_RUNNING) + show_stack(p, NULL); +} + +void show_state(void) +{ + task_t *g, *p; + +#if (BITS_PER_LONG == 32) + printk("\n" + " sibling\n"); + printk(" task PC pid father child younger older\n"); +#else + printk("\n" + " sibling\n"); + printk(" task PC pid father child younger older\n"); +#endif + read_lock(&tasklist_lock); + do_each_thread(g, p) { + /* + * reset the NMI-timeout, listing all files on a slow + * console might take alot of time: + */ + touch_nmi_watchdog(); + show_task(p); + } while_each_thread(g, p); + + read_unlock(&tasklist_lock); +} + +/* + * In a system that switches off the HZ timer nohz_cpu_mask + * indicates which cpus entered this state. This is used + * in the rcu update to wait only for active cpus. For system + * which do not switch off the HZ timer nohz_cpu_mask should + * always be CPU_MASK_NONE. 
+ */ +cpumask_t nohz_cpu_mask = CPU_MASK_NONE; + +int in_sched_functions(unsigned long addr) +{ + /* Linker adds these: start and end of __sched functions */ + extern char __sched_text_start[], __sched_text_end[]; + return in_lock_functions(addr) || + (addr >= (unsigned long)__sched_text_start + && addr < (unsigned long)__sched_text_end); +} + +int try_to_wake_up(task_t *task, unsigned state, int sync); + +int fastcall wake_up_state(task_t *p, unsigned int state) +{ + return try_to_wake_up(p, state, 0); +} + +int fastcall wake_up_process(task_t * p) +{ + return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | + TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); +} +EXPORT_SYMBOL(wake_up_process); + +/* + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve + * number) then we wake all the non-exclusive tasks and one exclusive task. + * + * There are circumstances in which we can try to wake a task which has already + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by continuing to scan the queue. + */ +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, int sync, void *key) +{ + struct list_head *tmp, *next; + + list_for_each_safe(tmp, next, &q->task_list) { + wait_queue_t *curr; + unsigned flags; + curr = list_entry(tmp, wait_queue_t, task_list); + flags = curr->flags; + if (curr->func(curr, mode, sync, key) && + (flags & WQ_FLAG_EXCLUSIVE) && + !--nr_exclusive) + break; + } +} + +/** + * __wake_up - wake up threads blocked on a waitqueue. 
+ * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + */ +void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, void *key) +{ + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, 0, key); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(__wake_up); + +int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) +{ + task_t *p = curr->task; + return try_to_wake_up(p, mode, sync); +} +EXPORT_SYMBOL(default_wake_function); + +/* + * Same as __wake_up but called with the spinlock in wait_queue_head_t held. + */ +void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) +{ + __wake_up_common(q, mode, 1, 0, NULL); +} + +/** + * __wake_up_sync - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * + * The sync wakeup differs in that the waker knows that it will schedule + * away soon, so while the target thread will be woken up, it will not + * be migrated to another CPU - ie. the two threads are 'synchronized' + * with each other. This can prevent needless bouncing between CPUs. + * + * On UP it can prevent extra preemption.
+ */ +void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ + unsigned long flags; + int sync = 1; + + if (unlikely(!q)) + return; + + if (unlikely(!nr_exclusive)) + sync = 0; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, sync, NULL); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ + +void fastcall complete(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done++; + __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, + 1, 0, NULL); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete); + +void fastcall complete_all(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done += UINT_MAX/2; + __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, + 0, 0, NULL); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete_all); + +#ifdef CONFIG_SMP +/* cpus with isolated domains */ +cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; + +/* + * init_sched_build_groups takes an array of groups, the cpumask we wish + * to span, and a pointer to a function which identifies what group a CPU + * belongs to. The return value of group_fn must be a valid index into the + * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we + * keep track of groups covered with a cpumask_t). + * + * init_sched_build_groups will build a circular linked list of the groups + * covered by the given span, and will set each group's ->cpumask correctly, + * and ->cpu_power to 0. 
+ */ +void __devinit init_sched_build_groups(struct sched_group groups[], + cpumask_t span, int (*group_fn)(int cpu)) +{ + struct sched_group *first = NULL, *last = NULL; + cpumask_t covered = CPU_MASK_NONE; + int i; + + for_each_cpu_mask(i, span) { + int group = group_fn(i); + struct sched_group *sg = &groups[group]; + int j; + + if (cpu_isset(i, covered)) + continue; + + sg->cpumask = CPU_MASK_NONE; + sg->cpu_power = 0; + + for_each_cpu_mask(j, span) { + if (group_fn(j) != group) + continue; + + cpu_set(j, covered); + cpu_set(j, sg->cpumask); + } + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; + } + last->next = first; +} +#endif + +#ifdef CONFIG_SCHEDSTATS +/* + * bump this up when changing the output format or the meaning of an existing + * format, so that tools can adapt (or abort) + */ +#define SCHEDSTAT_VERSION 10 + +DEFINE_PER_CPU(struct schedstat_per_cpu_data, schedstat_pcd_data); + +#ifdef CONFIG_SMP +static void show_schedstat_sd(struct seq_file *seq, int cpu); +#else +static inline void show_schedstat_sd(struct seq_file *seq, int cpu) +{ +} +#endif + +int show_schedstat(struct seq_file *seq, void *v) +{ + int cpu; + enum idle_type itype; + + seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); + seq_printf(seq, "timestamp %lu\n", jiffies); + for_each_online_cpu(cpu) { + schedstat_pcd_t *sspcd = cpu_sspcd(cpu); + + /* schedstat_per_cpu-specific stats */ + seq_printf(seq, + "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu " + "%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", + cpu, sspcd->yld_both_empty, + sspcd->yld_act_empty, sspcd->yld_exp_empty, + sspcd->yld_cnt, sspcd->sched_noswitch, + sspcd->sched_switch, sspcd->sched_cnt, sspcd->sched_goidle, + sspcd->alb_cnt, sspcd->alb_gained, sspcd->alb_lost, + sspcd->alb_failed, + sspcd->ttwu_cnt, sspcd->ttwu_moved, sspcd->ttwu_attempts, + sspcd->wunt_cnt, sspcd->wunt_moved, + sspcd->smt_cnt, sspcd->sbe_cnt, sspcd->rq_sched_info.cpu_time, + sspcd->rq_sched_info.run_delay, 
sspcd->rq_sched_info.pcnt); + + for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; itype++) + seq_printf(seq, " %lu %lu", sspcd->pt_gained[itype], + sspcd->pt_lost[itype]); + seq_printf(seq, "\n"); + + /* domain-specific stats */ + show_schedstat_sd(seq, cpu); + } + return 0; +} + +/* + * Called when a process is dequeued from the active array and given + * the cpu. We should note that with the exception of interactive + * tasks, the expired queue will become the active queue after the active + * queue is empty, without explicitly dequeuing and requeuing tasks in the + * expired queue. (Interactive tasks may be requeued directly to the + * active queue, thus delaying tasks in the expired queue from running; + * see scheduler_tick()). + * + * This function is only called from sched_info_arrive(), rather than + * dequeue_task(). Even though a task may be queued and dequeued multiple + * times as it is shuffled about, we're really interested in knowing how + * long it was from the *first* time it was queued to the time that it + * finally hit a cpu. + */ +static inline void sched_info_dequeued(task_t *t) +{ + t->sched_info.last_queued = 0; +} + +/* + * Called when a task finally hits the cpu. We can now calculate how + * long it was waiting to run. We also note when it began so that we + * can keep stats on how long its timeslice is. + */ +static inline void sched_info_arrive(task_t *t) +{ + unsigned long now = jiffies, diff = 0; + schedstat_pcd_t *sspcd = task_sspcd(t); + + if (t->sched_info.last_queued) + diff = now - t->sched_info.last_queued; + sched_info_dequeued(t); + t->sched_info.run_delay += diff; + t->sched_info.last_arrival = now; + t->sched_info.pcnt++; + + if (!sspcd) + return; + + sspcd->rq_sched_info.run_delay += diff; + sspcd->rq_sched_info.pcnt++; +} + +/* + * Called when a process is queued into either the active or expired + * array. The time is noted and later used to determine how long we + * had to wait for us to reach the cpu. 
Since the expired queue will + * become the active queue after active queue is empty, without dequeuing + * and requeuing any tasks, we are interested in queuing to either. It + * is unusual but not impossible for tasks to be dequeued and immediately + * requeued in the same or another array: this can happen in sched_yield(), + * set_user_nice(), and even load_balance() as it moves tasks from runqueue + * to runqueue. + * + * This function is only called from enqueue_task(), but also only updates + * the timestamp if it is already not set. It's assumed that + * sched_info_dequeued() will clear that stamp when appropriate. + */ +void sched_info_queued(task_t *t) +{ + if (!t->sched_info.last_queued) + t->sched_info.last_queued = jiffies; +} + +/* + * Called when a process ceases being the active-running process, either + * voluntarily or involuntarily. Now we can calculate how long we ran. + */ +static inline void sched_info_depart(task_t *t) +{ + schedstat_pcd_t *sspcd = task_sspcd(t); + unsigned long diff = jiffies - t->sched_info.last_arrival; + + t->sched_info.cpu_time += diff; + + if (sspcd) + sspcd->rq_sched_info.cpu_time += diff; +} + +/* + * Called when tasks are switched involuntarily due, typically, to expiring + * their time slice. (This may also be called when switching to or from + * the idle task.) We are only called when prev != next. + */ +void sched_info_switch(task_t *prev, task_t *next) +{ + /* + * prev now departs the cpu. It's not interesting to record + * stats about how efficient we were at scheduling the idle + * process, however. 
+ */ + if (!is_idle_task(prev)) + sched_info_depart(prev); + + if (!is_idle_task(next)) + sched_info_arrive(next); +} + +static int schedstat_open(struct inode *inode, struct file *file) +{ + unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); + char *buf = kmalloc(size, GFP_KERNEL); + struct seq_file *m; + int res; + + if (!buf) + return -ENOMEM; + res = single_open(file, show_schedstat, NULL); + if (!res) { + m = file->private_data; + m->buf = buf; + m->size = size; + } else + kfree(buf); + return res; +} + +struct file_operations proc_schedstat_operations = { + .open = schedstat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + +extern struct sched_drv ingo_sched_drv; +extern struct sched_drv sc_sched_drv; +extern struct sched_drv ms_sched_drv; + +struct sched_drv *scheduler = +#if defined(CONFIG_CPUSCHED_DEFAULT_INGO) + &ingo_sched_drv; +#elif defined(CONFIG_CPUSCHED_DEFAULT_STAIRCASE) + &sc_sched_drv; +#elif defined(CONFIG_CPUSCHED_DEFAULT_MINISCHED) + &ms_sched_drv; +#else + NULL; +#error "You must have at least 1 cpu scheduler selected" +#endif + +static int __init scheduler_setup(char *str) +{ + struct sched_drv *chosen_sched = NULL; +#if defined(CONFIG_CPUSCHED_INGO) + if (!strcmp(str, ingo_sched_drv.cpusched_name)) + chosen_sched = &ingo_sched_drv; +#endif +#if defined(CONFIG_CPUSCHED_STAIRCASE) + if (!strcmp(str, sc_sched_drv.cpusched_name)) + chosen_sched = &sc_sched_drv; +#endif +#if defined(CONFIG_CPUSCHED_MINISCHED) + if (!strcmp(str, ms_sched_drv.cpusched_name)) + chosen_sched = &ms_sched_drv; +#endif + if (chosen_sched && chosen_sched != scheduler) { + /* + * A different cpu scheduler from the default has been + * chosen. We need to reinit the scheduler. Set the scheduler + * pointer to the new chosen scheduler. 
+ */ + scheduler = chosen_sched; + /* Get a fresh init_task from the saved one */ + init_task = base_init_task; + /* Repeat sched_init sequence */ + sched_init(); + preempt_disable(); + } + return 1; +} + +__setup ("cpusched=", scheduler_setup); + +unsigned int task_cpu(const struct task_struct *p) +{ + return scheduler->task_cpu(p); +} + +void set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + scheduler->set_task_cpu(p, cpu); +} + +void init_sched_domain_sysctl(void) +{ + scheduler->init_sched_domain_sysctl(); +} + +void destroy_sched_domain_sysctl(void) +{ + scheduler->destroy_sched_domain_sysctl(); +} + +void fastcall __sched wait_for_completion(struct completion *x) +{ + scheduler->wait_for_completion(x); +} +EXPORT_SYMBOL(wait_for_completion); + +void sched_idle_next(void) +{ + scheduler->sched_idle_next(); +} + +void __sched io_schedule(void) +{ + scheduler->io_schedule(); +} +EXPORT_SYMBOL(io_schedule); + +long __sched io_schedule_timeout(long timeout) +{ + return scheduler->io_schedule_timeout(timeout); +} + +unsigned long nr_running(void) +{ + return scheduler->nr_running(); +} + +unsigned long nr_uninterruptible(void) +{ + return scheduler->nr_uninterruptible(); +} + +unsigned long long nr_context_switches(void) +{ + return scheduler->nr_context_switches(); +} + +unsigned long nr_iowait(void) +{ + return scheduler->nr_iowait(); +} + +int rt_task(task_t *task) +{ + return scheduler->rt_task(task); +} + +int idle_cpu(int cpu) +{ + return scheduler->idle_cpu(cpu); +} +EXPORT_SYMBOL_GPL(idle_cpu); + +void __devinit init_idle(task_t *task, int cpu) +{ + scheduler->init_idle(task, cpu); +} + +void __init sched_init(void) +{ + scheduler->init(); +} + +void __init sched_init_smp(void) +{ + scheduler->init_smp(); +} + +asmlinkage void schedule(void) +{ + scheduler->schedule(); +} +EXPORT_SYMBOL(schedule); + +void scheduler_tick(void) +{ + scheduler->tick(); +} + +#ifdef CONFIG_SMP +int migration_init(void) +{ + return scheduler->migration_init(); +} + +int 
set_cpus_allowed(task_t *task, cpumask_t cpus) +{ + return scheduler->set_cpus_allowed(task, cpus); +} +EXPORT_SYMBOL_GPL(set_cpus_allowed); + +void wait_task_inactive(task_t * task) +{ + scheduler->wait_task_inactive(task); +} + +void sched_exec(void) +{ + scheduler->exec(); +} + +void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) +{ + scheduler->cpu_attach_domain(sd, cpu); +} +#endif + +void set_user_nice(task_t *task, long nice) +{ + scheduler->set_user_nice(task, nice); +} +EXPORT_SYMBOL(set_user_nice); + +void set_oom_timeslice(task_t *p) +{ + scheduler->set_oom_timeslice(p); +} + +asmlinkage +long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) +{ + return scheduler->rr_get_interval(pid, interval); +} + +asmlinkage long sys_sched_yield(void) +{ + return scheduler->yield(); +} + +int setscheduler(pid_t pid, int policy, struct sched_param __user *param) +{ + return scheduler->setscheduler(pid, policy, param); +} + +int is_idle_task(const task_t *task) +{ + return scheduler->is_idle_task(task); +} + +int task_curr(const task_t *task) +{ + return scheduler->task_curr(task); +} + +int task_nice(const task_t *task) +{ + return scheduler->task_nice(task); +} + +int task_prio(const task_t *task) +{ + return scheduler->task_prio(task); +} + +int try_to_wake_up(task_t *task, unsigned state, int sync) +{ + return scheduler->try_to_wake_up(task, state, sync); +} + +void fastcall wake_up_new_task(task_t *task, unsigned long flags) +{ + scheduler->wake_up_new_task(task, flags); +} + +void fastcall sched_fork(task_t *task) +{ + scheduler->fork(task); +} + +void fastcall sched_exit(task_t *task) +{ + scheduler->exit(task); +} + +asmlinkage void schedule_tail(task_t *task) +{ + scheduler->tail(task); +} + +#ifdef CONFIG_SCHEDSTATS +#ifdef CONFIG_SMP +static void show_schedstat_sd(struct seq_file *seq, int cpu) +{ + scheduler->show_schedstat_sd(seq, cpu); +} +#endif +#endif + +#ifdef CONFIG_MAGIC_SYSRQ +void normalize_rt_tasks(void) +{ + 
scheduler->normalize_rt_tasks(); +} +#endif /* CONFIG_MAGIC_SYSRQ */ + +#ifdef CONFIG_KGDB +struct task_struct *kgdb_get_idle(int this_cpu) +{ + return scheduler->kgdb_get_idle(this_cpu); +} +#endif Index: linux-2.6.10-rc1-mm5/kernel/staircase.c =================================================================== --- linux-2.6.10-rc1-mm5.orig/kernel/staircase.c 2003-03-27 19:01:40.000000000 +1100 +++ linux-2.6.10-rc1-mm5/kernel/staircase.c 2004-11-11 22:08:34.000000000 +1100 @@ -0,0 +1,3611 @@ +/* + * kernel/staircase.c + * + * This is the staircase cpu scheduler + * + * Copyright (C) 1991-2002 Linus Torvalds + * + * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and + * make semaphores SMP safe + * 1998-11-19 Implemented schedule_timeout() and related stuff + * by Andrea Arcangeli + * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: + * hybrid priority-list and round-robin design with + * an array-switch method of distributing timeslices + * and per-CPU runqueues. Cleanups and useful suggestions + * by Davide Libenzi, preemptible kernel bits by Robert Love. + * 2004-04-02 Scheduler domains code by Nick Piggin + * 2004-10-28 New staircase scheduling policy by Con Kolivas with help + * from William Lee Irwin III, Zwane Mwaikambo & Peter Williams. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * Unique staircase process flags used by scheduler. 
+ */
+#define SF_FORKED	0x00000001	/* I have just forked */
+#define SF_YIELDED	0x00000002	/* I have just yielded */
+#define SF_UISLEEP	0x00000004	/* Uninterruptible sleep */
+
+/*
+ * Priority of a process goes from 0..MAX_PRIO-1, valid RT
+ * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL tasks are
+ * in the range MAX_RT_PRIO..MAX_PRIO-1. Priority values
+ * are inverted: lower p->u.scsched.prio value means higher priority.
+ */
+
+#define MAX_PRIO		(MAX_RT_PRIO + 40)
+
+/*
+ * Convert user-nice values [ -20 ... 0 ... 19 ]
+ * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
+ * and back.
+ */
+#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
+#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
+#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)
+
+/*
+ * 'User priority' is the nice value converted to something we
+ * can work with better when scaling various scheduler parameters,
+ * it's a [ 0 ... 39 ] range.
+ */
+#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
+#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
+#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
+
+/*
+ * Some helpers for converting nanosecond timing to jiffy resolution
+ */
+#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
+#define NSJIFFY			(1000000000 / HZ)	/* One jiffy in ns */
+
+int sched_compute = 0;
+/*
+ * This is the interval at which tasks of the same priority round robin.
+ * The compute setting is reserved for dedicated computational scheduling
+ * and makes the interval ten times larger (see RR_INTERVAL() below).
+ */
+#define _RR_INTERVAL		((10 * HZ / 1000) ? : 1)	/* 10ms in jiffies, never 0 */
+#define RR_INTERVAL()		(_RR_INTERVAL * (1 + 9 * sched_compute))
+
+#define task_hot(p, now, sd) ((long long) \
+	((now) - (p)->u.scsched.timestamp) < \
+	(long long) (sd)->cache_hot_time)
+
+/*
+ * These are the runqueue data structures:
+ */
+
+typedef struct runqueue runqueue_t;
+
+/*
+ * This is the main, per-CPU runqueue data structure.
+ * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ +struct runqueue { + spinlock_t lock; + + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. + */ + unsigned long nr_running; +#ifdef CONFIG_SMP + unsigned long cpu_load; +#endif + unsigned long long nr_switches; + unsigned long nr_uninterruptible; + unsigned long long timestamp_last_tick; + unsigned int cache_ticks, preempted; + task_t *curr, *idle; + struct mm_struct *prev_mm; + unsigned long bitmap[BITS_TO_LONGS(MAX_PRIO+1)]; + struct list_head queue[MAX_PRIO + 1]; + atomic_t nr_iowait; + +#ifdef CONFIG_SMP + struct sched_domain *sd; + + /* For active balancing */ + int active_balance; + int push_cpu; + + task_t *migration_thread; + struct list_head migration_queue; +#endif + +#ifdef CONFIG_SCHEDSTATS + schedstat_pcd_t *sspcd; +#endif +}; + +static DEFINE_PER_CPU(struct runqueue, runqueues); + +#define for_each_domain(cpu, domain) \ + for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) + +#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) +#define this_rq() (&__get_cpu_var(runqueues)) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) + +static int sc_rt_task(const task_t *p) +{ + return (unlikely((p)->u.scsched.prio < MAX_RT_PRIO)); +} + +/* + * Default context-switch locking: + */ +#ifndef prepare_arch_switch +# define prepare_arch_switch(rq, next) do { } while (0) +# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) +# define task_running(rq, p) ((rq)->curr == (p)) +#endif + +/* + * task_rq_lock - lock the runqueue a given task resides on and disable + * interrupts. Note the ordering: we can safely lookup the task_rq without + * explicitly disabling preemption. 
+ */
+static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
+	__acquires(rq->lock)
+{
+	struct runqueue *rq;
+
+repeat_lock_task:
+	local_irq_save(*flags);
+	rq = task_rq(p);
+	spin_lock(&rq->lock);
+	if (unlikely(rq != task_rq(p))) {	/* task moved runqueue meanwhile: retry */
+		spin_unlock_irqrestore(&rq->lock, *flags);
+		goto repeat_lock_task;
+	}
+	return rq;
+}
+
+static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
+	__releases(rq->lock)
+{
+	spin_unlock_irqrestore(&rq->lock, *flags);
+}
+
+/*
+ * this_rq_lock - disable interrupts and lock this CPU's runqueue.
+ */
+static runqueue_t *this_rq_lock(void)
+	__acquires(rq->lock)
+{
+	runqueue_t *rq;
+
+	local_irq_disable();
+	rq = this_rq();
+	spin_lock(&rq->lock);
+
+	return rq;
+}
+
+static inline void rq_unlock(runqueue_t *rq)
+	__releases(rq->lock)
+{
+	spin_unlock_irq(&rq->lock);
+}
+
+#ifdef CONFIG_SCHED_SMT
+static int cpu_and_siblings_are_idle(int cpu)
+{
+	int sib;
+	for_each_cpu_mask(sib, cpu_sibling_map[cpu]) {
+		if (idle_cpu(sib))
+			continue;
+		return 0;	/* first busy sibling settles it */
+	}
+
+	return 1;
+}
+#else
+#define cpu_and_siblings_are_idle(A) idle_cpu(A)
+#endif
+
+/*
+ * Nanosecond clock difference v1 - v2, capped at 2^31 to fit unsigned long.
+ */
+static unsigned long ns_diff(unsigned long long v1, unsigned long long v2)
+{
+	unsigned long long vdiff;
+	if (unlikely(v1 < v2))
+		/*
+		 * Rarely the clock goes backwards. There should always be
+		 * a positive difference so return 1.
+		 */
+		vdiff = 1;
+	else
+		vdiff = v1 - v2;
+	if (vdiff > (1ULL << 31))	/* 1ULL: a plain (1 << 31) overflows int */
+		vdiff = 1ULL << 31;	/* cap so the cast below cannot truncate */
+	return (unsigned long)vdiff;
+}
+
+static inline int task_queued(task_t *task)
+{
+	return !list_empty(&task->u.scsched.run_list);
+}
+
+/*
+ * Adding/removing a task to/from a runqueue:
+ */
+static void dequeue_task(struct task_struct *p, runqueue_t *rq)
+{
+	list_del_init(&p->u.scsched.run_list);
+	if (list_empty(rq->queue + p->u.scsched.prio))	/* last task at this prio */
+		__clear_bit(p->u.scsched.prio, rq->bitmap);
+	p->u.scsched.ns_debit = 0;
+}
+
+static void enqueue_task(struct task_struct *p, runqueue_t *rq)
+{
+	list_add_tail(&p->u.scsched.run_list, rq->queue + p->u.scsched.prio);
+	__set_bit(p->u.scsched.prio, rq->bitmap);
+}
+
+static void requeue_task(struct task_struct *p, runqueue_t *rq)
+{
+	list_move_tail(&p->u.scsched.run_list, rq->queue + p->u.scsched.prio);
+}
+
+/*
+ * Used by the migration code - we pull tasks from the head of the
+ * remote queue so we want these tasks to show up at the head of the
+ * local queue:
+ */
+static void enqueue_task_head(struct task_struct *p, runqueue_t *rq)
+{
+	list_add(&p->u.scsched.run_list, rq->queue + p->u.scsched.prio);
+	__set_bit(p->u.scsched.prio, rq->bitmap);
+}
+
+/*
+ * __activate_task - move a task to the runqueue.
+ */
+static void __activate_task(task_t *p, runqueue_t *rq)
+{
+	enqueue_task(p, rq);
+	rq->nr_running++;
+}
+
+/*
+ * __activate_idle_task - move idle task to the _front_ of runqueue.
+ */
+static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
+{
+	enqueue_task_head(p, rq);
+	rq->nr_running++;
+}
+
+/*
+ * burst - extra intervals an interactive task can run for at best priority
+ * instead of descending priorities.
+ */
+static unsigned int burst(task_t *p)
+{
+	if (likely(!rt_task(p))) {
+		unsigned int task_user_prio = TASK_USER_PRIO(p);
+		return 39 - task_user_prio;	/* 39 == MAX_USER_PRIO - 1 */
+	} else
+		return p->u.scsched.burst;
+}
+
+static void inc_burst(task_t *p)
+{
+	unsigned int best_burst;
+	best_burst = burst(p);
+	if (p->u.scsched.burst < best_burst)	/* never exceed burst(p) */
+		p->u.scsched.burst++;
+}
+
+static void dec_burst(task_t *p)
+{
+	if (p->u.scsched.burst)	/* never drop below zero */
+		p->u.scsched.burst--;
+}
+
+/*
+ * slice - the duration a task runs before getting requeued at its best
+ * priority and has its burst decremented.
+ */
+static unsigned int slice(task_t *p)
+{
+	unsigned int slice = RR_INTERVAL();
+	if (likely(!rt_task(p)))
+		slice += burst(p) * RR_INTERVAL();	/* one more interval per burst */
+	return slice;
+}
+
+/*
+ * sched_interactive - sysctl which allows interactive tasks to have bursts
+ */
+int sched_interactive = 1;
+
+static unsigned int rr_interval(task_t * p)
+{
+	unsigned int rr_interval = RR_INTERVAL();
+	int nice = TASK_NICE(p);
+
+	if (nice < 0 && !rt_task(p))
+		rr_interval += -(nice);	/* negative nice lengthens the interval */
+	return rr_interval;
+}
+
+static void sc_set_oom_timeslice(task_t *p)
+{
+	p->u.scsched.slice = slice(p) * 10;	/* ten times the normal slice */
+	p->u.scsched.time_slice = p->u.scsched.slice;
+}
+
+/*
+ * effective_prio - dynamic priority dependent on burst.
+ * The priority normally decreases by one each RR_INTERVAL.
+ * As the burst increases the priority stays at the top "stair" or
+ * priority for longer.
+ */ +static int effective_prio(task_t *p) +{ + int prio; + unsigned int full_slice, used_slice, first_slice; + unsigned int best_burst, rr; + if (rt_task(p)) + return p->u.scsched.prio; + + best_burst = burst(p); + full_slice = slice(p); + rr = rr_interval(p); + used_slice = full_slice - p->u.scsched.slice; + if (p->u.scsched.burst > best_burst) + p->u.scsched.burst = best_burst; + first_slice = rr; + if (sched_interactive && !sched_compute && p->mm) + first_slice *= (p->u.scsched.burst + 1); + prio = MAX_PRIO - 1 - best_burst; + + if (used_slice < first_slice) + return prio; + prio += 1 + (used_slice - first_slice) / rr; + if (prio > MAX_PRIO - 1) { + prio = MAX_PRIO - 1; + p->u.scsched.totalrun = 0; + } + return prio; +} + +/* + * recalc_task_prio - this checks for tasks that run ultra short timeslices + * or have just forked a thread/process and make them continue their old + * slice instead of starting a new one at high priority. + */ +static void recalc_task_prio(task_t *p, unsigned long long now) +{ + unsigned long sleep_time = ns_diff(now, p->u.scsched.timestamp); + unsigned int rr = rr_interval(p); + unsigned int best_burst = burst(p); + unsigned int minrun = rr * (p->u.scsched.burst + 1) / (best_burst + 1) ? 
: 1; + if (p->u.scsched.sflags & SF_FORKED || + (NS_TO_JIFFIES(p->u.scsched.runtime + sleep_time) < minrun || + ((!sched_interactive || sched_compute) && + NS_TO_JIFFIES(p->u.scsched.runtime + sleep_time) < rr))) { + unsigned long ns_totalrun = p->u.scsched.totalrun + p->u.scsched.runtime; + unsigned long total_run = NS_TO_JIFFIES(ns_totalrun); + p->u.scsched.sflags &= ~SF_FORKED; + if (p->u.scsched.slice - total_run < 1) { + p->u.scsched.totalrun = 0; + dec_burst(p); + } else { + unsigned int intervals = total_run / rr; + unsigned int remainder; + p->u.scsched.totalrun = ns_totalrun; + p->u.scsched.slice -= intervals * rr; + if (p->u.scsched.slice <= rr) { + p->u.scsched.totalrun = 0; + dec_burst(p); + } else { + remainder = p->u.scsched.slice % rr; + if (remainder) + p->u.scsched.time_slice = remainder; + } + } + } else { + if (NS_TO_JIFFIES(p->u.scsched.totalrun) > (best_burst - p->u.scsched.burst) * rr) + dec_burst(p); + else if (!(p->u.scsched.sflags & SF_UISLEEP || p->u.scsched.totalrun)) + inc_burst(p); + p->u.scsched.runtime = 0; + p->u.scsched.totalrun = 0; + } +} + +/* + * activate_task - move a task to the runqueue and do priority recalculation + * + * Update all the scheduling statistics stuff. (sleep average + * calculation, priority modifiers, etc.) + */ +static void activate_task(task_t *p, runqueue_t *rq, int local) +{ + unsigned long long now = sched_clock(); +#ifdef CONFIG_SMP + if (!local) { + /* Compensate for drifting sched_clock */ + runqueue_t *this_rq = this_rq(); + now = (now - this_rq->timestamp_last_tick) + + rq->timestamp_last_tick; + } +#endif + p->u.scsched.slice = slice(p); + p->u.scsched.time_slice = rr_interval(p); + recalc_task_prio(p, now); + p->u.scsched.sflags &= ~SF_UISLEEP; + p->u.scsched.prio = effective_prio(p); + p->u.scsched.timestamp = now; + __activate_task(p, rq); +} + +/* + * deactivate_task - remove a task from the runqueue. 
+ */
+static void deactivate_task(struct task_struct *p, runqueue_t *rq)
+{
+	rq->nr_running--;
+	if (p->state == TASK_UNINTERRUPTIBLE) {
+		p->u.scsched.sflags |= SF_UISLEEP;	/* tested by recalc_task_prio() */
+		rq->nr_uninterruptible++;
+	}
+	dequeue_task(p, rq);
+}
+
+/*
+ * resched_task - mark a task 'to be rescheduled now'.
+ *
+ * On UP this means the setting of the need_resched flag, on SMP it
+ * might also involve a cross-CPU call to trigger the scheduler on
+ * the target CPU.
+ *
+ * The SMP variant BUG()s unless the task's runqueue lock is held. It
+ * sends the reschedule IPI only when TIF_NEED_RESCHED was not already
+ * set, the task was not seen with TIF_POLLING_NRFLAG either before or
+ * after setting it, and the task's CPU is not the one we are running on.
+ */
+#ifdef CONFIG_SMP
+static void resched_task(task_t *p)
+{
+	int need_resched, nrpolling;
+
+	BUG_ON(!spin_is_locked(&task_rq(p)->lock));
+
+	/* minimise the chance of sending an interrupt to poll_idle() */
+	nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
+	need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
+	nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
+
+	if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
+		smp_send_reschedule(task_cpu(p));
+}
+#else
+static inline void resched_task(task_t *p)
+{
+	set_tsk_need_resched(p);
+}
+#endif
+
+/**
+ * task_curr - is this task currently executing on a CPU?
+ * @p: the task in question.
+ *
+ * Compares @p with cpu_curr() of the CPU that @p is assigned to and
+ * returns nonzero when they are the same task.
+ */
+static int sc_task_curr(const task_t *p)
+{
+	return cpu_curr(task_cpu(p)) == p;
+}
+
+#ifdef CONFIG_SMP
+enum request_type {
+	REQ_MOVE_TASK,		/* uses the task and dest_cpu fields below */
+	REQ_SET_DOMAIN,		/* uses the sd field below */
+};
+
+typedef struct {
+	struct list_head list;	/* linked on rq->migration_queue by migrate_task() */
+	enum request_type type;
+
+	/* For REQ_MOVE_TASK */
+	task_t *task;
+	int dest_cpu;
+
+	/* For REQ_SET_DOMAIN */
+	struct sched_domain *sd;
+
+	struct completion done;	/* the requester waits on this */
+} migration_req_t;
+
+/*
+ * The task's runqueue lock must be held.
+ * Returns true if you have to wait for migration thread.
+ *
+ * When 0 is returned the task's cpu field has simply been set to
+ * @dest_cpu and @req is left untouched. When 1 is returned @req has been
+ * filled in as a REQ_MOVE_TASK request and added to the migration_queue
+ * of the task's current runqueue.
+ */
+static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
+{
+	runqueue_t *rq = task_rq(p);
+
+	/*
+	 * If the task is not on a runqueue (and not running), then
+	 * it is sufficient to simply update the task's cpu field.
+	 */
+	if (!task_queued(p) && !task_running(rq, p)) {
+		set_task_cpu(p, dest_cpu);
+		return 0;
+	}
+
+	init_completion(&req->done);
+	req->type = REQ_MOVE_TASK;
+	req->task = p;
+	req->dest_cpu = dest_cpu;
+	list_add(&req->list, &rq->migration_queue);
+	return 1;
+}
+
+/*
+ * wait_task_inactive - wait for a thread to unschedule.
+ *
+ * The caller must ensure that the task *will* unschedule sometime soon,
+ * else this function might spin for a *long* time. This function can't
+ * be called with interrupts off, or it may introduce deadlock with
+ * smp_call_function() if an IPI is sent by the same process we are
+ * waiting to become inactive.
+ *
+ * Each attempt takes the task's runqueue lock, checks task_queued(), and
+ * drops the lock again before relaxing (and, if the task was queued but
+ * not running, yielding) and retrying.
+ */
+static void sc_wait_task_inactive(task_t * p)
+{
+	unsigned long flags;
+	runqueue_t *rq;
+	int preempted;
+
+repeat:
+	rq = task_rq_lock(p, &flags);
+	/* Must be off runqueue entirely, not preempted. */
+	if (unlikely(task_queued(p))) {
+		/* If it's preempted, we yield.  It could be a while. */
+		preempted = !task_running(rq, p);
+		task_rq_unlock(rq, &flags);
+		cpu_relax();
+		if (preempted)
+			yield();
+		goto repeat;
+	}
+	task_rq_unlock(rq, &flags);
+}
+
+/*
+ * Return a low guess at the load of a migration-source cpu.
+ *
+ * We want to under-estimate the load of migration sources, to
+ * balance conservatively.
+ *
+ * The guess is the smaller of rq->cpu_load and the instantaneous
+ * nr_running * SCHED_LOAD_SCALE. The runqueue is read without locking.
+ */
+static inline unsigned long source_load(int cpu)
+{
+	runqueue_t *rq = cpu_rq(cpu);
+	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+
+	return min(rq->cpu_load, load_now);
+}
+
+/*
+ * Return a high guess at the load of a migration-target cpu
+ *
+ * The guess is the larger of rq->cpu_load and the instantaneous
+ * nr_running * SCHED_LOAD_SCALE.
+ */
+static inline unsigned long target_load(int cpu)
+{
+	runqueue_t *rq = cpu_rq(cpu);
+	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+
+	return max(rq->cpu_load, load_now);
+}
+
+#endif
+
+/*
+ * wake_idle() will wake a task on an idle cpu if task->cpu is
+ * not idle and an idle cpu is available.  The span of cpus to
+ * search starts with cpus closest then further out as needed,
+ * so we always favor a closer, idle cpu.
+ *
+ * Returns the CPU we should wake onto.
+ */
+#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
+static int wake_idle(int cpu, task_t *p)
+{
+	cpumask_t tmp;
+	struct sched_domain *sd;
+	int i;
+
+	if (idle_cpu(cpu))
+		return cpu;
+
+	for_each_domain(cpu, sd) {
+		if (sd->flags & SD_WAKE_IDLE) {
+			/* candidates: in this domain, online, and allowed for p */
+			cpus_and(tmp, sd->span, cpu_online_map);
+			cpus_and(tmp, tmp, p->cpus_allowed);
+			for_each_cpu_mask(i, tmp) {
+				if (idle_cpu(i))
+					return i;
+			}
+		}
+		else break;	/* stop at the first domain without SD_WAKE_IDLE */
+	}
+	return cpu;
+}
+#else
+static inline int wake_idle(int cpu, task_t *p)
+{
+	return cpu;	/* no idle search without ARCH_HAS_SCHED_WAKE_IDLE */
+}
+#endif
+
+/*
+ * cache_delay is the time preemption is delayed in sched_compute mode
+ * and is set to 5*cache_decay_ticks on SMP or a nominal 10ms on UP.
+ */
+static int cache_delay = 10 * HZ / 1000;
+
+/*
+ * Check to see if p preempts rq->curr and resched if it does. In compute
+ * mode we do not preempt for at least cache_delay and set rq->preempted.
+ *
+ * p does not preempt when its prio value is numerically greater than
+ * rq->curr's, nor at equal prio when p->u.scsched.totalrun is nonzero or
+ * rq->curr is a realtime task. Once p qualifies, rq->preempted is set
+ * whether or not resched_task() was actually called; the resched itself
+ * is withheld only in compute mode while rq->cache_ticks is still below
+ * cache_delay, and only for non-realtime tasks that have an mm.
+ */
+static void preempt(task_t *p, runqueue_t *rq)
+{
+	if (p->u.scsched.prio > rq->curr->u.scsched.prio)
+		return;
+	if (p->u.scsched.prio == rq->curr->u.scsched.prio && (p->u.scsched.totalrun ||
+		rt_task(rq->curr)))
+		return;
+	if (!sched_compute || rq->cache_ticks >= cache_delay ||
+		!p->mm || rt_task(p))
+		resched_task(rq->curr);
+	rq->preempted = 1;
+}
+
+/***
+ * try_to_wake_up - wake up a thread
+ * @p: the to-be-woken-up thread
+ * @state: the mask of task states that can be woken
+ * @sync: do a synchronous wakeup?
+ *
+ * Put it on the run-queue if it's not already there. The "current"
+ * thread is always on the run-queue (except when the actual
+ * re-schedule is in progress), and as such you're allowed to do
+ * the simpler "current->state = TASK_RUNNING" to mark yourself
+ * runnable without the overhead of this.
+ *
+ * returns failure only if the task is already active.
+ */
+static int sc_try_to_wake_up(task_t * p, unsigned int state, int sync)
+{
+	int cpu, this_cpu, success = 0;
+	unsigned long flags;
+	long old_state;
+	runqueue_t *rq, *old_rq;
+#ifdef CONFIG_SMP
+	unsigned long load, this_load;
+	struct sched_domain *sd;
+	int new_cpu;
+#endif
+
+	old_rq = rq = task_rq_lock(p, &flags);
+	schedstat_inc(rq->sspcd, ttwu_cnt);
+	old_state = p->state;
+	if (!(old_state & state))
+		goto out;	/* not in a state the caller asked to wake */
+
+	if (task_queued(p))
+		goto out_running;	/* already queued: only fix up ->state */
+
+	cpu = task_cpu(p);
+	this_cpu = smp_processor_id();
+
+#ifdef CONFIG_SMP
+	if (unlikely(task_running(rq, p)))
+		goto out_activate;
+
+	new_cpu = cpu;
+
+	if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+		goto out_set_cpu;
+
+	load = source_load(cpu);
+	this_load = target_load(this_cpu);
+
+	/*
+	 * If sync wakeup then subtract the (maximum possible) effect of
+	 * the currently running task from the load of the current CPU:
+	 */
+	if (sync)
+		this_load -= SCHED_LOAD_SCALE;
+
+	/* Don't pull the task off an idle CPU to a busy one */
+	if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
+		goto out_set_cpu;
+
+	new_cpu = this_cpu; /* Wake to this CPU if we can */
+
+	/*
+	 * Scan domains for affine wakeup and passive balancing
+	 * possibilities.
+	 */
+	for_each_domain(this_cpu, sd) {
+		unsigned int imbalance;
+		/*
+		 * Start passive balancing when half the imbalance_pct
+		 * limit is reached.
+		 */
+		imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
+
+		if ((sd->flags & SD_WAKE_AFFINE) &&
+				!task_hot(p, rq->timestamp_last_tick, sd)) {
+			/*
+			 * This domain has SD_WAKE_AFFINE and p is cache cold
+			 * in this domain.
+			 */
+			if (cpu_isset(cpu, sd->span)) {
+				schedstat_inc(sd, ttwu_wake_affine);
+				goto out_set_cpu;
+			}
+		} else if ((sd->flags & SD_WAKE_BALANCE) &&
+				imbalance*this_load <= 100*load) {
+			/*
+			 * This domain has SD_WAKE_BALANCE and there is
+			 * an imbalance.
+			 */
+			if (cpu_isset(cpu, sd->span)) {
+				schedstat_inc(sd, ttwu_wake_balance);
+				goto out_set_cpu;
+			}
+		}
+	}
+
+	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
+out_set_cpu:
+	schedstat_inc(rq->sspcd, ttwu_attempts);
+	new_cpu = wake_idle(new_cpu, p);
+	if (new_cpu != cpu) {
+		schedstat_inc(rq->sspcd, ttwu_moved);
+		set_task_cpu(p, new_cpu);
+		task_rq_unlock(rq, &flags);
+		/* might preempt at this point */
+		rq = task_rq_lock(p, &flags);
+		/* the lock was dropped, so the state checks are repeated */
+		old_state = p->state;
+		if (!(old_state & state))
+			goto out;
+		if (task_queued(p))
+			goto out_running;
+
+		this_cpu = smp_processor_id();
+		cpu = task_cpu(p);
+	}
+
+out_activate:
+#endif /* CONFIG_SMP */
+	if (old_state == TASK_UNINTERRUPTIBLE)
+		old_rq->nr_uninterruptible--;	/* the rq locked on entry, not necessarily rq */
+
+	/*
+	 * Sync wakeups (i.e. those types of wakeups where the waker
+	 * has indicated that it will leave the CPU in short order)
+	 * don't trigger a preemption, if the woken up task will run on
+	 * this cpu. (in this case the 'I will reschedule' promise of
+	 * the waker guarantees that the freshly woken up task is going
+	 * to be considered on this CPU.)
+	 */
+	activate_task(p, rq, cpu == this_cpu);
+	if (!sync || cpu != this_cpu) {
+		preempt(p, rq);
+	}
+	success = 1;
+
+out_running:
+	p->state = TASK_RUNNING;
+out:
+	task_rq_unlock(rq, &flags);
+
+	return success;
+}
+
+#ifdef CONFIG_SMP
+static int find_idlest_cpu(struct task_struct *p, int this_cpu,
+			   struct sched_domain *sd);
+#endif
+
+/*
+ * Perform scheduler related setup for a newly forked process p.
+ * p is forked by current.
+ *
+ * Sets the state to TASK_RUNNING, initialises the (empty) run_list and
+ * the switch_lock, clears sched_info under CONFIG_SCHEDSTATS and sets
+ * preempt_count to 1 under CONFIG_PREEMPT. The task is not queued here.
+ */
+static void sc_sched_fork(task_t *p)
+{
+	/*
+	 * We mark the process as running here, but have not actually
+	 * inserted it onto the runqueue yet. This guarantees that
+	 * nobody will actually run it, and a signal or other external
+	 * event cannot wake it up and insert it on the runqueue either.
+	 */
+	p->state = TASK_RUNNING;
+	INIT_LIST_HEAD(&p->u.scsched.run_list);
+	spin_lock_init(&p->switch_lock);
+#ifdef CONFIG_SCHEDSTATS
+	memset(&p->sched_info, 0, sizeof(p->sched_info));
+#endif
+#ifdef CONFIG_PREEMPT
+	/*
+	 * During context-switch we hold precisely one spinlock, which
+	 * schedule_tail drops. (in the common case it's this_rq()->lock,
+	 * but it also can be p->switch_lock.) So we compensate with a count
+	 * of 1. Also, we want to start with kernel preemption disabled.
+	 */
+	p->thread_info->preempt_count = 1;
+#endif
+}
+
+/*
+ * wake_up_new_task - wake up a newly created task for the first time.
+ *
+ * This function will do some initial scheduler statistics housekeeping
+ * that must be done for every newly created context, then puts the task
+ * on the runqueue and wakes it.
+ *
+ * The child's burst is reset to 0 and the parent (current) is flagged
+ * SF_FORKED under its own runqueue lock. @clone_flags is not used by
+ * this implementation.
+ */
+static void sc_wake_up_new_task(task_t * p, unsigned long clone_flags)
+{
+	unsigned long flags;
+	int this_cpu, cpu;
+	runqueue_t *rq, *this_rq;
+
+	rq = task_rq_lock(p, &flags);
+	cpu = task_cpu(p);
+	this_cpu = smp_processor_id();
+
+	BUG_ON(p->state != TASK_RUNNING);
+
+	schedstat_inc(rq->sspcd, wunt_cnt);
+	/*
+	 * Forked process gets no burst to prevent fork bombs.
+	 */
+	p->u.scsched.burst = 0;
+
+	if (likely(cpu == this_cpu)) {
+		current->u.scsched.sflags |= SF_FORKED;
+		__activate_task(p, rq);
+		/*
+		 * We skip the following code due to cpu == this_cpu
+	 	 *
+		 *   task_rq_unlock(rq, &flags);
+		 *   this_rq = task_rq_lock(current, &flags);
+		 */
+		this_rq = rq;
+	} else {
+		this_rq = cpu_rq(this_cpu);
+
+		/*
+		 * Not the local CPU - must adjust timestamp. This should
+		 * get optimised away in the !CONFIG_SMP case.
+		 */
+		p->u.scsched.timestamp = (p->u.scsched.timestamp - this_rq->timestamp_last_tick)
+					+ rq->timestamp_last_tick;
+		__activate_task(p, rq);
+		preempt(p, rq);
+
+		schedstat_inc(rq->sspcd, wunt_moved);
+		/*
+		 * Parent and child are on different CPUs, now get the
+		 * parent runqueue to update the parent's ->u.scsched.sflags:
+		 */
+		task_rq_unlock(rq, &flags);
+		this_rq = task_rq_lock(current, &flags);
+		current->u.scsched.sflags |= SF_FORKED;
+	}
+	task_rq_unlock(this_rq, &flags);
+}
+/* sc_sched_exit - empty hook: this scheduler does nothing when @p exits. */
+static void sc_sched_exit(task_t * p)
+{
+}
+
+/**
+ * finish_task_switch - clean up after a task-switch
+ * @prev: the thread we just switched away from.
+ *
+ * We enter this with the runqueue still locked, and finish_arch_switch()
+ * will unlock it along with doing any other architecture-specific cleanup
+ * actions.
+ *
+ * Note that we may have delayed dropping an mm in context_switch(). If
+ * so, we finish that here outside of the runqueue lock.  (Doing it
+ * with the lock held can cause deadlocks; see schedule() for
+ * details.)
+ */
+static void finish_task_switch(task_t *prev)
+	__releases(rq->lock)
+{
+	runqueue_t *rq = this_rq();
+	struct mm_struct *mm = rq->prev_mm;
+	unsigned long prev_task_flags;
+
+	rq->prev_mm = NULL;
+
+	/*
+	 * A task struct has one reference for the use as "current".
+	 * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and
+	 * calls schedule one last time. The schedule call will never return,
+	 * and the scheduled task must drop that reference.
+	 * The test for EXIT_ZOMBIE must occur while the runqueue locks are
+	 * still held, otherwise prev could be scheduled on another cpu, die
+	 * there before we look at prev->state, and then the reference would
+	 * be dropped twice.
+	 * 		Manfred Spraul
+	 *
+	 * Note: what the code below actually samples before calling
+	 * finish_arch_switch() is prev->flags, which is later tested for
+	 * PF_DEAD.
+	 */
+	prev_task_flags = prev->flags;
+	finish_arch_switch(rq, prev);
+	if (mm)
+		mmdrop(mm);
+	if (unlikely(prev_task_flags & PF_DEAD))
+		put_task_struct(prev);
+}
+
+/**
+ * schedule_tail - first thing a freshly forked thread must call.
+ * @prev: the thread we just switched away from.
+ */
+static void sc_schedule_tail(task_t *prev)
+	__releases(rq->lock)
+{
+	finish_task_switch(prev);
+
+	if (current->set_child_tid)
+		put_user(current->pid, current->set_child_tid);	/* result is ignored */
+}
+
+/*
+ * context_switch - switch to the new MM and the new
+ * thread's register state.
+ *
+ * If @next has no mm of its own it borrows @prev's active_mm and takes a
+ * reference on it via mm_count. If @prev itself had no mm, the mm it was
+ * borrowing is parked in rq->prev_mm (which must be empty at that point)
+ * so that finish_task_switch() can drop it.
+ *
+ * Returns the value of prev after switch_to().
+ * NOTE(review): presumably that is the task we switched away from when
+ * this context resumes -- confirm against the arch's switch_to().
+ */
+static inline
+task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
+{
+	struct mm_struct *mm = next->mm;
+	struct mm_struct *oldmm = prev->active_mm;
+
+	if (unlikely(!mm)) {
+		next->active_mm = oldmm;
+		atomic_inc(&oldmm->mm_count);
+		enter_lazy_tlb(oldmm, next);
+	} else
+		switch_mm(oldmm, mm, next);
+
+	if (unlikely(!prev->mm)) {
+		prev->active_mm = NULL;
+		WARN_ON(rq->prev_mm);
+		rq->prev_mm = oldmm;
+	}
+
+	/* Here we just switch the register state and the stack. */
+	switch_to(prev, next, prev);
+
+	return prev;
+}
+
+/*
+ * nr_running, nr_uninterruptible and nr_context_switches:
+ *
+ * externally visible scheduler statistics: current number of runnable
+ * threads, current number of uninterruptible-sleeping threads, total
+ * number of context switches performed since bootup.
+ */ +static unsigned long sc_nr_running(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->nr_running; + + return sum; +} + +static unsigned long sc_nr_uninterruptible(void) +{ + unsigned long i, sum = 0; + + for_each_cpu(i) + sum += cpu_rq(i)->nr_uninterruptible; + + return sum; +} + +static unsigned long long sc_nr_context_switches(void) +{ + unsigned long long i, sum = 0; + + for_each_cpu(i) + sum += cpu_rq(i)->nr_switches; + + return sum; +} + +static unsigned long sc_nr_iowait(void) +{ + unsigned long i, sum = 0; + + for_each_cpu(i) + sum += atomic_read(&cpu_rq(i)->nr_iowait); + + return sum; +} + +static unsigned long sc_nr_iowait_task_cpu(const task_t *p) +{ + return atomic_read(&task_rq(p)->nr_iowait); +} + +#ifdef CONFIG_SMP + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + if (rq1 == rq2) { + spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ + } else { + if (rq1 < rq2) { + spin_lock(&rq1->lock); + spin_lock(&rq2->lock); + } else { + spin_lock(&rq2->lock); + spin_lock(&rq1->lock); + } + } +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + spin_unlock(&rq1->lock); + if (rq1 != rq2) + spin_unlock(&rq2->lock); + else + __release(rq2->lock); +} + +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 
+ */
+static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
+	__releases(this_rq->lock)
+	__acquires(busiest->lock)
+	__acquires(this_rq->lock)
+{
+	if (unlikely(!spin_trylock(&busiest->lock))) {
+		if (busiest < this_rq) {	/* lower address first, as in double_rq_lock() */
+			spin_unlock(&this_rq->lock);	/* this_rq->lock is briefly released here */
+			spin_lock(&busiest->lock);
+			spin_lock(&this_rq->lock);
+		} else
+			spin_lock(&busiest->lock);
+	}
+}
+
+/*
+ * find_idlest_cpu - find the least busy runqueue.
+ *
+ * Scans the CPUs that are both in @sd's span and allowed for @p, using
+ * target_load() for each, and stops early on a CPU with zero load.
+ * Returns that least-loaded CPU only if moving there beats staying on
+ * @this_cpu by the margin checked at the end; otherwise returns @this_cpu.
+ *
+ * NOTE(review): min_cpu is an int initialised from UINT_MAX and is only
+ * overwritten if the mask contains at least one CPU -- confirm callers can
+ * never pass a domain whose span has no CPU allowed for @p.
+ */
+static int find_idlest_cpu(struct task_struct *p, int this_cpu,
+			   struct sched_domain *sd)
+{
+	unsigned long load, min_load, this_load;
+	int i, min_cpu;
+	cpumask_t mask;
+
+	min_cpu = UINT_MAX;
+	min_load = ULONG_MAX;
+
+	cpus_and(mask, sd->span, p->cpus_allowed);
+
+	for_each_cpu_mask(i, mask) {
+		load = target_load(i);
+
+		if (load < min_load) {
+			min_cpu = i;
+			min_load = load;
+
+			/* break out early on an idle CPU: */
+			if (!min_load)
+				break;
+		}
+	}
+
+	/* add +1 to account for the new task */
+	this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
+
+	/*
+	 * Would with the addition of the new task to the
+	 * current CPU there be an imbalance between this
+	 * CPU and the idlest CPU?
+	 *
+	 * Use half of the balancing threshold - new-context is
+	 * a good opportunity to balance.
+	 */
+	if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
+		return min_cpu;
+
+	return this_cpu;
+}
+
+/*
+ * If dest_cpu is allowed for this process, migrate the task to it.
+ * This is accomplished by forcing the cpu_allowed mask to only
+ * allow dest_cpu, which will force the cpu onto dest_cpu.  Then
+ * the cpu_allowed mask is restored.
+ */
+static void sched_migrate_task(task_t *p, int dest_cpu)
+{
+	migration_req_t req;
+	runqueue_t *rq;
+	unsigned long flags;
+
+	rq = task_rq_lock(p, &flags);
+	if (!cpu_isset(dest_cpu, p->cpus_allowed)
+	    || unlikely(cpu_is_offline(dest_cpu)))
+		goto out;	/* silently does nothing for a disallowed or offline cpu */
+
+	schedstat_inc(rq->sspcd, smt_cnt);
+	/* force the process onto the specified CPU */
+	if (migrate_task(p, dest_cpu, &req)) {
+		/* Need to wait for migration thread (might exit: take ref). */
+		struct task_struct *mt = rq->migration_thread;
+		get_task_struct(mt);
+		task_rq_unlock(rq, &flags);
+		wake_up_process(mt);
+		put_task_struct(mt);
+		wait_for_completion(&req.done);	/* req is a local of this function */
+		return;
+	}
+out:
+	task_rq_unlock(rq, &flags);
+}
+
+/*
+ * sched_exec(): find the highest-level, exec-balance-capable
+ * domain and try to migrate the task to the least loaded CPU.
+ *
+ * execve() is a valuable balancing opportunity, because at this point
+ * the task has the smallest effective memory and cache footprint.
+ *
+ * Does nothing when this CPU's runqueue has at most one runnable task or
+ * when no domain has SD_BALANCE_EXEC set. Every path pairs the get_cpu()
+ * below with exactly one put_cpu().
+ */
+static void sc_sched_exec(void)
+{
+	struct sched_domain *tmp, *sd = NULL;
+	int new_cpu, this_cpu = get_cpu();
+
+	schedstat_inc(this_rq()->sspcd, sbe_cnt);
+	/* Prefer the current CPU if there's only this task running */
+	if (this_rq()->nr_running <= 1)
+		goto out;
+
+	for_each_domain(this_cpu, tmp)
+		if (tmp->flags & SD_BALANCE_EXEC)
+			sd = tmp;	/* keep the last match of the walk */
+
+	if (sd) {
+		schedstat_inc(sd, sbe_attempts);
+		new_cpu = find_idlest_cpu(current, this_cpu, sd);
+		if (new_cpu != this_cpu) {
+			schedstat_inc(sd, sbe_pushed);
+			put_cpu();
+			sched_migrate_task(current, new_cpu);
+			return;
+		}
+	}
+out:
+	put_cpu();
+}
+
+/*
+ * pull_task - move a task from a remote runqueue to the local runqueue.
+ * Both runqueues must be locked.
+ */
+static void pull_task(runqueue_t *src_rq, task_t *p,
+		runqueue_t *this_rq, int this_cpu)
+{
+	dequeue_task(p, src_rq);
+	src_rq->nr_running--;
+	set_task_cpu(p, this_cpu);
+	this_rq->nr_running++;
+	enqueue_task(p, this_rq);
+	/* rebase the timestamp from src_rq's tick reference onto this_rq's */
+	p->u.scsched.timestamp = (p->u.scsched.timestamp - src_rq->timestamp_last_tick)
+				+ this_rq->timestamp_last_tick;
+	/*
+	 * Note that idle threads have a prio of MAX_PRIO, for this test
+	 * to be always true for them.
+	 */
+	preempt(p, this_rq);
+}
+
+/*
+ * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
+ *
+ * Returns 1 if @p may be moved, 0 otherwise. @idle is accepted but not
+ * examined by this implementation.
+ */
+static int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
+		     struct sched_domain *sd, enum idle_type idle)
+{
+	/*
+	 * We do not migrate tasks that are:
+	 * 1) running (obviously), or
+	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
+	 * 3) are cache-hot on their current CPU.
+	 */
+	if (task_running(rq, p))
+		return 0;
+	if (!cpu_isset(this_cpu, p->cpus_allowed))
+		return 0;
+
+	/*
+	 * Aggressive migration if:
+	 * 1) the [whole] cpu is idle, or
+	 * 2) too many balance attempts have failed.
+	 *
+	 * In either case the cache-hot test below is skipped.
+	 */
+
+	if (cpu_and_siblings_are_idle(this_cpu) || \
+			sd->nr_balance_failed > sd->cache_nice_tries)
+		return 1;
+
+	if (task_hot(p, rq->timestamp_last_tick, sd))
+		return 0;
+	return 1;
+}
+
+/*
+ * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
+ * as part of a balancing operation within "domain". Returns the number of
+ * tasks moved.
+ *
+ * Called with both runqueues locked.
+ */
+static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
+		      unsigned long max_nr_move, struct sched_domain *sd,
+		      enum idle_type idle)
+{
+	struct list_head *head, *curr;
+	int idx, pulled = 0;
+	task_t *tmp;
+
+	if (max_nr_move <= 0 || busiest->nr_running <= 1)
+		goto out;
+
+	/* Start searching at priority 0: */
+	idx = 0;
+skip_bitmap:
+	if (!idx)
+		idx = sched_find_first_bit(busiest->bitmap);
+	else
+		idx = find_next_bit(busiest->bitmap, MAX_PRIO, idx);
+	if (idx >= MAX_PRIO)
+		goto out;	/* no further non-empty priority list */
+
+	head = busiest->queue + idx;
+	curr = head->prev;	/* each priority list is walked from its tail */
+skip_queue:
+	tmp = list_entry(curr, task_t, u.scsched.run_list);
+
+	curr = curr->prev;	/* advance first: tmp may be unlinked by pull_task() */
+
+	if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
+		if (curr != head)
+			goto skip_queue;
+		idx++;
+		goto skip_bitmap;
+	}
+
+	/*
+	 * Right now, this is the only place pull_task() is called,
+	 * so we can safely collect pull_task() stats here rather than
+	 * inside pull_task().
+	 */
+	schedstat_inc(this_rq->sspcd, pt_gained[idle]);
+	schedstat_inc(busiest->sspcd, pt_lost[idle]);
+
+	pull_task(busiest, tmp, this_rq, this_cpu);
+	pulled++;
+
+	/* We only want to steal up to the prescribed number of tasks. */
+	if (pulled < max_nr_move) {
+		if (curr != head)
+			goto skip_queue;
+		idx++;
+		goto skip_bitmap;
+	}
+out:
+	return pulled;
+}
+
+/*
+ * find_busiest_group finds and returns the busiest CPU group within the
+ * domain. It calculates and returns the number of tasks which should be
+ * moved to restore balance via the imbalance parameter.
+ */
+static struct sched_group *
+find_busiest_group(struct sched_domain *sd, int this_cpu,
+		   unsigned long *imbalance, enum idle_type idle)
+{
+	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
+	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+
+	max_load = this_load = total_load = total_pwr = 0;
+
+	do {
+		unsigned long load;
+		int local_group;
+		int i, nr_cpus = 0;
+
+		local_group = cpu_isset(this_cpu, group->cpumask);
+
+		/* Tally up the load of all CPUs in the group */
+		avg_load = 0;
+
+		for_each_cpu_mask(i, group->cpumask) {
+			/* Bias balancing toward cpus of our domain */
+			if (local_group)
+				load = target_load(i);
+			else
+				load = source_load(i);
+
+			nr_cpus++;
+			avg_load += load;
+		}
+
+		if (!nr_cpus)
+			goto nextgroup;
+
+		total_load += avg_load;
+		total_pwr += group->cpu_power;
+
+		/* Adjust by relative CPU power of the group */
+		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+
+		if (local_group) {
+			this_load = avg_load;
+			this = group;
+			goto nextgroup;
+		} else if (avg_load > max_load) {
+			max_load = avg_load;
+			busiest = group;
+		}
+nextgroup:
+		group = group->next;
+	} while (group != sd->groups);
+
+	if (!busiest || this_load >= max_load)
+		goto out_balanced;
+
+	avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
+
+	if (this_load >= avg_load ||
+			100*max_load <= sd->imbalance_pct*this_load)
+		goto out_balanced;
+
+	/*
+	 * We're trying to get all the cpus to the average_load, so we don't
+	 * want to push ourselves above the average load, nor do we wish to
+	 * reduce the max loaded cpu below the average load, as either of these
+	 * actions would just result in more rebalancing later, and ping-pong
+	 * tasks around. Thus we look for the minimum possible imbalance.
+	 * Negative imbalances (*we* are more loaded than anyone else) will
+	 * be counted as no imbalance for these purposes -- we can't fix that
+	 * by pulling tasks to us.  Be careful of negative numbers as they'll
+	 * appear as very large values with unsigned longs.
+	 *
+	 * NOTE(review): "this" is dereferenced below and is only set when
+	 * some group contains this_cpu -- confirm that always holds.
+	 */
+	*imbalance = min(max_load - avg_load, avg_load - this_load);
+
+	/* How much load to actually move to equalise the imbalance */
+	*imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
+				/ SCHED_LOAD_SCALE;
+
+	if (*imbalance < SCHED_LOAD_SCALE - 1) {
+		unsigned long pwr_now = 0, pwr_move = 0;
+		unsigned long tmp;
+
+		if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
+			*imbalance = 1;
+			return busiest;
+		}
+
+		/*
+		 * OK, we don't have enough imbalance to justify moving tasks,
+		 * however we may be able to increase total CPU power used by
+		 * moving them.
+		 */
+
+		pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
+		pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
+		pwr_now /= SCHED_LOAD_SCALE;
+
+		/* Amount of load we'd subtract */
+		tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
+		if (max_load > tmp)
+			pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
+							max_load - tmp);
+
+		/* Amount of load we'd add */
+		tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
+		if (max_load < tmp)
+			tmp = max_load;
+		pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
+		pwr_move /= SCHED_LOAD_SCALE;
+
+		/* Move if we gain another 8th of a CPU worth of throughput */
+		if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
+			goto out_balanced;
+
+		*imbalance = 1;
+		return busiest;
+	}
+
+	/* Get rid of the scaling factor, rounding down as we divide */
+	*imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE;
+
+	return busiest;
+
+out_balanced:
+	/* idle callers may still get the busiest group, with *imbalance = 1 */
+	if (busiest && (idle == NEWLY_IDLE ||
+			(idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) {
+		*imbalance = 1;
+		return busiest;
+	}
+
+	*imbalance = 0;
+	return NULL;
+}
+
+/*
+ * find_busiest_queue - find the busiest runqueue among the cpus in group.
+ */
+static runqueue_t *find_busiest_queue(struct sched_group *group)
+{
+	unsigned long load, max_load = 0;
+	runqueue_t *busiest = NULL;	/* stays NULL if every cpu reports zero load */
+	int i;
+
+	for_each_cpu_mask(i, group->cpumask) {
+		load = source_load(i);
+
+		if (load > max_load) {
+			max_load = load;
+			busiest = cpu_rq(i);
+		}
+	}
+
+	return busiest;
+}
+
+/*
+ * Check this_cpu to ensure it is balanced within domain. Attempt to move
+ * tasks if there is an imbalance.
+ *
+ * Called with this_rq unlocked.
+ *
+ * Returns the number of tasks moved to this_rq; 0 when no busier group or
+ * queue was found. Also maintains sd->nr_balance_failed and
+ * sd->balance_interval, and may wake the busiest queue's migration thread
+ * after repeated failures.
+ */
+static int load_balance(int this_cpu, runqueue_t *this_rq,
+			struct sched_domain *sd, enum idle_type idle)
+{
+	struct sched_group *group;
+	runqueue_t *busiest;
+	unsigned long imbalance;
+	int nr_moved;
+
+	spin_lock(&this_rq->lock);
+	schedstat_inc(sd, lb_cnt[idle]);
+
+	group = find_busiest_group(sd, this_cpu, &imbalance, idle);
+	if (!group) {
+		schedstat_inc(sd, lb_nobusyg[idle]);
+		goto out_balanced;
+	}
+
+	busiest = find_busiest_queue(group);
+	if (!busiest) {
+		schedstat_inc(sd, lb_nobusyq[idle]);
+		goto out_balanced;
+	}
+
+	/*
+	 * This should be "impossible", but since load
+	 * balancing is inherently racy and statistical,
+	 * it could happen in theory.
+	 */
+	if (unlikely(busiest == this_rq)) {
+		WARN_ON(1);
+		goto out_balanced;
+	}
+
+	schedstat_add(sd, lb_imbalance[idle], imbalance);
+
+	nr_moved = 0;
+	if (busiest->nr_running > 1) {
+		/*
+		 * Attempt to move tasks. If find_busiest_group has found
+		 * an imbalance but busiest->nr_running <= 1, the group is
+		 * still unbalanced. nr_moved simply stays zero, so it is
+		 * correctly treated as an imbalance.
+		 */
+		double_lock_balance(this_rq, busiest);
+		nr_moved = move_tasks(this_rq, this_cpu, busiest,
+						imbalance, sd, idle);
+		spin_unlock(&busiest->lock);
+	}
+	spin_unlock(&this_rq->lock);
+
+	if (!nr_moved) {
+		schedstat_inc(sd, lb_failed[idle]);
+		sd->nr_balance_failed++;
+
+		if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
+			int wake = 0;
+
+			spin_lock(&busiest->lock);
+			if (!busiest->active_balance) {
+				busiest->active_balance = 1;
+				busiest->push_cpu = this_cpu;
+				wake = 1;
+			}
+			spin_unlock(&busiest->lock);
+			if (wake)
+				wake_up_process(busiest->migration_thread);
+
+			/*
+			 * We've kicked active balancing, reset the failure
+			 * counter.
+			 */
+			sd->nr_balance_failed = sd->cache_nice_tries;
+		}
+
+		/*
+		 * We were unbalanced, but unsuccessful in move_tasks(),
+		 * so bump the balance_interval to lessen the lock contention.
+		 */
+		if (sd->balance_interval < sd->max_interval)
+			sd->balance_interval++;
+	} else {
+		sd->nr_balance_failed = 0;
+
+		/* We were unbalanced, so reset the balancing interval */
+		sd->balance_interval = sd->min_interval;
+	}
+
+	return nr_moved;
+
+out_balanced:
+	spin_unlock(&this_rq->lock);
+
+	/* tune up the balancing interval */
+	if (sd->balance_interval < sd->max_interval)
+		sd->balance_interval *= 2;
+
+	return 0;
+}
+
+/*
+ * Check this_cpu to ensure it is balanced within domain. Attempt to move
+ * tasks if there is an imbalance.
+ *
+ * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
+ * this_rq is locked.
+ */ +static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd) +{ + struct sched_group *group; + runqueue_t *busiest = NULL; + unsigned long imbalance; + int nr_moved = 0; + + schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); + group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); + if (!group) { + schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); + goto out; + } + + busiest = find_busiest_queue(group); + if (!busiest || busiest == this_rq) { + schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); + goto out; + } + + /* Attempt to move tasks */ + double_lock_balance(this_rq, busiest); + + schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); + nr_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, sd, NEWLY_IDLE); + if (!nr_moved) + schedstat_inc(sd, lb_failed[NEWLY_IDLE]); + + spin_unlock(&busiest->lock); + +out: + return nr_moved; +} + +/* + * idle_balance is called by schedule() if this_cpu is about to become + * idle. Attempts to pull tasks from other CPUs. + */ +static void idle_balance(int this_cpu, runqueue_t *this_rq) +{ + struct sched_domain *sd; + + for_each_domain(this_cpu, sd) { + if (sd->flags & SD_BALANCE_NEWIDLE) { + if (load_balance_newidle(this_cpu, this_rq, sd)) { + /* We've pulled tasks over so stop searching */ + break; + } + } + } +} + +/* + * active_load_balance is run by migration threads. It pushes running tasks + * off the busiest CPU onto idle CPUs. It requires at least 1 task to be + * running on each physical CPU where possible, and avoids physical / + * logical imbalances. + * + * Called with busiest_rq locked. + */ +static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) +{ + struct sched_domain *sd; + struct sched_group *cpu_group; + runqueue_t *target_rq; + cpumask_t visited_cpus; + int cpu; + + schedstat_inc(busiest_rq->sspcd, alb_cnt); + /* + * Search for suitable CPUs to push tasks to in successively higher + * domains with SD_LOAD_BALANCE set. 
+ */ + visited_cpus = CPU_MASK_NONE; + for_each_domain(busiest_cpu, sd) { + if (!(sd->flags & SD_LOAD_BALANCE)) + /* no more domains to search */ + break; + + cpu_group = sd->groups; + do { + for_each_cpu_mask(cpu, cpu_group->cpumask) { + if (busiest_rq->nr_running <= 1) + /* no more tasks left to move */ + return; + if (cpu_isset(cpu, visited_cpus)) + continue; + cpu_set(cpu, visited_cpus); + if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu) + continue; + + target_rq = cpu_rq(cpu); + /* + * This condition is "impossible", if it occurs + * we need to fix it. Originally reported by + * Bjorn Helgaas on a 128-cpu setup. + */ + BUG_ON(busiest_rq == target_rq); + + /* move a task from busiest_rq to target_rq */ + double_lock_balance(busiest_rq, target_rq); + if (move_tasks(target_rq, cpu, busiest_rq, + 1, sd, SCHED_IDLE)) { + schedstat_inc(busiest_rq->sspcd, alb_lost); + schedstat_inc(target_rq->sspcd, alb_gained); + } else { + schedstat_inc(busiest_rq->sspcd, alb_failed); + } + spin_unlock(&target_rq->lock); + } + cpu_group = cpu_group->next; + } while (cpu_group != sd->groups); + } +} + +/* + * rebalance_tick will get called every timer tick, on every CPU. + * + * It checks each scheduling domain to see if it is due to be balanced, + * and initiates a balancing operation if so. + * + * Balancing parameters are set up in arch_init_sched_domains. + */ + +/* Don't have all balancing operations going off at once */ +#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) + +static void rebalance_tick(int this_cpu, runqueue_t *this_rq, + enum idle_type idle) +{ + unsigned long old_load, this_load; + unsigned long j = jiffies + CPU_OFFSET(this_cpu); + struct sched_domain *sd; + + /* Update our load */ + old_load = this_rq->cpu_load; + this_load = this_rq->nr_running * SCHED_LOAD_SCALE; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. 
+ */ + if (this_load > old_load) + old_load++; + this_rq->cpu_load = (old_load + this_load) / 2; + + for_each_domain(this_cpu, sd) { + unsigned long interval; + + if (!(sd->flags & SD_LOAD_BALANCE)) + continue; + + interval = sd->balance_interval; + if (idle != SCHED_IDLE) + interval *= sd->busy_factor; + + /* scale ms to jiffies */ + interval = msecs_to_jiffies(interval); + if (unlikely(!interval)) + interval = 1; + + if (j - sd->last_balance >= interval) { + if (load_balance(this_cpu, this_rq, sd, idle)) { + /* We've pulled tasks over so no longer idle */ + idle = NOT_IDLE; + } + sd->last_balance += interval; + } + } +} +#else +/* + * on UP we do not need to balance between CPUs: + */ +static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) +{ +} +static inline void idle_balance(int cpu, runqueue_t *rq) +{ +} +#endif + +static int wake_priority_sleeper(runqueue_t *rq) +{ + int ret = 0; +#ifdef CONFIG_SCHED_SMT + spin_lock(&rq->lock); + /* + * If an SMT sibling task has been put to sleep for priority + * reasons reschedule the idle task to see if it can now run. + */ + if (rq->nr_running) { + resched_task(rq->idle); + ret = 1; + } + spin_unlock(&rq->lock); +#endif + return ret; +} + +static void time_slice_expired(task_t *p, runqueue_t *rq) +{ + set_tsk_need_resched(p); + dequeue_task(p, rq); + p->u.scsched.prio = effective_prio(p); + p->u.scsched.time_slice = rr_interval(p); + enqueue_task(p, rq); +} + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. 
+ */ +static void sc_scheduler_tick(void) +{ + int cpu = smp_processor_id(); + runqueue_t *rq = this_rq(); + task_t *p = current; + unsigned long debit; + + rq->timestamp_last_tick = sched_clock(); + + if (p == rq->idle) { + if (wake_priority_sleeper(rq)) + goto out; + rebalance_tick(cpu, rq, SCHED_IDLE); + return; + } + + /* Task might have expired already, but not scheduled off yet */ + if (unlikely(!task_queued(p))) { + set_tsk_need_resched(p); + goto out; + } + /* + * SCHED_FIFO tasks never run out of timeslice. + */ + if (unlikely(p->policy == SCHED_FIFO)) + goto out; + + spin_lock(&rq->lock); + debit = ns_diff(rq->timestamp_last_tick, p->u.scsched.timestamp); + p->u.scsched.ns_debit += debit; + if (p->u.scsched.ns_debit < NSJIFFY) + goto out_unlock; + p->u.scsched.ns_debit %= NSJIFFY; + /* + * Tasks lose burst each time they use up a full slice(). + */ + if (!--p->u.scsched.slice) { + dec_burst(p); + p->u.scsched.slice = slice(p); + time_slice_expired(p, rq); + goto out_unlock; + } + /* + * Tasks that run out of time_slice but still have slice left get + * requeued with a lower priority && RR_INTERVAL time_slice. + */ + if (!--p->u.scsched.time_slice) { + time_slice_expired(p, rq); + goto out_unlock; + } + rq->cache_ticks++; + if (rq->preempted && rq->cache_ticks >= cache_delay) + set_tsk_need_resched(p); +out_unlock: + spin_unlock(&rq->lock); +out: + rebalance_tick(cpu, rq, NOT_IDLE); +} + +#ifdef CONFIG_SCHED_SMT +static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) +{ + struct sched_domain *sd = this_rq->sd; + cpumask_t sibling_map; + int i; + + if (!(sd->flags & SD_SHARE_CPUPOWER)) + return; + + /* + * Unlock the current runqueue because we have to lock in + * CPU order to avoid deadlocks. Caller knows that we might + * unlock. We keep IRQs disabled. + */ + spin_unlock(&this_rq->lock); + + sibling_map = sd->span; + + for_each_cpu_mask(i, sibling_map) + spin_lock(&cpu_rq(i)->lock); + /* + * We clear this CPU from the mask. 
This both simplifies the + * inner loop and keeps this_rq locked when we exit: + */ + cpu_clear(this_cpu, sibling_map); + + for_each_cpu_mask(i, sibling_map) { + runqueue_t *smt_rq = cpu_rq(i); + + /* + * If an SMT sibling task is sleeping due to priority + * reasons wake it up now. + */ + if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running) + resched_task(smt_rq->idle); + } + + for_each_cpu_mask(i, sibling_map) + spin_unlock(&cpu_rq(i)->lock); + /* + * We exit with this_cpu's rq still held and IRQs + * still disabled: + */ +} + +static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) +{ + struct sched_domain *sd = this_rq->sd; + cpumask_t sibling_map; + int ret = 0, i; + task_t *p; + + if (!(sd->flags & SD_SHARE_CPUPOWER)) + return 0; + + /* + * The same locking rules and details apply as for + * wake_sleeping_dependent(): + */ + spin_unlock(&this_rq->lock); + sibling_map = sd->span; + for_each_cpu_mask(i, sibling_map) + spin_lock(&cpu_rq(i)->lock); + cpu_clear(this_cpu, sibling_map); + + /* + * Establish next task to be run - it might have gone away because + * we released the runqueue lock above: + */ + if (!this_rq->nr_running) + goto out_unlock; + + p = list_entry(this_rq->queue[sched_find_first_bit(this_rq->bitmap)].next, + task_t, u.scsched.run_list); + + for_each_cpu_mask(i, sibling_map) { + runqueue_t *smt_rq = cpu_rq(i); + task_t *smt_curr = smt_rq->curr; + + /* + * If a user task with lower static priority than the + * running task on the SMT sibling is trying to schedule, + * delay it till there is proportionately less timeslice + * left of the sibling task to prevent a lower priority + * task from using an unfair proportion of the + * physical cpu's resources.
-ck + */ + if (((smt_curr->u.scsched.slice * (100 - sd->per_cpu_gain) / 100) > + slice(p) || rt_task(smt_curr)) && + p->mm && smt_curr->mm && !rt_task(p)) + ret = 1; + + /* + * Reschedule a lower priority task on the SMT sibling, + * or wake it up if it has been put to sleep for priority + * reasons. + */ + if ((((p->u.scsched.slice * (100 - sd->per_cpu_gain) / 100) > + slice(smt_curr) || rt_task(p)) && + smt_curr->mm && p->mm && !rt_task(smt_curr)) || + (smt_curr == smt_rq->idle && smt_rq->nr_running)) + resched_task(smt_curr); + } +out_unlock: + for_each_cpu_mask(i, sibling_map) + spin_unlock(&cpu_rq(i)->lock); + return ret; +} +#else +static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) +{ +} + +static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) +{ + return 0; +} +#endif + +/* + * schedule() is the main scheduler function. + */ +static void __sched sc_schedule(void) +{ + long *switch_count; + task_t *prev, *next; + runqueue_t *rq; + struct list_head *queue; + unsigned long long now; + unsigned long debit; + int cpu, idx; + + /* + * Test if we are atomic. Since do_exit() needs to call into + * schedule() atomically, we ignore that path for now. + * Otherwise, whine if we are scheduling when we should not be. + */ + if (likely(!(current->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)))) { + if (unlikely(in_atomic())) { + printk(KERN_ERR "scheduling while atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), current->pid); + dump_stack(); + } + } + profile_hit(SCHED_PROFILING, __builtin_return_address(0)); + +need_resched: + preempt_disable(); + prev = current; + release_kernel_lock(prev); +need_resched_nonpreemptible: + rq = this_rq(); + + /* + * The idle thread is not allowed to schedule! + * Remove this check after it has been exercised a bit. 
+ */ + if (unlikely(current == rq->idle) && current->state != TASK_RUNNING) { + printk(KERN_ERR "bad: scheduling from the idle thread!\n"); + dump_stack(); + } + + schedstat_inc(rq->sspcd, sched_cnt); + now = sched_clock(); + + spin_lock_irq(&rq->lock); + prev->u.scsched.runtime = ns_diff(now, prev->u.scsched.timestamp); + debit = ns_diff(now, rq->timestamp_last_tick) % NSJIFFY; + prev->u.scsched.ns_debit += debit; + + if (unlikely(current->flags & PF_DEAD)) + current->state = EXIT_DEAD; + /* + * if entering off of a kernel preemption go straight + * to picking the next task. + */ + switch_count = &prev->nivcsw; + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + switch_count = &prev->nvcsw; + if (unlikely((prev->state & TASK_INTERRUPTIBLE) && + unlikely(signal_pending(prev)))) + prev->state = TASK_RUNNING; + else + deactivate_task(prev, rq); + } + + cpu = smp_processor_id(); + if (unlikely(!rq->nr_running)) { +go_idle: + idle_balance(cpu, rq); + if (!rq->nr_running) { + next = rq->idle; + wake_sleeping_dependent(cpu, rq); + /* + * wake_sleeping_dependent() might have released + * the runqueue, so break out if we got new + * tasks meanwhile: + */ + if (!rq->nr_running) + goto switch_tasks; + } + } else { + if (dependent_sleeper(cpu, rq)) { + next = rq->idle; + goto switch_tasks; + } + /* + * dependent_sleeper() releases and reacquires the runqueue + * lock, hence go into the idle loop if the rq went + * empty meanwhile: + */ + if (unlikely(!rq->nr_running)) + goto go_idle; + } + + idx = sched_find_first_bit(rq->bitmap); + queue = rq->queue + idx; + next = list_entry(queue->next, task_t, u.scsched.run_list); + +switch_tasks: + if (next == rq->idle) + schedstat_inc(rq->sspcd, sched_goidle); + prefetch(next); + clear_tsk_need_resched(prev); + rcu_qsctr_inc(task_cpu(prev)); + + prev->u.scsched.timestamp = now; + if (next->u.scsched.sflags & SF_YIELDED) { + /* + * Tasks that have yield()ed get requeued at normal priority + */ + int newprio = 
effective_prio(next); + next->u.scsched.sflags &= ~SF_YIELDED; + if (newprio != next->u.scsched.prio) { + dequeue_task(next, rq); + next->u.scsched.prio = newprio; + enqueue_task_head(next, rq); + } + } + + sched_info_switch(prev, next); + if (likely(prev != next)) { + rq->preempted = 0; + rq->cache_ticks = 0; + next->u.scsched.timestamp = now; + rq->nr_switches++; + rq->curr = next; + ++*switch_count; + + prepare_arch_switch(rq, next); + prev = context_switch(rq, prev, next); + barrier(); + + finish_task_switch(prev); + } else + spin_unlock_irq(&rq->lock); + + prev = current; + if (unlikely(reacquire_kernel_lock(prev) < 0)) + goto need_resched_nonpreemptible; + preempt_enable_no_resched(); + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + goto need_resched; +} + +static void __sched sc_wait_for_completion(struct completion *x) +{ + might_sleep(); + spin_lock_irq(&x->wait.lock); + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + wait.flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(&x->wait, &wait); + do { + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&x->wait.lock); + schedule(); + spin_lock_irq(&x->wait.lock); + } while (!x->done); + __remove_wait_queue(&x->wait, &wait); + } + x->done--; + spin_unlock_irq(&x->wait.lock); +} + +static void sc_set_user_nice(task_t *p, long nice) +{ + unsigned long flags; + runqueue_t *rq; + int queued, old_prio, new_prio, delta; + + if (TASK_NICE(p) == nice || nice < -20 || nice > 19) + return; + /* + * We have to be careful, if called from sys_setpriority(), + * the task might be in the middle of scheduling on another CPU. 
+ */ + rq = task_rq_lock(p, &flags); + /* + * The RT priorities are set via setscheduler(), but we still + * allow the 'normal' nice value to be set - but as expected + * it wont have any effect on scheduling until the task is + * not SCHED_NORMAL: + */ + if (rt_task(p)) { + p->static_prio = NICE_TO_PRIO(nice); + goto out_unlock; + } + if ((queued = task_queued(p))) + dequeue_task(p, rq); + + old_prio = p->u.scsched.prio; + new_prio = NICE_TO_PRIO(nice); + delta = new_prio - old_prio; + p->static_prio = NICE_TO_PRIO(nice); + p->u.scsched.prio += delta; + + if (queued) { + enqueue_task(p, rq); + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ + if (delta < 0 || (delta > 0 && task_running(rq, p))) + resched_task(rq->curr); + } +out_unlock: + task_rq_unlock(rq, &flags); +} + +#ifdef CONFIG_KGDB +static struct task_struct *sc_kgdb_get_idle(int this_cpu) +{ + return cpu_rq(this_cpu)->idle; +} +#endif + +/** + * task_prio - return the priority value of a given task. + * @p: the task in question. + * + * This is the priority value as seen by users in /proc. + * RT tasks are offset by -200. Normal tasks are centered + * around 0, value goes from -16 to +15. + */ +static int sc_task_prio(const task_t *p) +{ + return p->u.scsched.prio - MAX_RT_PRIO; +} + +/** + * task_nice - return the nice value of a given task. + * @p: the task in question. + */ +static int sc_task_nice(const task_t *p) +{ + return TASK_NICE(p); +} + +/** + * idle_cpu - is a given cpu idle currently? + * @cpu: the processor in question. + */ +static int sc_idle_cpu(int cpu) +{ + return cpu_curr(cpu) == cpu_rq(cpu)->idle; +} + +/* Actually do priority change: must hold rq lock. 
*/ +static void __setscheduler(struct task_struct *p, int policy, int prio) +{ + BUG_ON(task_queued(p)); + p->policy = policy; + p->rt_priority = prio; + if (policy != SCHED_NORMAL) + p->u.scsched.prio = MAX_USER_RT_PRIO-1 - p->rt_priority; + else + p->u.scsched.prio = p->static_prio; +} + +/* + * setscheduler - change the scheduling policy and/or RT priority of a thread. + */ +static int sc_setscheduler(pid_t pid, int policy, struct sched_param __user *param) +{ + struct sched_param lp; + int retval = -EINVAL; + int queued, oldprio, oldpolicy = -1; + unsigned long flags; + runqueue_t *rq; + task_t *p; + + if (!param || pid < 0) + goto out_nounlock; + + retval = -EFAULT; + if (copy_from_user(&lp, param, sizeof(struct sched_param))) + goto out_nounlock; + + /* + * We play safe to avoid deadlocks. + */ + read_lock_irq(&tasklist_lock); + + p = find_process_by_pid(pid); + + retval = -ESRCH; + if (!p) + goto out_unlock; +recheck: + /* double check policy once rq lock held */ + if (policy < 0) + policy = oldpolicy = p->policy; + else { + retval = -EINVAL; + if (policy != SCHED_FIFO && policy != SCHED_RR && + policy != SCHED_NORMAL) + goto out_unlock; + } + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. + */ + retval = -EINVAL; + if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1) + goto out_unlock; + if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0)) + goto out_unlock; + + retval = -EPERM; + if ((policy == SCHED_FIFO || policy == SCHED_RR) && + !capable(CAP_SYS_NICE)) + goto out_unlock; + if ((current->euid != p->euid) && (current->euid != p->uid) && + !capable(CAP_SYS_NICE)) + goto out_unlock; + + retval = security_task_setscheduler(p, policy, &lp); + if (retval) + goto out_unlock; + /* + * To be able to change p->policy safely, the appropriate + * runqueue lock must be held.
+ */ + rq = task_rq_lock(p, &flags); + /* recheck policy now with rq lock held */ + if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { + policy = oldpolicy = -1; + task_rq_unlock(rq, &flags); + goto recheck; + } + if ((queued = task_queued(p))) + deactivate_task(p, task_rq(p)); + retval = 0; + oldprio = p->u.scsched.prio; + __setscheduler(p, policy, lp.sched_priority); + if (queued) { + __activate_task(p, task_rq(p)); + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (task_running(rq, p)) { + if (p->u.scsched.prio > oldprio) + resched_task(rq->curr); + } else + preempt(p, rq); + } + task_rq_unlock(rq, &flags); +out_unlock: + read_unlock_irq(&tasklist_lock); +out_nounlock: + return retval; +} + +/** + * sys_sched_yield - yield the current processor to other threads. + * + * This function yields the current CPU by dropping the priority of current + * to the lowest priority and setting the SF_YIELDED flag. + */ +static long sc_sys_sched_yield(void) +{ + int newprio; + runqueue_t *rq = this_rq_lock(); + + newprio = current->u.scsched.prio; + schedstat_inc(rq->sspcd, yld_cnt); + current->u.scsched.slice = slice(current); + current->u.scsched.time_slice = rr_interval(current); + if (likely(!rt_task(current))) { + current->u.scsched.sflags |= SF_YIELDED; + newprio = MAX_PRIO - 1; + } + + if (newprio != current->u.scsched.prio) { + dequeue_task(current, rq); + current->u.scsched.prio = newprio; + enqueue_task(current, rq); + } else + requeue_task(current, rq); + + /* + * Since we are going to call schedule() anyway, there's + * no need to preempt or enable interrupts: + */ + __release(rq->lock); + _raw_spin_unlock(&rq->lock); + preempt_enable_no_resched(); + + schedule(); + + return 0; +} + +/* + * This task is about to go to sleep on IO. 
Increment rq->nr_iowait so + * that process accounting knows that this is a task in IO wait state. + * + * But don't do that if it is a deliberate, throttling IO wait (this task + * has set its backing_dev_info: the queue against which it should throttle) + */ +static void __sched sc_io_schedule(void) +{ + struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); + + atomic_inc(&rq->nr_iowait); + schedule(); + atomic_dec(&rq->nr_iowait); +} + +static long __sched sc_io_schedule_timeout(long timeout) +{ + struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); + long ret; + + atomic_inc(&rq->nr_iowait); + ret = schedule_timeout(timeout); + atomic_dec(&rq->nr_iowait); + return ret; +} + +/** + * sys_sched_rr_get_interval - return the default timeslice of a process. + * @pid: pid of the process. + * @interval: userspace pointer to the timeslice value. + * + * this syscall writes the default timeslice value of a given process + * into the user-space timespec buffer. A value of '0' means infinity. + */ +static long +sc_sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) +{ + int retval = -EINVAL; + struct timespec t; + task_t *p; + + if (pid < 0) + goto out_nounlock; + + retval = -ESRCH; + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + jiffies_to_timespec(p->policy & SCHED_FIFO ? + 0 : slice(p), &t); + read_unlock(&tasklist_lock); + retval = copy_to_user(interval, &t, sizeof(t)) ? 
-EFAULT : 0; +out_nounlock: + return retval; +out_unlock: + read_unlock(&tasklist_lock); + return retval; +} + +static void __devinit sc_init_idle(task_t *idle, int cpu) +{ + runqueue_t *rq = cpu_rq(cpu); + unsigned long flags; + + idle->u.scsched.prio = MAX_PRIO; + idle->state = TASK_RUNNING; + set_task_cpu(idle, cpu); + + spin_lock_irqsave(&rq->lock, flags); + rq->curr = rq->idle = idle; + set_tsk_need_resched(idle); + spin_unlock_irqrestore(&rq->lock, flags); + + /* Set the preempt count _outside_ the spinlocks! */ +#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) + idle->thread_info->preempt_count = (idle->lock_depth >= 0); +#else + idle->thread_info->preempt_count = 0; +#endif +} + +#ifdef CONFIG_SMP +/* + * This is how migration works: + * + * 1) we queue a migration_req_t structure in the source CPU's + * runqueue and wake up that CPU's migration thread. + * 2) we down() the locked semaphore => thread blocks. + * 3) migration thread wakes up (implicitly it forces the migrated + * thread off the CPU) + * 4) it gets the migration request and checks whether the migrated + * task is still in the wrong runqueue. + * 5) if it's in the wrong runqueue then the migration thread removes + * it and puts it into the right queue. + * 6) migration thread up()s the semaphore. + * 7) we wake up and the migration is done. + */ + +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. 
+ */ +static int sc_set_cpus_allowed(task_t *p, cpumask_t new_mask) +{ + unsigned long flags; + int ret = 0; + migration_req_t req; + runqueue_t *rq; + + perfctr_set_cpus_allowed(p, new_mask); + + rq = task_rq_lock(p, &flags); + if (!cpus_intersects(new_mask, cpu_online_map)) { + ret = -EINVAL; + goto out; + } + + p->cpus_allowed = new_mask; + /* Can the task run on the task's current CPU? If so, we're done */ + if (cpu_isset(task_cpu(p), new_mask)) + goto out; + + if (migrate_task(p, any_online_cpu(new_mask), &req)) { + /* Need help from migration thread: drop lock and wait. */ + task_rq_unlock(rq, &flags); + wake_up_process(rq->migration_thread); + wait_for_completion(&req.done); + tlb_migrate_finish(p->mm); + return 0; + } +out: + task_rq_unlock(rq, &flags); + return ret; +} + +/* + * Move (not current) task off this cpu, onto dest cpu. We're doing + * this because either it can't run here any more (set_cpus_allowed() + * away from this CPU, or CPU going down), or because we're + * attempting to rebalance this task on exec (sched_exec). + * + * So we race with normal scheduler movements, but that's OK, as long + * as the task is no longer on this CPU. + */ +static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) +{ + runqueue_t *rq_dest, *rq_src; + + if (unlikely(cpu_is_offline(dest_cpu))) + return; + + rq_src = cpu_rq(src_cpu); + rq_dest = cpu_rq(dest_cpu); + + double_rq_lock(rq_src, rq_dest); + /* Already moved. */ + if (task_cpu(p) != src_cpu) + goto out; + /* Affinity changed (again). */ + if (!cpu_isset(dest_cpu, p->cpus_allowed)) + goto out; + + set_task_cpu(p, dest_cpu); + if (task_queued(p)) { + /* + * Sync timestamp with rq_dest's before activating. + * The same thing could be achieved by doing this step + * afterwards, and pretending it was a local activate. + * This way is cleaner and logically correct. 
+ */ + p->u.scsched.timestamp = p->u.scsched.timestamp - + rq_src->timestamp_last_tick + + rq_dest->timestamp_last_tick; + deactivate_task(p, rq_src); + activate_task(p, rq_dest, 0); + preempt(p, rq_dest); + } + +out: + double_rq_unlock(rq_src, rq_dest); +} + +/* + * migration_thread - this is a highprio system thread that performs + * thread migration by bumping thread off CPU then 'pushing' onto + * another runqueue. + */ +static int migration_thread(void * data) +{ + runqueue_t *rq; + int cpu = (long)data; + + rq = cpu_rq(cpu); + BUG_ON(rq->migration_thread != current); + + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + struct list_head *head; + migration_req_t *req; + + if (current->flags & PF_FREEZE) + refrigerator(PF_FREEZE); + + spin_lock_irq(&rq->lock); + + if (cpu_is_offline(cpu)) { + spin_unlock_irq(&rq->lock); + goto wait_to_die; + } + + if (rq->active_balance) { + active_load_balance(rq, cpu); + rq->active_balance = 0; + } + + head = &rq->migration_queue; + + if (list_empty(head)) { + spin_unlock_irq(&rq->lock); + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + continue; + } + req = list_entry(head->next, migration_req_t, list); + list_del_init(head->next); + + if (req->type == REQ_MOVE_TASK) { + spin_unlock(&rq->lock); + __migrate_task(req->task, smp_processor_id(), + req->dest_cpu); + local_irq_enable(); + } else if (req->type == REQ_SET_DOMAIN) { + rq->sd = req->sd; + spin_unlock_irq(&rq->lock); + } else { + spin_unlock_irq(&rq->lock); + WARN_ON(1); + } + + complete(&req->done); + } + __set_current_state(TASK_RUNNING); + return 0; + +wait_to_die: + /* Wait for kthread_stop */ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +/* Figure out where task on dead CPU should go, use force if necessary.
*/ +static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) +{ + int dest_cpu; + cpumask_t mask; + + /* On same node? */ + mask = node_to_cpumask(cpu_to_node(dead_cpu)); + cpus_and(mask, mask, tsk->cpus_allowed); + dest_cpu = any_online_cpu(mask); + + /* On any allowed CPU? */ + if (dest_cpu == NR_CPUS) + dest_cpu = any_online_cpu(tsk->cpus_allowed); + + /* No more Mr. Nice Guy. */ + if (dest_cpu == NR_CPUS) { + tsk->cpus_allowed = cpuset_cpus_allowed(tsk); + dest_cpu = any_online_cpu(tsk->cpus_allowed); + + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (tsk->mm && printk_ratelimit()) + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + tsk->pid, tsk->comm, dead_cpu); + } + __migrate_task(tsk, dead_cpu, dest_cpu); +} + +/* Run through task list and migrate tasks from the dead cpu. */ +static void migrate_live_tasks(int src_cpu) +{ + struct task_struct *tsk, *t; + + write_lock_irq(&tasklist_lock); + + do_each_thread(t, tsk) { + if (tsk == current) + continue; + + if (task_cpu(tsk) == src_cpu) + move_task_off_dead_cpu(src_cpu, tsk); + } while_each_thread(t, tsk); + + write_unlock_irq(&tasklist_lock); +} + +/* Schedules idle task to be the next runnable task on current CPU. + * It does so by boosting its priority to highest possible and adding it to + * the _front_ of runqueue. Used by CPU offline code. + */ +static void sc_sched_idle_next(void) +{ + int cpu = smp_processor_id(); + runqueue_t *rq = this_rq(); + struct task_struct *p = rq->idle; + unsigned long flags; + + /* cpu has to be offline */ + BUG_ON(cpu_online(cpu)); + + /* Strictly not necessary since rest of the CPUs are stopped by now + * and interrupts disabled on current cpu. 
+ */ + spin_lock_irqsave(&rq->lock, flags); + + __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); + /* Add idle task to _front_ of its priority queue */ + __activate_idle_task(p, rq); + + spin_unlock_irqrestore(&rq->lock, flags); +} + +static void migrate_dead(unsigned int dead_cpu, task_t *tsk) +{ + struct runqueue *rq = cpu_rq(dead_cpu); + + /* Must be exiting, otherwise would be on tasklist. */ + BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD); + + /* Cannot have done final schedule yet: would have vanished. */ + BUG_ON(tsk->flags & PF_DEAD); + + get_task_struct(tsk); + + /* + * Drop lock around migration; if someone else moves it, + * that's OK. No task can be added to this CPU, so iteration is + * fine. + */ + spin_unlock_irq(&rq->lock); + move_task_off_dead_cpu(dead_cpu, tsk); + spin_lock_irq(&rq->lock); + + put_task_struct(tsk); +} + +/* release_task() removes task from tasklist, so we won't find dead tasks. */ +static void migrate_dead_tasks(unsigned int dead_cpu) +{ + unsigned i; + struct runqueue *rq = cpu_rq(dead_cpu); + + for (i = 0; i < MAX_PRIO; i++) { + struct list_head *list = &rq->queue[i]; + while (!list_empty(list)) + migrate_dead(dead_cpu, + list_entry(list->next, task_t, + u.scsched.run_list)); + } +} +#endif /* CONFIG_HOTPLUG_CPU */ + +/* + * migration_call - callback that gets triggered when a CPU is added. + * Here we can start up the necessary migration thread for the new CPU. + */ +static int migration_call(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + struct task_struct *p; + struct runqueue *rq; + unsigned long flags; + + switch (action) { + case CPU_UP_PREPARE: + p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); + if (IS_ERR(p)) + return NOTIFY_BAD; + p->flags |= PF_NOFREEZE; + kthread_bind(p, cpu); + /* Must be high prio: stop_machine expects to yield to it.
*/ + rq = task_rq_lock(p, &flags); + __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); + task_rq_unlock(rq, &flags); + cpu_rq(cpu)->migration_thread = p; + break; + case CPU_ONLINE: + /* Strictly unnecessary, as first user will wake it. */ + wake_up_process(cpu_rq(cpu)->migration_thread); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + /* Unbind it from offline cpu so it can run. Fall thru. */ + kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id()); + kthread_stop(cpu_rq(cpu)->migration_thread); + cpu_rq(cpu)->migration_thread = NULL; + break; + case CPU_DEAD: + migrate_live_tasks(cpu); + rq = cpu_rq(cpu); + kthread_stop(rq->migration_thread); + rq->migration_thread = NULL; + /* Idle task back to normal (off runqueue, low prio) */ + rq = task_rq_lock(rq->idle, &flags); + deactivate_task(rq->idle, rq); + rq->idle->static_prio = MAX_PRIO; + __setscheduler(rq->idle, SCHED_NORMAL, 0); + migrate_dead_tasks(cpu); + task_rq_unlock(rq, &flags); + BUG_ON(rq->nr_running != 0); + + /* No need to migrate the tasks: it was best-effort if + * they didn't do lock_cpu_hotplug(). Just wake up + * the requestors. */ + spin_lock_irq(&rq->lock); + while (!list_empty(&rq->migration_queue)) { + migration_req_t *req; + req = list_entry(rq->migration_queue.next, + migration_req_t, list); + BUG_ON(req->type != REQ_MOVE_TASK); + list_del_init(&req->list); + complete(&req->done); + } + spin_unlock_irq(&rq->lock); + break; +#endif + } + return NOTIFY_OK; +} + +/* Register at highest priority so that task migration (migrate_all_tasks) + * happens before everything else. + */ +static struct notifier_block __devinitdata migration_notifier = { + .notifier_call = migration_call, + .priority = 10 +}; + +static int __init sc_migration_init(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + /* Start one for boot CPU.
*/ + migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); + migration_call(&migration_notifier, CPU_ONLINE, cpu); + register_cpu_notifier(&migration_notifier); + return 0; +} +#endif + +#ifdef CONFIG_SMP +#ifdef CONFIG_SCHEDSTATS +static void sc_show_schedstat_sd(struct seq_file *seq, int cpu) +{ + enum idle_type itype; + struct sched_domain *sd; + int dcnt = 0; + + for_each_domain(cpu, sd) { + char mask_str[NR_CPUS]; + + cpumask_scnprintf(mask_str, NR_CPUS, sd->span); + seq_printf(seq, "domain%d %s", dcnt++, mask_str); + for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; itype++) { + seq_printf(seq, " %lu %lu %lu %lu %lu", + sd->lb_cnt[itype], + sd->lb_failed[itype], + sd->lb_imbalance[itype], + sd->lb_nobusyq[itype], + sd->lb_nobusyg[itype]); + } + seq_printf(seq, " %lu %lu %lu %lu\n", + sd->sbe_pushed, sd->sbe_attempts, + sd->ttwu_wake_affine, sd->ttwu_wake_balance); + } +} +#endif + +/* + * Attach the domain 'sd' to 'cpu' as its base domain. Callers must + * hold the hotplug lock. 
+ */ +static void __devinit sc_cpu_attach_domain(struct sched_domain *sd, int cpu) +{ + migration_req_t req; + unsigned long flags; + runqueue_t *rq = cpu_rq(cpu); + int local = 1; + + spin_lock_irqsave(&rq->lock, flags); + + if (cpu == smp_processor_id() || !cpu_online(cpu)) { + rq->sd = sd; + } else { + init_completion(&req.done); + req.type = REQ_SET_DOMAIN; + req.sd = sd; + list_add(&req.list, &rq->migration_queue); + local = 0; + } + + spin_unlock_irqrestore(&rq->lock, flags); + + if (!local) { + wake_up_process(rq->migration_thread); + wait_for_completion(&req.done); + } +} + +/* Setup the mask of cpus configured for isolated domains */ +static int __init isolated_cpu_setup(char *str) +{ + int ints[NR_CPUS], i; + + str = get_options(str, ARRAY_SIZE(ints), ints); + cpus_clear(cpu_isolated_map); + for (i = 1; i <= ints[0]; i++) + cpu_set(ints[i], cpu_isolated_map); + return 1; +} + +__setup ("isolcpus=", isolated_cpu_setup); + +#ifdef ARCH_HAS_SCHED_DOMAIN +extern void __devinit arch_init_sched_domains(void); +extern void __devinit arch_destroy_sched_domains(void); +#else +#ifdef CONFIG_SCHED_SMT +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); +static struct sched_group sched_group_cpus[NR_CPUS]; +static int __devinit cpu_to_cpu_group(int cpu) +{ + return cpu; +} +#endif + +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +static struct sched_group sched_group_phys[NR_CPUS]; +static int __devinit cpu_to_phys_group(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + return first_cpu(cpu_sibling_map[cpu]); +#else + return cpu; +#endif +} + +#ifdef CONFIG_NUMA + +static DEFINE_PER_CPU(struct sched_domain, node_domains); +static struct sched_group sched_group_nodes[MAX_NUMNODES]; +static int __devinit cpu_to_node_group(int cpu) +{ + return cpu_to_node(cpu); +} +#endif + +#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) +/* + * The domains setup code relies on siblings not spanning + * multiple nodes. 
Make sure the architecture has a proper + * siblings map: + */ +static void check_sibling_maps(void) +{ + int i, j; + + for_each_online_cpu(i) { + for_each_cpu_mask(j, cpu_sibling_map[i]) { + if (cpu_to_node(i) != cpu_to_node(j)) { + printk(KERN_INFO "warning: CPU %d siblings map " + "to different node - isolating " + "them.\n", i); + cpu_sibling_map[i] = cpumask_of_cpu(i); + break; + } + } + } +} +#endif + +/* + * Set up scheduler domains and groups. Callers must hold the hotplug lock. + */ +static void __devinit arch_init_sched_domains(void) +{ + int i; + cpumask_t cpu_default_map; + +#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) + check_sibling_maps(); +#endif + /* + * Setup mask for cpus without special case scheduling requirements. + * For now this just excludes isolated cpus, but could be used to + * exclude other special cases in the future. + */ + cpus_complement(cpu_default_map, cpu_isolated_map); + cpus_and(cpu_default_map, cpu_default_map, cpu_online_map); + + /* + * Set up domains. Isolated domains just stay on the dummy domain. 
+ */ + for_each_cpu_mask(i, cpu_default_map) { + int group; + struct sched_domain *sd = NULL, *p; + cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); + + cpus_and(nodemask, nodemask, cpu_default_map); + +#ifdef CONFIG_NUMA + sd = &per_cpu(node_domains, i); + group = cpu_to_node_group(i); + *sd = SD_NODE_INIT; + sd->span = cpu_default_map; + sd->groups = &sched_group_nodes[group]; +#endif + + p = sd; + sd = &per_cpu(phys_domains, i); + group = cpu_to_phys_group(i); + *sd = SD_CPU_INIT; + sd->span = nodemask; + sd->parent = p; + sd->groups = &sched_group_phys[group]; + +#ifdef CONFIG_SCHED_SMT + p = sd; + sd = &per_cpu(cpu_domains, i); + group = cpu_to_cpu_group(i); + *sd = SD_SIBLING_INIT; + sd->span = cpu_sibling_map[i]; + cpus_and(sd->span, sd->span, cpu_default_map); + sd->parent = p; + sd->groups = &sched_group_cpus[group]; +#endif + } + +#ifdef CONFIG_SCHED_SMT + /* Set up CPU (sibling) groups */ + for_each_online_cpu(i) { + cpumask_t this_sibling_map = cpu_sibling_map[i]; + cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); + if (i != first_cpu(this_sibling_map)) + continue; + + init_sched_build_groups(sched_group_cpus, this_sibling_map, + &cpu_to_cpu_group); + } +#endif + + /* Set up physical groups */ + for (i = 0; i < MAX_NUMNODES; i++) { + cpumask_t nodemask = node_to_cpumask(i); + + cpus_and(nodemask, nodemask, cpu_default_map); + if (cpus_empty(nodemask)) + continue; + + init_sched_build_groups(sched_group_phys, nodemask, + &cpu_to_phys_group); + } + +#ifdef CONFIG_NUMA + /* Set up node groups */ + init_sched_build_groups(sched_group_nodes, cpu_default_map, + &cpu_to_node_group); +#endif + + /* Calculate CPU power for physical packages and nodes */ + for_each_cpu_mask(i, cpu_default_map) { + int power; + struct sched_domain *sd; +#ifdef CONFIG_SCHED_SMT + sd = &per_cpu(cpu_domains, i); + power = SCHED_LOAD_SCALE; + sd->groups->cpu_power = power; +#endif + + sd = &per_cpu(phys_domains, i); + power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * + 
(cpus_weight(sd->groups->cpumask)-1) / 10; + sd->groups->cpu_power = power; + +#ifdef CONFIG_NUMA + if (i == first_cpu(sd->groups->cpumask)) { + /* Only add "power" once for each physical package. */ + sd = &per_cpu(node_domains, i); + sd->groups->cpu_power += power; + } +#endif + } + + /* Attach the domains */ + for_each_online_cpu(i) { + struct sched_domain *sd; +#ifdef CONFIG_SCHED_SMT + sd = &per_cpu(cpu_domains, i); +#else + sd = &per_cpu(phys_domains, i); +#endif + cpu_attach_domain(sd, i); + } + init_sched_domain_sysctl(); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void __devinit arch_destroy_sched_domains(void) +{ + destroy_sched_domain_sysctl(); +} +#endif + +#endif /* ARCH_HAS_SCHED_DOMAIN */ + +#define SCHED_DOMAIN_DEBUG +#ifdef SCHED_DOMAIN_DEBUG +static void sched_domain_debug(void) +{ + int i; + + for_each_online_cpu(i) { + runqueue_t *rq = cpu_rq(i); + struct sched_domain *sd; + int level = 0; + + sd = rq->sd; + + printk(KERN_DEBUG "CPU%d:\n", i); + + do { + int j; + char str[NR_CPUS]; + struct sched_group *group = sd->groups; + cpumask_t groupmask; + + cpumask_scnprintf(str, NR_CPUS, sd->span); + cpus_clear(groupmask); + + printk(KERN_DEBUG); + for (j = 0; j < level + 1; j++) + printk(" "); + printk("domain %d: ", level); + + if (!(sd->flags & SD_LOAD_BALANCE)) { + printk("does not load-balance"); + if (sd->parent) + printk(" ERROR !SD_LOAD_BALANCE domain has parent"); + printk("\n"); + } + + printk("span %s\n", str); + + if (!cpu_isset(i, sd->span)) + printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i); + if (!cpu_isset(i, group->cpumask)) + printk(KERN_DEBUG "ERROR domain->groups does not contain CPU%d\n", i); + if (!group->cpu_power) + printk(KERN_DEBUG "ERROR domain->cpu_power not set\n"); + + printk(KERN_DEBUG); + for (j = 0; j < level + 2; j++) + printk(" "); + printk("groups:"); + do { + if (!group) { + printk(" ERROR: NULL"); + break; + } + + if (!cpus_weight(group->cpumask)) + printk(" ERROR empty group:"); + + if 
(cpus_intersects(groupmask, group->cpumask)) + printk(" ERROR repeated CPUs:"); + + cpus_or(groupmask, groupmask, group->cpumask); + + cpumask_scnprintf(str, NR_CPUS, group->cpumask); + printk(" %s", str); + + group = group->next; + } while (group != sd->groups); + printk("\n"); + + if (!cpus_equal(sd->span, groupmask)) + printk(KERN_DEBUG "ERROR groups don't span domain->span\n"); + + level++; + sd = sd->parent; + + if (sd) { + if (!cpus_subset(groupmask, sd->span)) + printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n"); + } + + } while (sd); + } +} +#else +#define sched_domain_debug() {} +#endif + +/* + * Initial dummy domain for early boot and for hotplug cpu. Being static, + * it is initialized to zero, so all balancing flags are cleared which is + * what we want. + */ +static struct sched_domain sched_domain_dummy; + +#ifdef CONFIG_HOTPLUG_CPU +/* + * Force a reinitialization of the sched domains hierarchy. The domains + * and groups cannot be updated in place without racing with the balancing + * code, so we temporarily attach all running cpus to a "dummy" domain + * which will prevent rebalancing while the sched domains are recalculated. + */ +static int update_sched_domains(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int i; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_DOWN_PREPARE: + for_each_online_cpu(i) + cpu_attach_domain(&sched_domain_dummy, i); + arch_destroy_sched_domains(); + return NOTIFY_OK; + + case CPU_UP_CANCELED: + case CPU_DOWN_FAILED: + case CPU_ONLINE: + case CPU_DEAD: + /* + * Fall through and re-initialise the domains. 
+ */ + break; + default: + return NOTIFY_DONE; + } + + /* The hotplug lock is already held by cpu_up/cpu_down */ + arch_init_sched_domains(); + + sched_domain_debug(); + + return NOTIFY_OK; +} +#endif + +static void __init sc_sched_init_smp(void) +{ + lock_cpu_hotplug(); + arch_init_sched_domains(); + sched_domain_debug(); + unlock_cpu_hotplug(); + /* XXX: Theoretical race here - CPU may be hotplugged now */ + hotcpu_notifier(update_sched_domains, 0); +} +#else +static void __init sc_sched_init_smp(void) +{ +} +#endif /* CONFIG_SMP */ + +static void __init sc_sched_init(void) +{ + runqueue_t *rq; + int i, j; + + init_task.u.scsched.prio = MAX_PRIO - 20; + init_task.static_prio = MAX_PRIO - 20; + INIT_LIST_HEAD(&init_task.u.scsched.run_list); + init_task.u.scsched.slice = HZ; + init_task.u.scsched.time_slice = HZ; + + for (i = 0; i < NR_CPUS; i++) { + + rq = cpu_rq(i); + spin_lock_init(&rq->lock); + rq->cache_ticks = 0; + rq->preempted = 0; + +#ifdef CONFIG_SMP + rq->sd = &sched_domain_dummy; + rq->cpu_load = 0; + rq->active_balance = 0; + rq->push_cpu = 0; + rq->migration_thread = NULL; + INIT_LIST_HEAD(&rq->migration_queue); + cache_delay = cache_decay_ticks * 5; +#endif + atomic_set(&rq->nr_iowait, 0); + for (j = 0; j <= MAX_PRIO; j++) + INIT_LIST_HEAD(&rq->queue[j]); + memset(rq->bitmap, 0, BITS_TO_LONGS(MAX_PRIO+1)*sizeof(long)); + /* + * delimiter for bitsearch + */ + __set_bit(MAX_PRIO, rq->bitmap); +#ifdef CONFIG_SCHEDSTATS + rq->sspcd = cpu_sspcd(i); +#endif + } + + /* + * The boot idle thread does lazy MMU switching as well: + */ + atomic_inc(&init_mm.mm_count); + enter_lazy_tlb(&init_mm, current); + + /* + * Make us the idle thread. Technically, schedule() should not be + * called from this thread, however somewhere below it might be, + * but because we are the idle thread, we just pick up running again + * when this runqueue becomes "idle". 
+ */ + init_idle(current, smp_processor_id()); + +#ifdef CONFIG_SMP + /* + * We are now the idle thread so we need to do some magic to boot + * on SMP. + */ + current->u.scsched.prio = MAX_PRIO - 2; +#endif +} + +#if defined(CONFIG_DEBUG_KERNEL)&&defined(CONFIG_SYSCTL)&&defined(CONFIG_SMP) +static struct ctl_table sd_ctl_dir[] = { + {1, "sched_domain", NULL, 0, 0755, NULL, }, + {0,}, +}; + +static struct ctl_table sd_ctl_root[] = { + {1, "kernel", NULL, 0, 0755, sd_ctl_dir, }, + {0,}, +}; +static char *sched_strdup(char *str) +{ + int n = strlen(str)+1; + char *s = kmalloc(n, GFP_KERNEL); + if (!s) + return NULL; + return strcpy(s, str); +} +static struct ctl_table *sd_alloc_ctl_entry(int n) +{ + struct ctl_table *entry = + kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL); + BUG_ON(!entry); + memset(entry, 0, n * sizeof(struct ctl_table)); + return entry; +} + +static void +set_table_entry(struct ctl_table *entry, int ctl_name, const char *procname, + void *data, int maxlen, mode_t mode, + proc_handler *proc_handler) +{ + entry->ctl_name = ctl_name; + entry->procname = procname; + entry->data = data; + entry->maxlen = maxlen; + entry->mode = mode; + entry->proc_handler = proc_handler; +} + +static struct ctl_table * +sd_alloc_ctl_domain_table(struct sched_domain *sd) +{ + struct ctl_table *table; + table = sd_alloc_ctl_entry(9); + + set_table_entry(&table[0], 1, "min_interval", &sd->min_interval, + sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[1], 2, "max_interval", &sd->max_interval, + sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[2], 3, "busy_factor", &sd->busy_factor, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[3], 4, "imbalance_pct", &sd->imbalance_pct, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[4], 5, "cache_hot_time", &sd->cache_hot_time, + sizeof(long long), 0644, proc_doulonglongvec_minmax); + set_table_entry(&table[5], 6, "cache_nice_tries", 
&sd->cache_nice_tries, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[6], 7, "per_cpu_gain", &sd->per_cpu_gain, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[7], 8, "flags", &sd->flags, + sizeof(int), 0644, proc_dointvec_minmax); + return table; +} + +static ctl_table *sd_alloc_ctl_cpu_table(int cpu) +{ + struct sched_domain *sd; + int domain_num = 0, i; + struct ctl_table *entry, *table; + char buf[32]; + for_each_domain(cpu, sd) + domain_num++; + entry = table = sd_alloc_ctl_entry(domain_num + 1); + + i = 0; + for_each_domain(cpu, sd) { + sprintf(buf, "domain-%d", i); + entry->ctl_name = i + 1; + entry->procname = sched_strdup(buf); + entry->mode = 0755; + entry->child = sd_alloc_ctl_domain_table(sd); + entry++; + i++; + } + return table; +} + +static struct ctl_table_header *sd_sysctl_header; +static void sc_init_sched_domain_sysctl(void) +{ + int i, cpu_num = num_online_cpus(); + char buf[32]; + struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); + + sd_ctl_dir[0].child = entry; + + for (i = 0; i < cpu_num; i++, entry++) { + sprintf(buf, "cpu%d", i); + entry->ctl_name = i + 1; + entry->procname = sched_strdup(buf); + entry->mode = 0755; + entry->child = sd_alloc_ctl_cpu_table(i); + } + sd_sysctl_header = register_sysctl_table(sd_ctl_root, 0); +} + +static void sc_destroy_sched_domain_sysctl(void) +{ + int cpu, cpu_num = num_online_cpus(); + struct sched_domain *sd; + struct ctl_table *root = sd_ctl_dir[0].child; + struct ctl_table *entry, *table; + + unregister_sysctl_table(sd_sysctl_header); + entry = root; + for (cpu = 0; cpu < cpu_num; cpu++, entry++) { + kfree(entry->procname); + table = entry->child; + for_each_domain(cpu, sd) { + kfree(table->procname); + kfree(table->child); + table++; + } + kfree(entry->child); + } + kfree(root); +} +#else +static void sc_init_sched_domain_sysctl(void) +{ +} +static void sc_destroy_sched_domain_sysctl(void) +{ +} +#endif + +static int sc_is_idle_task(const task_t *p) 
+{ + return p == task_rq(p)->idle; +} + +#ifdef CONFIG_MAGIC_SYSRQ +void sc_normalise_rt_tasks(void) +{ + struct task_struct *p; + unsigned long flags; + runqueue_t *rq; + int queued; + + read_lock_irq(&tasklist_lock); + for_each_process (p) { + if (!rt_task(p)) + continue; + + rq = task_rq_lock(p, &flags); + + if ((queued = task_queued(p))) + deactivate_task(p, task_rq(p)); + __setscheduler(p, SCHED_NORMAL, 0); + if (queued) { + __activate_task(p, task_rq(p)); + resched_task(rq->curr); + } + + task_rq_unlock(rq, &flags); + } + read_unlock_irq(&tasklist_lock); +} +#endif /* CONFIG_MAGIC_SYSRQ */ + +struct sched_drv sc_sched_drv = { + .task_cpu = common_task_cpu, + .set_task_cpu = common_set_task_cpu, + .init_sched_domain_sysctl = sc_init_sched_domain_sysctl, + .destroy_sched_domain_sysctl = sc_destroy_sched_domain_sysctl, + .cpusched_name = "staircase", + .rt_task = sc_rt_task, + .wait_for_completion = sc_wait_for_completion, + .io_schedule = sc_io_schedule, + .io_schedule_timeout = sc_io_schedule_timeout, + .set_oom_timeslice = sc_set_oom_timeslice, + .nr_running = sc_nr_running, + .nr_uninterruptible = sc_nr_uninterruptible, + .nr_context_switches = sc_nr_context_switches, + .nr_iowait = sc_nr_iowait, + .nr_iowait_task_cpu = sc_nr_iowait_task_cpu, + .idle_cpu = sc_idle_cpu, + .init_idle = sc_init_idle, + .exit = sc_sched_exit, + .fork = sc_sched_fork, + .init = sc_sched_init, + .init_smp = sc_sched_init_smp, + .schedule = sc_schedule, + .tick = sc_scheduler_tick, + .tail = sc_schedule_tail, + .setscheduler = sc_setscheduler, + .set_user_nice = sc_set_user_nice, + .rr_get_interval = sc_sys_sched_rr_get_interval, + .yield = sc_sys_sched_yield, + .is_idle_task = sc_is_idle_task, + .task_curr = sc_task_curr, + .task_nice = sc_task_nice, + .task_prio = sc_task_prio, + .try_to_wake_up = sc_try_to_wake_up, + .wake_up_new_task = sc_wake_up_new_task, +#ifdef CONFIG_SMP + .migration_init = sc_migration_init, + .exec = sc_sched_exec, + .set_cpus_allowed = 
sc_set_cpus_allowed, + .wait_task_inactive = sc_wait_task_inactive, + .cpu_attach_domain = sc_cpu_attach_domain, +#ifdef CONFIG_HOTPLUG_CPU + .sched_idle_next = sc_sched_idle_next, +#endif +#ifdef CONFIG_SCHEDSTATS + .show_schedstat_sd = sc_show_schedstat_sd, +#endif +#endif +#ifdef CONFIG_MAGIC_SYSRQ + .normalize_rt_tasks = sc_normalise_rt_tasks, +#endif +#ifdef CONFIG_KGDB + .kgdb_get_idle = sc_kgdb_get_idle, +#endif +}; Index: linux-2.6.10-rc1-mm5/mm/oom_kill.c =================================================================== --- linux-2.6.10-rc1-mm5.orig/mm/oom_kill.c 2004-11-11 21:42:17.000000000 +1100 +++ linux-2.6.10-rc1-mm5/mm/oom_kill.c 2004-11-11 21:48:08.000000000 +1100 @@ -156,7 +156,7 @@ static void __oom_kill_task(task_t *p) * all the memory it needs. That way it should be able to * exit() and clear out its resources quickly... */ - p->time_slice = HZ; + set_oom_timeslice(p); p->flags |= PF_MEMALLOC | PF_MEMDIE; /* This process has hardware access, be more careful. */