Index: linux-2.6.10-rc1-mm5/fs/proc/array.c =================================================================== --- linux-2.6.10-rc1-mm5.orig/fs/proc/array.c 2004-11-11 21:42:16.000000000 +1100 +++ linux-2.6.10-rc1-mm5/fs/proc/array.c 2004-11-11 21:50:23.000000000 +1100 @@ -163,7 +163,6 @@ static inline char * task_state(struct t read_lock(&tasklist_lock); buffer += sprintf(buffer, "State:\t%s\n" - "SleepAVG:\t%lu%%\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" @@ -171,7 +170,6 @@ static inline char * task_state(struct t "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), - (p->sleep_avg/1024)*100/(1020000000/1024), p->tgid, p->pid, p->pid ? p->group_leader->real_parent->tgid : 0, p->pid && p->ptrace ? p->parent->pid : 0, Index: linux-2.6.10-rc1-mm5/fs/proc/proc_misc.c =================================================================== --- linux-2.6.10-rc1-mm5.orig/fs/proc/proc_misc.c 2004-11-11 22:08:03.000000000 +1100 +++ linux-2.6.10-rc1-mm5/fs/proc/proc_misc.c 2004-11-11 22:08:30.000000000 +1100 @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -226,6 +227,18 @@ static int version_read_proc(char *page, return proc_calc_metrics(page, start, off, count, eof, len); } +static int scheduler_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + char *sched_name = scheduler->cpusched_name; + int len; + + strcpy(page, sched_name); + strcat(page, "\n"); + len = strlen(page); + return proc_calc_metrics(page, start, off, count, eof, len); +} + extern struct seq_operations cpuinfo_op; static int cpuinfo_open(struct inode *inode, struct file *file) { @@ -569,6 +582,7 @@ void __init proc_misc_init(void) {"cmdline", cmdline_read_proc}, {"locks", locks_read_proc}, {"execdomains", execdomains_read_proc}, + {"scheduler", scheduler_read_proc}, {NULL,} }; for (p = simple_ones; p->name; p++) Index: linux-2.6.10-rc1-mm5/include/linux/init_task.h 
=================================================================== --- linux-2.6.10-rc1-mm5.orig/include/linux/init_task.h 2004-11-11 21:42:16.000000000 +1100 +++ linux-2.6.10-rc1-mm5/include/linux/init_task.h 2004-11-11 21:50:20.000000000 +1100 @@ -72,14 +72,10 @@ extern struct group_info init_groups; .usage = ATOMIC_INIT(2), \ .flags = 0, \ .lock_depth = -1, \ - .prio = MAX_PRIO-20, \ - .static_prio = MAX_PRIO-20, \ .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ .mm = NULL, \ .active_mm = &init_mm, \ - .run_list = LIST_HEAD_INIT(tsk.run_list), \ - .time_slice = HZ, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ @@ -115,5 +111,4 @@ extern struct group_info init_groups; .private_pages = LIST_HEAD_INIT(tsk.private_pages), \ .private_pages_count = 0, \ } - #endif Index: linux-2.6.10-rc1-mm5/include/linux/sched.h =================================================================== --- linux-2.6.10-rc1-mm5.orig/include/linux/sched.h 2004-11-11 21:42:18.000000000 +1100 +++ linux-2.6.10-rc1-mm5/include/linux/sched.h 2004-11-11 22:08:34.000000000 +1100 @@ -32,6 +32,7 @@ #include #include #include +#include struct exec_domain; @@ -165,9 +166,6 @@ extern void show_regs(struct pt_regs *); */ extern void show_stack(struct task_struct *task, unsigned long *sp); -void io_schedule(void); -long io_schedule_timeout(long timeout); - extern void cpu_init (void); extern void trap_init(void); extern void update_process_times(int user); @@ -179,6 +177,9 @@ extern unsigned long cache_decay_ticks; /* Is this address in the __sched functions? 
*/ extern int in_sched_functions(unsigned long addr); +void __sched io_schedule(void); +long __sched io_schedule_timeout(long timeout); + #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern signed long FASTCALL(schedule_timeout(signed long timeout)); asmlinkage void schedule(void); @@ -332,11 +333,6 @@ struct signal_struct { }; /* - * Priority of a process goes from 0..MAX_PRIO-1, valid RT - * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL tasks are - * in the range MAX_RT_PRIO..MAX_PRIO-1. Priority values - * are inverted: lower p->prio value means higher priority. - * * The MAX_USER_RT_PRIO value allows the actual maximum * RT priority to be separate from the value exported to * user-space. This allows kernel threads to set their @@ -347,9 +343,7 @@ struct signal_struct { #define MAX_USER_RT_PRIO 100 #define MAX_RT_PRIO MAX_USER_RT_PRIO -#define MAX_PRIO (MAX_RT_PRIO + 40) - -#define rt_task(p) (unlikely((p)->prio < MAX_RT_PRIO)) +extern int rt_task(task_t *p); /* * Some day this will be a full-fledged user tracking system.. @@ -382,21 +376,6 @@ typedef struct prio_array prio_array_t; struct backing_dev_info; struct reclaim_state; -#ifdef CONFIG_SCHEDSTATS -struct sched_info { - /* cumulative counters */ - unsigned long cpu_time, /* time spent on the cpu */ - run_delay, /* time spent waiting on a runqueue */ - pcnt; /* # of timeslices run on this cpu */ - - /* timestamps */ - unsigned long last_arrival, /* when we last ran on a cpu */ - last_queued; /* when we were last queued to run */ -}; - -extern struct file_operations proc_schedstat_operations; -#endif - enum idle_type { SCHED_IDLE, @@ -405,6 +384,8 @@ enum idle_type MAX_IDLE_TYPES }; +#include + /* * sched-domains (multiprocessor balancing) declarations: */ @@ -467,13 +448,11 @@ struct sched_domain { #endif }; -#ifdef ARCH_HAS_SCHED_DOMAIN /* Useful helpers that arch setup code may use. 
Defined in kernel/sched.c */ -extern cpumask_t cpu_isolated_map; +extern void cpu_attach_domain(struct sched_domain *sd, int cpu); extern void init_sched_build_groups(struct sched_group groups[], cpumask_t span, int (*group_fn)(int cpu)); -extern void cpu_attach_domain(struct sched_domain *sd, int cpu); -#endif /* ARCH_HAS_SCHED_DOMAIN */ +extern cpumask_t cpu_isolated_map; #endif /* CONFIG_SMP */ @@ -517,6 +496,10 @@ int set_current_groups(struct group_info struct audit_context; /* See audit.c */ struct mempolicy; +#include + +extern struct sched_drv *scheduler; + struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ struct thread_info *thread_info; @@ -526,17 +509,11 @@ struct task_struct { int lock_depth; /* Lock depth */ - int prio, static_prio; - struct list_head run_list; - prio_array_t *array; - - unsigned long sleep_avg; - unsigned long long timestamp, last_ran; - int activated; + int static_prio; /* A commonality between cpu schedulers */ + union cpusched u; unsigned long policy; cpumask_t cpus_allowed; - unsigned int time_slice, first_time_slice; #ifdef CONFIG_SCHEDSTATS struct sched_info sched_info; @@ -747,8 +724,11 @@ extern void sched_idle_next(void); extern void set_user_nice(task_t *p, long nice); extern int task_prio(const task_t *p); extern int task_nice(const task_t *p); +extern int is_idle_task(const task_t *p); extern int task_curr(const task_t *p); extern int idle_cpu(int cpu); +extern void set_oom_timeslice(task_t *p); +extern task_t *find_process_by_pid(pid_t pid); void yield(void); @@ -774,6 +754,7 @@ static inline int kstack_end(void *addr) extern union thread_union init_thread_union; extern struct task_struct init_task; +extern struct task_struct base_init_task; extern struct mm_struct init_mm; @@ -1095,33 +1076,8 @@ extern void recalc_sigpending(void); extern void signal_wake_up(struct task_struct *t, int resume_stopped); -/* - * Wrappers for p->thread_info->cpu access. No-op on UP. 
- */ -#ifdef CONFIG_SMP - -static inline unsigned int task_cpu(const struct task_struct *p) -{ - return p->thread_info->cpu; -} - -static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) -{ - p->thread_info->cpu = cpu; -} - -#else - -static inline unsigned int task_cpu(const struct task_struct *p) -{ - return 0; -} - -static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) -{ -} - -#endif /* CONFIG_SMP */ +extern unsigned int task_cpu(const struct task_struct *p); +extern void set_task_cpu(struct task_struct *p, unsigned int cpu); #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT extern void arch_pick_mmap_layout(struct mm_struct *mm); Index: linux-2.6.10-rc1-mm5/include/linux/schedstats.h =================================================================== --- linux-2.6.10-rc1-mm5.orig/include/linux/schedstats.h 2003-03-27 19:01:40.000000000 +1100 +++ linux-2.6.10-rc1-mm5/include/linux/schedstats.h 2004-11-11 21:53:42.000000000 +1100 @@ -0,0 +1,78 @@ +#ifndef _LINUX_SCHEDSTATS_H +#define _LINUX_SCHEDSTATS_H + +#ifdef CONFIG_SCHEDSTATS +struct sched_info { + /* cumulative counters */ + unsigned long cpu_time, /* time spent on the cpu */ + run_delay, /* time spent waiting on a runqueue */ + pcnt; /* # of timeslices run on this cpu */ + + /* timestamps */ + unsigned long last_arrival, /* when we last ran on a cpu */ + last_queued; /* when we were last queued to run */ +}; + +typedef struct schedstat_per_cpu_data schedstat_pcd_t; + +struct schedstat_per_cpu_data { + /* latency stats */ + struct sched_info rq_sched_info; + + /* sys_sched_yield() stats */ + unsigned long yld_exp_empty; + unsigned long yld_act_empty; + unsigned long yld_both_empty; + unsigned long yld_cnt; + + /* schedule() stats */ + unsigned long sched_noswitch; + unsigned long sched_switch; + unsigned long sched_cnt; + unsigned long sched_goidle; + + /* pull_task() stats */ + unsigned long pt_gained[MAX_IDLE_TYPES]; + unsigned long pt_lost[MAX_IDLE_TYPES]; + + /* active_load_balance() 
stats */ + unsigned long alb_cnt; + unsigned long alb_lost; + unsigned long alb_gained; + unsigned long alb_failed; + + /* try_to_wake_up() stats */ + unsigned long ttwu_cnt; + unsigned long ttwu_attempts; + unsigned long ttwu_moved; + + /* wake_up_new_task() stats */ + unsigned long wunt_cnt; + unsigned long wunt_moved; + + /* sched_migrate_task() stats */ + unsigned long smt_cnt; + + /* sched_balance_exec() stats */ + unsigned long sbe_cnt; +}; + +extern struct file_operations proc_schedstat_operations; +DECLARE_PER_CPU(struct schedstat_per_cpu_data, schedstat_pcd_data); +/* Per-cpu stats block for @cpu, and for the cpu that task @p is on. */ +#define cpu_sspcd(cpu) (&per_cpu(schedstat_pcd_data, (cpu))) +#define task_sspcd(p) (cpu_sspcd(task_cpu(p))) + +extern void sched_info_switch(task_t *prev, task_t *next); +extern void sched_info_queued(task_t *t); +/* Statement-like: one statement each, the caller supplies the ';'. */ +# define schedstat_inc(sspcd, field) do { (sspcd)->field++; } while (0) +# define schedstat_add(sspcd, field, amt) do { (sspcd)->field += (amt); } while (0) +#else /* !CONFIG_SCHEDSTATS */ +# define schedstat_inc(sspcd, field) do { } while (0) +# define schedstat_add(sspcd, field, amt) do { } while (0) +# define sched_info_queued(t) do { } while (0) +# define sched_info_switch(t, next) do { } while (0) +#endif + +#endif /* _LINUX_SCHEDSTATS_H */ Index: linux-2.6.10-rc1-mm5/include/linux/scheduler.h =================================================================== --- linux-2.6.10-rc1-mm5.orig/include/linux/scheduler.h 2003-03-27 19:01:40.000000000 +1100 +++ linux-2.6.10-rc1-mm5/include/linux/scheduler.h 2004-11-11 22:08:35.000000000 +1100 @@ -0,0 +1,129 @@ +#ifndef _LINUX_SCHEDULER_H +#define _LINUX_SCHEDULER_H +/* + * include/linux/scheduler.h + * This contains the driver struct for all the exported per-cpu-scheduler + * functions, and the private per-scheduler data in task_struct. + */ + +#define SCHED_NAME_MAX (16) + +/* + * This is the main scheduler driver struct. 
+ */ +struct sched_drv +{ + unsigned int (*task_cpu)(const struct task_struct *); + void (*set_task_cpu)(struct task_struct *, unsigned int); + void (*init_sched_domain_sysctl)(void); + void (*destroy_sched_domain_sysctl)(void); + char cpusched_name[SCHED_NAME_MAX]; + int (*rt_task)(const task_t *); + void (*wait_for_completion)(struct completion *); + void (*io_schedule)(void); + long (*io_schedule_timeout)(long); + void (*sched_idle_next)(void); + void (*set_oom_timeslice)(task_t *); + unsigned long (*nr_running)(void); + unsigned long (*nr_uninterruptible)(void); + unsigned long long (*nr_context_switches)(void); + unsigned long (*nr_iowait)(void); + unsigned long (*nr_iowait_task_cpu)(const task_t *); + int (*idle_cpu)(int); + void (*init_idle)(task_t *, int); + void (*exit)(task_t *); + void (*fork)(task_t *); + void (*init)(void); + void (*init_smp)(void); + void (*schedule)(void); + void (*tick)(void); + void (*tail)(task_t *); + int (*setscheduler)(pid_t, int, struct sched_param __user *); + void (*set_user_nice)(task_t *, long); + long (*rr_get_interval)(pid_t, struct timespec __user *); + long (*yield)(void); + int (*is_idle_task)(const task_t *); + int (*task_curr)(const task_t *); + int (*task_nice)(const task_t *); + int (*task_prio)(const task_t *); + int (*try_to_wake_up)(task_t *, unsigned, int); + void (*wake_up_new_task)(task_t *, unsigned long); +#ifdef CONFIG_SMP + int (*migration_init)(void); + void (*exec)(void); + int (*set_cpus_allowed)(task_t *, cpumask_t); + void (*wait_task_inactive)(task_t *); + void (*cpu_attach_domain)(struct sched_domain *, int); +#ifdef CONFIG_SCHEDSTATS + void (*show_schedstat_sd)(struct seq_file *, int); +#endif +#endif +#ifdef CONFIG_MAGIC_SYSRQ + void (*normalize_rt_tasks)(void); +#endif +#ifdef CONFIG_KGDB + struct task_struct * (*kgdb_get_idle)(int); +#endif +}; + +/* + * List functions that have common variants that many schedulers use. 
+ */ +extern unsigned int common_task_cpu(const struct task_struct *p); +extern void common_set_task_cpu(struct task_struct *p, unsigned int cpu); + +/* + * All private per-scheduler entries in task_struct are defined here as + * separate structs placed into the cpusched union in task_struct. + */ + +/* Ingosched */ +#ifdef CONFIG_CPUSCHED_INGO +struct cpusched_ingo { + int prio; + struct list_head run_list; + prio_array_t *array; + unsigned int time_slice; + unsigned int first_time_slice; + unsigned long sleep_avg; + unsigned long long timestamp; /* 64-bit, as in the task_struct field it replaces */ + unsigned long long last_ran; + int activated; +}; +#endif + +/* Staircase scheduler */ +#ifdef CONFIG_CPUSCHED_STAIRCASE +struct cpusched_sc { + int prio; + struct list_head run_list; + unsigned long sflags; + unsigned long long timestamp; + unsigned long runtime, totalrun, ns_debit; + unsigned int burst; + unsigned int slice, time_slice; +}; +#endif + +/* Minisched scheduler */ +#ifdef CONFIG_CPUSCHED_MINISCHED +struct cpusched_ms { + int prio; + struct list_head run_list; + unsigned int time_slice; +}; +#endif + +union cpusched { +#ifdef CONFIG_CPUSCHED_INGO + struct cpusched_ingo ingosched; +#endif +#ifdef CONFIG_CPUSCHED_STAIRCASE + struct cpusched_sc scsched; +#endif +#ifdef CONFIG_CPUSCHED_MINISCHED + struct cpusched_ms mssched; +#endif +}; + +#endif Index: linux-2.6.10-rc1-mm5/init/Kconfig =================================================================== --- linux-2.6.10-rc1-mm5.orig/init/Kconfig 2004-11-11 21:42:16.000000000 +1100 +++ linux-2.6.10-rc1-mm5/init/Kconfig 2004-11-11 22:08:35.000000000 +1100 @@ -249,6 +249,48 @@ config IKCONFIG_PROC through /proc/config.gz. +config PLUGSCHED + bool "Support for multiple cpu schedulers" + default y + help + Say Y here if you want to compile in support for multiple + cpu schedulers. The cpu scheduler may be selected at boot time + with the boot parameter "cpusched=". 
The choice of which cpu + schedulers to compile into the kernel can be made by enabling + "Configure standard kernel features" otherwise all cpu schedulers + supported will be compiled in. + +choice + prompt "Default cpu scheduler" + help + This option allows you to choose which cpu scheduler shall be + booted by default at startup if you have plugsched support, or + it will choose which is the only scheduler compiled in. + +config CPUSCHED_DEFAULT_INGO + bool "Ingosched cpu scheduler" + select CPUSCHED_INGO + ---help--- + This is the default cpu scheduler which is an O(1) dual priority + array scheduler with a hybrid interactive design. + +config CPUSCHED_DEFAULT_STAIRCASE + bool "Staircase cpu scheduler" + select CPUSCHED_STAIRCASE + ---help--- + This scheduler is an O(1) single priority array with a foreground- + background interactive design. + +config CPUSCHED_DEFAULT_MINISCHED + bool "Minisched cpu scheduler" + depends on !SMP + select CPUSCHED_MINISCHED + ---help--- + This scheduler is a low overhead O(1) single priority rr scheduler + for uniprocessor only. + +endchoice + menuconfig EMBEDDED bool "Configure standard kernel features (for small systems)" help @@ -257,6 +299,36 @@ menuconfig EMBEDDED environments which can tolerate a "non-standard" kernel. Only use this if you really know what you are doing. +config CPUSCHED_INGO + bool "Ingosched cpu scheduler" if EMBEDDED + depends on PLUGSCHED + default y + ---help--- + This is the default cpu scheduler which is an O(1) dual priority + array scheduler with a hybrid interactive design. + To boot this cpu scheduler, if it is not the default, use the + bootparam "cpusched=ingosched". + +config CPUSCHED_STAIRCASE + bool "Staircase cpu scheduler" if EMBEDDED + depends on PLUGSCHED + default y + ---help--- + This scheduler is an O(1) single priority array with a foreground- + background interactive design. + To boot this cpu scheduler, if it is not the default, use the + bootparam "cpusched=staircase". 
+ +config CPUSCHED_MINISCHED + bool "Minisched cpu scheduler" if EMBEDDED + depends on PLUGSCHED && !SMP + default y + ---help--- + This scheduler is a low overhead O(1) single priority rr scheduler + for uniprocessor only. + To boot this cpu scheduler, if it is not the default, use the + bootparam "cpusched=minisched". + config KALLSYMS bool "Load all symbols for debugging/kksymoops" if EMBEDDED default y Index: linux-2.6.10-rc1-mm5/init/main.c =================================================================== --- linux-2.6.10-rc1-mm5.orig/init/main.c 2004-11-11 21:42:16.000000000 +1100 +++ linux-2.6.10-rc1-mm5/init/main.c 2004-11-11 21:52:40.000000000 +1100 @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -416,10 +417,11 @@ void __init parse_early_param(void) done = 1; } +struct task_struct base_init_task; + /* * Activate the first processor. */ - asmlinkage void __init start_kernel(void) { char * command_line; @@ -441,6 +443,11 @@ asmlinkage void __init start_kernel(void smp_prepare_boot_cpu(); /* + * Save a copy of the baseline init_task in case we need to start + * another cpu scheduler. + */ + base_init_task = init_task; + /* * Set up the scheduler prior starting any interrupts (such as the * timer interrupt). Full topology setup happens at smp_init() * time - but meanwhile we still have a functioning scheduler. @@ -519,6 +526,7 @@ asmlinkage void __init start_kernel(void acpi_early_init(); /* before LAPIC and SMP init */ + printk("Running with %s cpu scheduler.\n", scheduler->cpusched_name); /* Do the rest non-__init'ed, we're now alive */ rest_init(); } Index: linux-2.6.10-rc1-mm5/kernel/Makefile =================================================================== --- linux-2.6.10-rc1-mm5.orig/kernel/Makefile 2004-11-11 21:42:17.000000000 +1100 +++ linux-2.6.10-rc1-mm5/kernel/Makefile 2004-11-11 22:08:35.000000000 +1100 @@ -2,13 +2,16 @@ # Makefile for the linux kernel. 
# -obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ +obj-y = scheduler.o fork.o exec_domain.o panic.o printk.o profile.o \ exit.o itimer.o time.o softirq.o resource.o \ sysctl.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o intermodule.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o +obj-$(CONFIG_CPUSCHED_INGO) += sched.o +obj-$(CONFIG_CPUSCHED_STAIRCASE) += staircase.o +obj-$(CONFIG_CPUSCHED_MINISCHED) += minisched.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o Index: linux-2.6.10-rc1-mm5/kernel/minisched.c =================================================================== --- linux-2.6.10-rc1-mm5.orig/kernel/minisched.c 2003-03-27 19:01:40.000000000 +1100 +++ linux-2.6.10-rc1-mm5/kernel/minisched.c 2004-11-11 22:08:35.000000000 +1100 @@ -0,0 +1,960 @@ +/* + * kernel/minisched.c + * + * This is "minisched"; a minimalist uniprocessor scheduler. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define MAX_PRIO (MAX_RT_PRIO + 1) + +#define RR_INTERVAL (10 * HZ / 1000 ? : 1) + +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) + +static unsigned int task_timeslice(task_t *p) +{ + return NICE_TO_PRIO(p->static_prio) * RR_INTERVAL; +} + +typedef struct runqueue runqueue_t; + +/* + * This is the runqueue data structure. 
+ */ +struct runqueue { + spinlock_t lock; + + unsigned long nr_running; + unsigned long long nr_switches; + unsigned long nr_uninterruptible; + task_t *curr, *idle; + struct mm_struct *prev_mm; + unsigned long bitmap[BITS_TO_LONGS(MAX_PRIO+1)]; + struct list_head queue[MAX_PRIO + 1]; + atomic_t nr_iowait; +}; + +static DEFINE_PER_CPU(struct runqueue, runqueues); + +static runqueue_t *rq = &per_cpu(runqueues, 0); + +static int ms_rt_task(const task_t *p) +{ + return (unlikely((p)->u.mssched.prio < MAX_RT_PRIO)); +} + +/* + * Default context-switch locking: + */ +#ifndef prepare_arch_switch +# define prepare_arch_switch(next) do { } while (0) +# define finish_arch_switch(next) spin_unlock_irq(&rq->lock) +# define task_running(p) (rq->curr == (p)) +#endif + +/* + * task_rq_lock - lock the runqueue and disable + * interrupts. + */ +static void task_rq_lock(unsigned long *flags) +{ + local_irq_save(*flags); + spin_lock(&rq->lock); +} + +static void task_rq_unlock(unsigned long *flags) +{ + spin_unlock_irqrestore(&rq->lock, *flags); +} + +/* + * rq_lock - lock the runqueue and disable interrupts. 
+ */ +static void rq_lock(void) +{ + local_irq_disable(); + spin_lock(&rq->lock); +} + +static int task_queued(task_t *task) +{ + return !list_empty(&task->u.mssched.run_list); +} + +/* + * Adding/removing a task to/from a runqueue: + */ +static void dequeue_task(struct task_struct *p) +{ + list_del_init(&p->u.mssched.run_list); + if (list_empty(rq->queue + p->u.mssched.prio)) + __clear_bit(p->u.mssched.prio, rq->bitmap); +} + +static void enqueue_task(struct task_struct *p) +{ + list_add_tail(&p->u.mssched.run_list, rq->queue + p->u.mssched.prio); + __set_bit(p->u.mssched.prio, rq->bitmap); +} + +static void requeue_task(struct task_struct *p) +{ + list_move_tail(&p->u.mssched.run_list, rq->queue + p->u.mssched.prio); +} + +static void ms_set_oom_timeslice(task_t *p) +{ + p->u.mssched.time_slice = HZ; +} + +static void __activate_task(task_t *p) +{ + enqueue_task(p); + rq->nr_running++; +} + +static void activate_task(task_t *p) +{ + p->u.mssched.time_slice = task_timeslice(p); + __activate_task(p); +} + +/* + * deactivate_task - remove a task from the runqueue. + */ +static void deactivate_task(struct task_struct *p) +{ + rq->nr_running--; + if (p->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible++; + dequeue_task(p); +} + +/* + * resched_task - mark a task 'to be rescheduled now'. + * + * On UP this means the setting of the need_resched flag; + */ +static inline void resched_task(task_t *p) +{ + set_tsk_need_resched(p); +} + +/** + * task_curr - is this task currently executing? + * @p: the task in question. + */ +static int ms_task_curr(const task_t *p) +{ + return (rq->curr == p); +} + +/* + * Check to see if p preempts rq->curr: resched rq->curr if p has a lower + * prio value, or the same prio and a larger time_slice.
+ */ +static void preempt(task_t *p) +{ + if (likely(p->u.mssched.prio == rq->curr->u.mssched.prio)) { + /* This is true for all non rt tasks */ + if (p->u.mssched.time_slice > rq->curr->u.mssched.time_slice) + /* This selects out higher priority normal tasks */ + resched_task(rq->curr); + goto out; + } + if (p->u.mssched.prio > rq->curr->u.mssched.prio) + /* + * This is a lower priority real time task or a normal task + * while a real time task is running. + */ + goto out; + resched_task(rq->curr); +out: + return; +} + +/*** + * try_to_wake_up - wake up a thread + * @p: the to-be-woken-up thread + * @state: the mask of task states that can be woken + * @sync: do a synchronous wakeup? + * + * Put it on the run-queue if it's not already there. The "current" + * thread is always on the run-queue (except when the actual + * re-schedule is in progress), and as such you're allowed to do + * the simpler "current->state = TASK_RUNNING" to mark yourself + * runnable without the overhead of this. + * + * returns failure only if the task is already active. + */ +static int ms_try_to_wake_up(task_t * p, unsigned int state, int sync) +{ + int success = 0; + unsigned long flags; + long old_state; + + task_rq_lock(&flags); + old_state = p->state; + if (!(old_state & state)) + goto out; + + if (task_queued(p)) + goto out_running; + + if (old_state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible--; + + /* + * Sync wakeups (i.e. those types of wakeups where the waker + * has indicated that it will leave the CPU in short order) + * don't trigger a preemption, if the woken up task will run on + * this cpu. (in this case the 'I will reschedule' promise of + * the waker guarantees that the freshly woken up task is going + * to be considered on this CPU.) 
+ */ + activate_task(p); + if (!sync) + preempt(p); + success = 1; + +out_running: + p->state = TASK_RUNNING; +out: + task_rq_unlock(&flags); + + return success; +} + +/* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. + */ +static void ms_sched_fork(task_t *p) +{ + /* + * We mark the process as running here, but have not actually + * inserted it onto the runqueue yet. This guarantees that + * nobody will actually run it, and a signal or other external + * event cannot wake it up and insert it on the runqueue either. + */ + p->state = TASK_RUNNING; + INIT_LIST_HEAD(&p->u.mssched.run_list); + spin_lock_init(&p->switch_lock); +#ifdef CONFIG_SCHEDSTATS + memset(&p->sched_info, 0, sizeof(p->sched_info)); +#endif +#ifdef CONFIG_PREEMPT + /* + * During context-switch we hold precisely one spinlock, which + * schedule_tail drops. (in the common case it's rq->lock, + * but it also can be p->switch_lock.) So we compensate with a count + * of 1. Also, we want to start with kernel preemption disabled. + */ + p->thread_info->preempt_count = 1; +#endif +} + +/* + * wake_up_new_task - wake up a newly created task for the first time. + * + * This function will do some initial scheduler statistics housekeeping + * that must be done for every newly created context, then puts the task + * on the runqueue and wakes it. + */ +static void ms_wake_up_new_task(task_t * p, unsigned long clone_flags) +{ + unsigned long flags; + + task_rq_lock(&flags); + + BUG_ON(p->state != TASK_RUNNING); + + __activate_task(p); + task_rq_unlock(&flags); +} + +static void ms_sched_exit(task_t * p) +{ +} + +/** + * finish_task_switch - clean up after a task-switch + * @prev: the thread we just switched away from. + * + * We enter this with the runqueue still locked, and finish_arch_switch() + * will unlock it along with doing any other architecture-specific cleanup + * actions. + * + * Note that we may have delayed dropping an mm in context_switch(). 
If + * so, we finish that here outside of the runqueue lock. (Doing it + * with the lock held can cause deadlocks; see schedule() for + * details.) + */ +static void finish_task_switch(task_t *prev) +{ + struct mm_struct *mm = rq->prev_mm; + unsigned long prev_task_flags; + + rq->prev_mm = NULL; + + /* + * A task struct has one reference for the use as "current". + * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and + * calls schedule one last time. The schedule call will never return, + * and the scheduled task must drop that reference. + * The test for EXIT_ZOMBIE must occur while the runqueue locks are + * still held, otherwise prev could be scheduled on another cpu, die + * there before we look at prev->state, and then the reference would + * be dropped twice. + * Manfred Spraul + */ + prev_task_flags = prev->flags; + finish_arch_switch(prev); + if (mm) + mmdrop(mm); + if (unlikely(prev_task_flags & PF_DEAD)) + put_task_struct(prev); +} + +/** + * schedule_tail - first thing a freshly forked thread must call. + * @prev: the thread we just switched away from. + */ +static void ms_schedule_tail(task_t *prev) +{ + finish_task_switch(prev); + + if (current->set_child_tid) + put_user(current->pid, current->set_child_tid); +} + +/* + * context_switch - switch to the new MM and the new + * thread's register state. + */ +static inline +task_t * context_switch(task_t *prev, task_t *next) +{ + struct mm_struct *mm = next->mm; + struct mm_struct *oldmm = prev->active_mm; + + if (unlikely(!mm)) { + next->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, next); + } else + switch_mm(oldmm, mm, next); + + if (unlikely(!prev->mm)) { + prev->active_mm = NULL; + WARN_ON(rq->prev_mm); + rq->prev_mm = oldmm; + } + + /* Here we just switch the register state and the stack. 
*/ + switch_to(prev, next, prev); + + return prev; +} + +/* + * nr_running, nr_uninterruptible and nr_context_switches: + * + * externally visible scheduler statistics: current number of runnable + * threads, current number of uninterruptible-sleeping threads, total + * number of context switches performed since bootup. + */ +static unsigned long ms_nr_running(void) +{ + return rq->nr_running; +} + +static unsigned long ms_nr_uninterruptible(void) +{ + return rq->nr_uninterruptible; +} + +static unsigned long long ms_nr_context_switches(void) +{ + return rq->nr_switches; +} + +static unsigned long ms_nr_iowait(void) +{ + return atomic_read(&rq->nr_iowait); +} + +static unsigned long ms_nr_iowait_task_cpu(const task_t *p) +{ + return atomic_read(&rq->nr_iowait); +} + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + */ +static void ms_scheduler_tick(void) +{ + task_t *p = current; + + if (p == rq->idle) + return; + + /* Task might have expired already, but not scheduled off yet */ + if (unlikely(!task_queued(p))) { + set_tsk_need_resched(p); + return; + } + + /* + * SCHED_FIFO tasks never run out of timeslice. + */ + if (unlikely(p->policy == SCHED_FIFO)) + return; + + spin_lock(&rq->lock); + + if (!--p->u.mssched.time_slice) { + p->u.mssched.time_slice = task_timeslice(p); + set_tsk_need_resched(p); + requeue_task(p); + } + spin_unlock(&rq->lock); +} + +/* + * schedule() is the main scheduler function. + */ +static void __sched ms_schedule(void) +{ + long *switch_count; + task_t *prev, *next; + + struct list_head *queue; + int idx; + + /* + * Test if we are atomic. Since do_exit() needs to call into + * schedule() atomically, we ignore that path for now. + * Otherwise, whine if we are scheduling when we should not be. 
+ */ + if (likely(!(current->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)))) { + if (unlikely(in_atomic())) { + printk(KERN_ERR "scheduling while atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), current->pid); + dump_stack(); + } + } + profile_hit(SCHED_PROFILING, __builtin_return_address(0)); + +need_resched: + preempt_disable(); + prev = current; + release_kernel_lock(prev); + +need_resched_nonpreemptible: + spin_lock_irq(&rq->lock); + + if (unlikely(current->flags & PF_DEAD)) + current->state = EXIT_DEAD; + /* + * if entering off of a kernel preemption go straight + * to picking the next task. + */ + switch_count = &prev->nivcsw; + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + switch_count = &prev->nvcsw; + if (unlikely((prev->state & TASK_INTERRUPTIBLE) && + unlikely(signal_pending(prev)))) + prev->state = TASK_RUNNING; + else + deactivate_task(prev); + } + + if (unlikely(!rq->nr_running)) { + next = rq->idle; + goto switch_tasks; + } + + idx = sched_find_first_bit(rq->bitmap); + queue = rq->queue + idx; + next = list_entry(queue->next, task_t, u.mssched.run_list); + +switch_tasks: + prefetch(next); + clear_tsk_need_resched(prev); + rcu_qsctr_inc(task_cpu(prev)); + + if (likely(prev != next)) { + rq->nr_switches++; + rq->curr = next; + ++*switch_count; + + prepare_arch_switch(next); + prev = context_switch(prev, next); + barrier(); + + finish_task_switch(prev); + } else + spin_unlock_irq(&rq->lock); + + prev = current; + if (unlikely(reacquire_kernel_lock(prev) < 0)) + goto need_resched_nonpreemptible; + preempt_enable_no_resched(); + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + goto need_resched; +} + +static void __sched ms_wait_for_completion(struct completion *x) +{ + might_sleep(); + spin_lock_irq(&x->wait.lock); + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + wait.flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(&x->wait, &wait); + do { + __set_current_state(TASK_UNINTERRUPTIBLE); + 
spin_unlock_irq(&x->wait.lock); + schedule(); + spin_lock_irq(&x->wait.lock); + } while (!x->done); + __remove_wait_queue(&x->wait, &wait); + } + x->done--; + spin_unlock_irq(&x->wait.lock); +} + +static void ms_set_user_nice(task_t *p, long nice) +{ + unsigned long flags; + + if (TASK_NICE(p) == nice || nice < -20 || nice > 19) + return; + /* + * We have to be careful, if called from sys_setpriority(), + * the task might be in the middle of scheduling on another CPU. + */ + task_rq_lock(&flags); + /* + * The RT priorities are set via setscheduler(), but we still + * allow the 'normal' nice value to be set - but as expected + * it won't have any effect on scheduling until the task is + * not SCHED_NORMAL: + */ + if (rt_task(p)) { + p->static_prio = NICE_TO_PRIO(nice); + goto out_unlock; + } + + p->static_prio = NICE_TO_PRIO(nice); + +out_unlock: + task_rq_unlock(&flags); +} + +/** + * task_prio - return the priority value of a given task. + * @p: the task in question. + * + * This is the priority value as seen by users in /proc. + * RT tasks are negative: -(rt_priority + 1). Normal tasks are all 0. + */ +static int ms_task_prio(const task_t *p) +{ + return p->u.mssched.prio - MAX_RT_PRIO; +} + +/** + * task_nice - return the nice value of a given task. + * @p: the task in question. + */ +static int ms_task_nice(const task_t *p) +{ + return TASK_NICE(p); +} + +/** + * idle_cpu - is the cpu idle currently? + */ +static int ms_idle_cpu(int cpu) +{ + return rq->curr == rq->idle; +} + +/* Actually do priority change: must hold rq lock. */ +static void __setscheduler(struct task_struct *p, int policy, int prio) +{ + BUG_ON(task_queued(p)); + p->policy = policy; + p->rt_priority = prio; + if (policy != SCHED_NORMAL) + p->u.mssched.prio = MAX_RT_PRIO - 1 - p->rt_priority; + else + p->u.mssched.prio = MAX_RT_PRIO; +} + +/* + * setscheduler - change the scheduling policy and/or RT priority of a thread. 
+ */ +static int ms_setscheduler(pid_t pid, int policy, struct sched_param __user *param) +{ + struct sched_param lp; + int retval = -EINVAL; + int queued, oldprio, oldpolicy = -1; + unsigned long flags; + task_t *p; + + if (!param || pid < 0) + goto out_nounlock; + + retval = -EFAULT; + if (copy_from_user(&lp, param, sizeof(struct sched_param))) + goto out_nounlock; + + /* + * We play safe to avoid deadlocks. + */ + read_lock_irq(&tasklist_lock); + + p = find_process_by_pid(pid); + + retval = -ESRCH; + if (!p) + goto out_unlock; +recheck: + /* double check policy once rq lock held */ + if (policy < 0) + policy = oldpolicy = p->policy; + else { + retval = -EINVAL; + if (policy != SCHED_FIFO && policy != SCHED_RR && + policy != SCHED_NORMAL) + goto out_unlock; + } + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. + */ + retval = -EINVAL; + if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1) + goto out_unlock; + if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0)) + goto out_unlock; + + retval = -EPERM; + if ((policy == SCHED_FIFO || policy == SCHED_RR) && + !capable(CAP_SYS_NICE)) + goto out_unlock; + if ((current->euid != p->euid) && (current->euid != p->uid) && + !capable(CAP_SYS_NICE)) + goto out_unlock; + + retval = security_task_setscheduler(p, policy, &lp); + if (retval) + goto out_unlock; + /* + * To be able to change p->policy safely, the + * runqueue lock must be held. 
+ */ + task_rq_lock(&flags); + /* recheck policy now with rq lock held */ + if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { + policy = oldpolicy = -1; + task_rq_unlock(&flags); + goto recheck; + } + if ((queued = task_queued(p))) + deactivate_task(p); + retval = 0; + oldprio = p->u.mssched.prio; + __setscheduler(p, policy, lp.sched_priority); + if (queued) { + __activate_task(p); + /* + * Reschedule if we are currently running and + * our priority decreased, or if we are not currently running + * and our priority is higher than the current's + */ + if (task_running(p)) { + if (p->u.mssched.prio > oldprio) + resched_task(rq->curr); + } else + preempt(p); + } + task_rq_unlock(&flags); +out_unlock: + read_unlock_irq(&tasklist_lock); +out_nounlock: + return retval; +} + +/** + * sys_sched_yield - yield the current processor to other threads. + * + * This function yields the current CPU by dropping to end of the runqueue. + */ +static long ms_sys_sched_yield(void) +{ + task_t *p = current; + rq_lock(); + + set_tsk_need_resched(p); + requeue_task(current); + current->u.mssched.time_slice = task_timeslice(current); + + /* + * Since we are going to call schedule() anyway, there's + * no need to preempt or enable interrupts: + */ + _raw_spin_unlock(&rq->lock); + preempt_enable_no_resched(); + + schedule(); + + return 0; +} + +/* + * This task is about to go to sleep on IO. Increment rq->nr_iowait so + * that process accounting knows that this is a task in IO wait state. 
+ * + * But don't do that if it is a deliberate, throttling IO wait (this task + * has set its backing_dev_info: the queue against which it should throttle) + */ +static void __sched ms_io_schedule(void) +{ + atomic_inc(&rq->nr_iowait); + schedule(); + atomic_dec(&rq->nr_iowait); +} + +static long __sched ms_io_schedule_timeout(long timeout) +{ + long ret; + + atomic_inc(&rq->nr_iowait); + ret = schedule_timeout(timeout); + atomic_dec(&rq->nr_iowait); + return ret; +} + +/** + * sys_sched_rr_get_interval - return the default timeslice of a process. + * @pid: pid of the process. + * @interval: userspace pointer to the timeslice value. + * + * this syscall writes the default timeslice value of a given process + * into the user-space timespec buffer. A value of '0' means infinity. + */ +static long +ms_sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) +{ + int retval = -EINVAL; + struct timespec t; + task_t *p; + + if (pid < 0) + goto out_nounlock; + + retval = -ESRCH; + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + jiffies_to_timespec(p->policy & SCHED_FIFO ? + 0 : task_timeslice(p), &t); + read_unlock(&tasklist_lock); + retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; +out_nounlock: + return retval; +out_unlock: + read_unlock(&tasklist_lock); + return retval; +} + +static void __devinit ms_init_idle(task_t *idle, int cpu) +{ + unsigned long flags; + + idle->u.mssched.prio = MAX_RT_PRIO + 1; + idle->state = TASK_RUNNING; + set_task_cpu(idle, cpu); + + spin_lock_irqsave(&rq->lock, flags); + rq->curr = rq->idle = idle; + set_tsk_need_resched(idle); + spin_unlock_irqrestore(&rq->lock, flags); + + /* Set the preempt count _outside_ the spinlocks! 
*/ +#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) + idle->thread_info->preempt_count = (idle->lock_depth >= 0); +#else + idle->thread_info->preempt_count = 0; +#endif +} + +static void __init ms_sched_init_smp(void) +{ +} + +static void __init ms_sched_init(void) +{ + int i; + + init_task.u.mssched.prio = MAX_RT_PRIO; + init_task.static_prio = MAX_RT_PRIO + 20; + INIT_LIST_HEAD(&init_task.u.mssched.run_list); + init_task.u.mssched.time_slice = HZ; + + spin_lock_init(&rq->lock); + + atomic_set(&rq->nr_iowait, 0); + for (i = 0; i <= MAX_PRIO; i++) + INIT_LIST_HEAD(&rq->queue[i]); + memset(rq->bitmap, 0, BITS_TO_LONGS(MAX_PRIO + 1)*sizeof(long)); + /* + * delimiter for bitsearch + */ + __set_bit(MAX_PRIO + 1, rq->bitmap); + + /* + * The boot idle thread does lazy MMU switching as well: + */ + atomic_inc(&init_mm.mm_count); + enter_lazy_tlb(&init_mm, current); + + /* + * Make us the idle thread. Technically, schedule() should not be + * called from this thread, however somewhere below it might be, + * but because we are the idle thread, we just pick up running again + * when this runqueue becomes "idle". 
+ */ + init_idle(current, 0); +} + +static void ms_init_sched_domain_sysctl(void) +{ +} +static void ms_destroy_sched_domain_sysctl(void) +{ +} + +#ifdef CONFIG_MAGIC_SYSRQ +void ms_normalise_rt_tasks(void) +{ + struct task_struct *p; + unsigned long flags; + int queued; + + read_lock_irq(&tasklist_lock); + for_each_process (p) { + if (!rt_task(p)) + continue; + + task_rq_lock(&flags); + + if ((queued = task_queued(p))) + deactivate_task(p); + __setscheduler(p, SCHED_NORMAL, 0); + if (queued) { + __activate_task(p); + resched_task(rq->curr); + } + + task_rq_unlock(&flags); + } + read_unlock_irq(&tasklist_lock); +} +#endif + +#ifdef CONFIG_KGDB +static struct task_struct *ms_kgdb_get_idle(int this_cpu) +{ + return rq->idle; +} +#endif + +struct sched_drv ms_sched_drv = { + .task_cpu = common_task_cpu, + .set_task_cpu = common_set_task_cpu, + .init_sched_domain_sysctl = ms_init_sched_domain_sysctl, + .destroy_sched_domain_sysctl = ms_destroy_sched_domain_sysctl, + .cpusched_name = "minisched", + .rt_task = ms_rt_task, + .wait_for_completion = ms_wait_for_completion, + .io_schedule = ms_io_schedule, + .io_schedule_timeout = ms_io_schedule_timeout, + .set_oom_timeslice = ms_set_oom_timeslice, + .nr_running = ms_nr_running, + .nr_uninterruptible = ms_nr_uninterruptible, + .nr_context_switches = ms_nr_context_switches, + .nr_iowait = ms_nr_iowait, + .nr_iowait_task_cpu = ms_nr_iowait_task_cpu, + .idle_cpu = ms_idle_cpu, + .init_idle = ms_init_idle, + .exit = ms_sched_exit, + .fork = ms_sched_fork, + .init = ms_sched_init, + .init_smp = ms_sched_init_smp, + .schedule = ms_schedule, + .tick = ms_scheduler_tick, + .tail = ms_schedule_tail, + .setscheduler = ms_setscheduler, + .set_user_nice = ms_set_user_nice, + .rr_get_interval = ms_sys_sched_rr_get_interval, + .yield = ms_sys_sched_yield, + .task_curr = ms_task_curr, + .task_nice = ms_task_nice, + .task_prio = ms_task_prio, + .try_to_wake_up = ms_try_to_wake_up, + .wake_up_new_task = ms_wake_up_new_task, +#ifdef 
CONFIG_MAGIC_SYSRQ + .normalize_rt_tasks = ms_normalise_rt_tasks, +#endif +#ifdef CONFIG_KGDB + .kgdb_get_idle = ms_kgdb_get_idle, +#endif +}; Index: linux-2.6.10-rc1-mm5/kernel/sched.c =================================================================== --- linux-2.6.10-rc1-mm5.orig/kernel/sched.c 2004-11-11 21:42:18.000000000 +1100 +++ linux-2.6.10-rc1-mm5/kernel/sched.c 2004-11-11 22:08:34.000000000 +1100 @@ -1,7 +1,7 @@ /* * kernel/sched.c * - * Kernel scheduler and related syscalls + * This is "ingosched"; the default cpu scheduler. * * Copyright (C) 1991-2002 Linus Torvalds * @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -58,6 +59,15 @@ #endif /* + * Priority of a process goes from 0..MAX_PRIO-1, valid RT + * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL tasks are + * in the range MAX_RT_PRIO..MAX_PRIO-1. Priority values + * are inverted: lower p->prio value means higher priority. + */ + +#define MAX_PRIO (MAX_RT_PRIO + 40) + +/* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], * and back. @@ -130,7 +140,7 @@ */ #define CURRENT_BONUS(p) \ - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ + (NS_TO_JIFFIES((p)->u.ingosched.sleep_avg) * MAX_BONUS / \ MAX_SLEEP_AVG) #define GRANULARITY (10 * HZ / 1000 ? : 1) @@ -151,14 +161,14 @@ (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) #define TASK_INTERACTIVE(p) \ - ((p)->prio <= (p)->static_prio - DELTA(p)) + ((p)->u.ingosched.prio <= (p)->static_prio - DELTA(p)) #define INTERACTIVE_SLEEP(p) \ (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) #define TASK_PREEMPTS_CURR(p, rq) \ - ((p)->prio < (rq)->curr->prio) + ((p)->u.ingosched.prio < (rq)->curr->u.ingosched.prio) /* * task_timeslice() scales user-nice values [ -20 ... 0 ... 
19 ] @@ -179,7 +189,7 @@ static unsigned int task_timeslice(task_ else return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); } -#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ +#define task_hot(p, now, sd) ((long long) ((now) - (p)->u.ingosched.last_ran) \ < (long long) (sd)->cache_hot_time) /* @@ -235,45 +245,7 @@ struct runqueue { #endif #ifdef CONFIG_SCHEDSTATS - /* latency stats */ - struct sched_info rq_sched_info; - - /* sys_sched_yield() stats */ - unsigned long yld_exp_empty; - unsigned long yld_act_empty; - unsigned long yld_both_empty; - unsigned long yld_cnt; - - /* schedule() stats */ - unsigned long sched_noswitch; - unsigned long sched_switch; - unsigned long sched_cnt; - unsigned long sched_goidle; - - /* pull_task() stats */ - unsigned long pt_gained[MAX_IDLE_TYPES]; - unsigned long pt_lost[MAX_IDLE_TYPES]; - - /* active_load_balance() stats */ - unsigned long alb_cnt; - unsigned long alb_lost; - unsigned long alb_gained; - unsigned long alb_failed; - - /* try_to_wake_up() stats */ - unsigned long ttwu_cnt; - unsigned long ttwu_attempts; - unsigned long ttwu_moved; - - /* wake_up_new_task() stats */ - unsigned long wunt_cnt; - unsigned long wunt_moved; - - /* sched_migrate_task() stats */ - unsigned long smt_cnt; - - /* sched_balance_exec() stats */ - unsigned long sbe_cnt; + schedstat_pcd_t *sspcd; #endif }; @@ -287,6 +259,11 @@ static DEFINE_PER_CPU(struct runqueue, r #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +static int ingo_rt_task(const task_t *p) +{ + return (unlikely((p)->u.ingosched.prio < MAX_RT_PRIO)); +} + /* * Default context-switch locking: */ @@ -323,105 +300,6 @@ static inline void task_rq_unlock(runque spin_unlock_irqrestore(&rq->lock, *flags); } -#ifdef CONFIG_SCHEDSTATS -/* - * bump this up when changing the output format or the meaning of an existing - * format, so that tools can adapt (or abort) - */ -#define SCHEDSTAT_VERSION 10 - -static int show_schedstat(struct seq_file 
*seq, void *v) -{ - int cpu; - enum idle_type itype; - - seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); - seq_printf(seq, "timestamp %lu\n", jiffies); - for_each_online_cpu(cpu) { - runqueue_t *rq = cpu_rq(cpu); -#ifdef CONFIG_SMP - struct sched_domain *sd; - int dcnt = 0; -#endif - - /* runqueue-specific stats */ - seq_printf(seq, - "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu " - "%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", - cpu, rq->yld_both_empty, - rq->yld_act_empty, rq->yld_exp_empty, - rq->yld_cnt, rq->sched_noswitch, - rq->sched_switch, rq->sched_cnt, rq->sched_goidle, - rq->alb_cnt, rq->alb_gained, rq->alb_lost, - rq->alb_failed, - rq->ttwu_cnt, rq->ttwu_moved, rq->ttwu_attempts, - rq->wunt_cnt, rq->wunt_moved, - rq->smt_cnt, rq->sbe_cnt, rq->rq_sched_info.cpu_time, - rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); - - for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; itype++) - seq_printf(seq, " %lu %lu", rq->pt_gained[itype], - rq->pt_lost[itype]); - seq_printf(seq, "\n"); - -#ifdef CONFIG_SMP - /* domain-specific stats */ - for_each_domain(cpu, sd) { - char mask_str[NR_CPUS]; - - cpumask_scnprintf(mask_str, NR_CPUS, sd->span); - seq_printf(seq, "domain%d %s", dcnt++, mask_str); - for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; - itype++) { - seq_printf(seq, " %lu %lu %lu %lu %lu", - sd->lb_cnt[itype], - sd->lb_failed[itype], - sd->lb_imbalance[itype], - sd->lb_nobusyq[itype], - sd->lb_nobusyg[itype]); - } - seq_printf(seq, " %lu %lu %lu %lu\n", - sd->sbe_pushed, sd->sbe_attempts, - sd->ttwu_wake_affine, sd->ttwu_wake_balance); - } -#endif - } - return 0; -} - -static int schedstat_open(struct inode *inode, struct file *file) -{ - unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); - char *buf = kmalloc(size, GFP_KERNEL); - struct seq_file *m; - int res; - - if (!buf) - return -ENOMEM; - res = single_open(file, show_schedstat, NULL); - if (!res) { - m = file->private_data; - m->buf = buf; - m->size = size; - } else - 
kfree(buf); - return res; -} - -struct file_operations proc_schedstat_operations = { - .open = schedstat_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -# define schedstat_inc(rq, field) rq->field++; -# define schedstat_add(rq, field, amt) rq->field += amt; -#else /* !CONFIG_SCHEDSTATS */ -# define schedstat_inc(rq, field) do { } while (0); -# define schedstat_add(rq, field, amt) do { } while (0); -#endif - /* * rq_lock - lock a given runqueue and disable interrupts. */ @@ -459,130 +337,24 @@ static int cpu_and_siblings_are_idle(int #define cpu_and_siblings_are_idle(A) idle_cpu(A) #endif -#ifdef CONFIG_SCHEDSTATS -/* - * Called when a process is dequeued from the active array and given - * the cpu. We should note that with the exception of interactive - * tasks, the expired queue will become the active queue after the active - * queue is empty, without explicitly dequeuing and requeuing tasks in the - * expired queue. (Interactive tasks may be requeued directly to the - * active queue, thus delaying tasks in the expired queue from running; - * see scheduler_tick()). - * - * This function is only called from sched_info_arrive(), rather than - * dequeue_task(). Even though a task may be queued and dequeued multiple - * times as it is shuffled about, we're really interested in knowing how - * long it was from the *first* time it was queued to the time that it - * finally hit a cpu. - */ -static inline void sched_info_dequeued(task_t *t) -{ - t->sched_info.last_queued = 0; -} - -/* - * Called when a task finally hits the cpu. We can now calculate how - * long it was waiting to run. We also note when it began so that we - * can keep stats on how long its timeslice is. 
- */ -static inline void sched_info_arrive(task_t *t) -{ - unsigned long now = jiffies, diff = 0; - struct runqueue *rq = task_rq(t); - - if (t->sched_info.last_queued) - diff = now - t->sched_info.last_queued; - sched_info_dequeued(t); - t->sched_info.run_delay += diff; - t->sched_info.last_arrival = now; - t->sched_info.pcnt++; - - if (!rq) - return; - - rq->rq_sched_info.run_delay += diff; - rq->rq_sched_info.pcnt++; -} - -/* - * Called when a process is queued into either the active or expired - * array. The time is noted and later used to determine how long we - * had to wait for us to reach the cpu. Since the expired queue will - * become the active queue after active queue is empty, without dequeuing - * and requeuing any tasks, we are interested in queuing to either. It - * is unusual but not impossible for tasks to be dequeued and immediately - * requeued in the same or another array: this can happen in sched_yield(), - * set_user_nice(), and even load_balance() as it moves tasks from runqueue - * to runqueue. - * - * This function is only called from enqueue_task(), but also only updates - * the timestamp if it is already not set. It's assumed that - * sched_info_dequeued() will clear that stamp when appropriate. - */ -static inline void sched_info_queued(task_t *t) -{ - if (!t->sched_info.last_queued) - t->sched_info.last_queued = jiffies; -} - -/* - * Called when a process ceases being the active-running process, either - * voluntarily or involuntarily. Now we can calculate how long we ran. - */ -static inline void sched_info_depart(task_t *t) -{ - struct runqueue *rq = task_rq(t); - unsigned long diff = jiffies - t->sched_info.last_arrival; - - t->sched_info.cpu_time += diff; - - if (rq) - rq->rq_sched_info.cpu_time += diff; -} - -/* - * Called when tasks are switched involuntarily due, typically, to expiring - * their time slice. (This may also be called when switching to or from - * the idle task.) We are only called when prev != next. 
- */ -static inline void sched_info_switch(task_t *prev, task_t *next) -{ - struct runqueue *rq = task_rq(prev); - - /* - * prev now departs the cpu. It's not interesting to record - * stats about how efficient we were at scheduling the idle - * process, however. - */ - if (prev != rq->idle) - sched_info_depart(prev); - - if (next != rq->idle) - sched_info_arrive(next); -} -#else -#define sched_info_queued(t) do { } while (0) -#define sched_info_switch(t, next) do { } while (0) -#endif /* CONFIG_SCHEDSTATS */ - /* * Adding/removing a task to/from a priority array: */ static void dequeue_task(struct task_struct *p, prio_array_t *array) { array->nr_active--; - list_del(&p->run_list); - if (list_empty(array->queue + p->prio)) - __clear_bit(p->prio, array->bitmap); + list_del(&p->u.ingosched.run_list); + if (list_empty(array->queue + p->u.ingosched.prio)) + __clear_bit(p->u.ingosched.prio, array->bitmap); } static void enqueue_task(struct task_struct *p, prio_array_t *array) { sched_info_queued(p); - list_add_tail(&p->run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); + list_add_tail(&p->u.ingosched.run_list, array->queue + p->u.ingosched.prio); + __set_bit(p->u.ingosched.prio, array->bitmap); array->nr_active++; - p->array = array; + p->u.ingosched.array = array; } /* @@ -591,7 +363,7 @@ static void enqueue_task(struct task_str */ static void requeue_task(struct task_struct *p, prio_array_t *array) { - list_move_tail(&p->run_list, array->queue + p->prio); + list_move_tail(&p->u.ingosched.run_list, array->queue + p->u.ingosched.prio); } /* @@ -601,10 +373,15 @@ static void requeue_task(struct task_str */ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) { - list_add(&p->run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); + list_add(&p->u.ingosched.run_list, array->queue + p->u.ingosched.prio); + __set_bit(p->u.ingosched.prio, array->bitmap); array->nr_active++; - p->array = array; + 
p->u.ingosched.array = array; +} + +static void ingo_set_oom_timeslice(task_t *p) +{ + p->u.ingosched.time_slice = HZ; } /* @@ -626,7 +403,7 @@ static int effective_prio(task_t *p) int bonus, prio; if (rt_task(p)) - return p->prio; + return p->u.ingosched.prio; bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; @@ -658,7 +435,7 @@ static inline void __activate_idle_task( static void recalc_task_prio(task_t *p, unsigned long long now) { - unsigned long long __sleep_time = now - p->timestamp; + unsigned long long __sleep_time = now - p->u.ingosched.timestamp; unsigned long sleep_time; if (__sleep_time > NS_MAX_SLEEP_AVG) @@ -673,9 +450,9 @@ static void recalc_task_prio(task_t *p, * prevent them suddenly becoming cpu hogs and starving * other processes. */ - if (p->mm && p->activated != -1 && + if (p->mm && p->u.ingosched.activated != -1 && sleep_time > INTERACTIVE_SLEEP(p)) { - p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - + p->u.ingosched.sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - DEF_TIMESLICE); } else { /* @@ -689,12 +466,12 @@ static void recalc_task_prio(task_t *p, * limited in their sleep_avg rise as they * are likely to be waiting on I/O */ - if (p->activated == -1 && p->mm) { - if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) + if (p->u.ingosched.activated == -1 && p->mm) { + if (p->u.ingosched.sleep_avg >= INTERACTIVE_SLEEP(p)) sleep_time = 0; - else if (p->sleep_avg + sleep_time >= + else if (p->u.ingosched.sleep_avg + sleep_time >= INTERACTIVE_SLEEP(p)) { - p->sleep_avg = INTERACTIVE_SLEEP(p); + p->u.ingosched.sleep_avg = INTERACTIVE_SLEEP(p); sleep_time = 0; } } @@ -707,14 +484,14 @@ static void recalc_task_prio(task_t *p, * task spends sleeping, the higher the average gets - * and the higher the priority boost gets as well. 
*/ - p->sleep_avg += sleep_time; + p->u.ingosched.sleep_avg += sleep_time; - if (p->sleep_avg > NS_MAX_SLEEP_AVG) - p->sleep_avg = NS_MAX_SLEEP_AVG; + if (p->u.ingosched.sleep_avg > NS_MAX_SLEEP_AVG) + p->u.ingosched.sleep_avg = NS_MAX_SLEEP_AVG; } } - p->prio = effective_prio(p); + p->u.ingosched.prio = effective_prio(p); } /* @@ -743,7 +520,7 @@ static void activate_task(task_t *p, run * This checks to make sure it's not an uninterruptible task * that is now waking up. */ - if (!p->activated) { + if (!p->u.ingosched.activated) { /* * Tasks which were woken up by interrupts (ie. hw events) * are most likely of interactive nature. So we give them @@ -752,16 +529,16 @@ static void activate_task(task_t *p, run * on a CPU, first time around: */ if (in_interrupt()) - p->activated = 2; + p->u.ingosched.activated = 2; else { /* * Normal first-time wakeups get a credit too for * on-runqueue time, but it will be weighted down: */ - p->activated = 1; + p->u.ingosched.activated = 1; } } - p->timestamp = now; + p->u.ingosched.timestamp = now; __activate_task(p, rq); } @@ -774,8 +551,8 @@ static void deactivate_task(struct task_ rq->nr_running--; if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; - dequeue_task(p, p->array); - p->array = NULL; + dequeue_task(p, p->u.ingosched.array); + p->u.ingosched.array = NULL; } /* @@ -811,7 +588,7 @@ static inline void resched_task(task_t * * task_curr - is this task currently executing on a CPU? * @p: the task in question. */ -inline int task_curr(const task_t *p) +static int ingo_task_curr(const task_t *p) { return cpu_curr(task_cpu(p)) == p; } @@ -848,7 +625,7 @@ static int migrate_task(task_t *p, int d * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. 
*/ - if (!p->array && !task_running(rq, p)) { + if (!p->u.ingosched.array && !task_running(rq, p)) { set_task_cpu(p, dest_cpu); return 0; } @@ -870,7 +647,7 @@ static int migrate_task(task_t *p, int d * smp_call_function() if an IPI is sent by the same process we are * waiting to become inactive. */ -void wait_task_inactive(task_t * p) +static void ingo_wait_task_inactive(task_t * p) { unsigned long flags; runqueue_t *rq; @@ -879,7 +656,7 @@ void wait_task_inactive(task_t * p) repeat: rq = task_rq_lock(p, &flags); /* Must be off runqueue entirely, not preempted. */ - if (unlikely(p->array)) { + if (unlikely(p->u.ingosched.array)) { /* If it's preempted, we yield. It could be a while. */ preempted = !task_running(rq, p); task_rq_unlock(rq, &flags); @@ -891,24 +668,6 @@ repeat: task_rq_unlock(rq, &flags); } -/*** - * kick_process - kick a running thread to enter/exit the kernel - * @p: the to-be-kicked thread - * - * Cause a process which is running on another CPU to enter - * kernel-mode, without any delay. (to get signals handled.) - */ -void kick_process(task_t *p) -{ - int cpu; - - preempt_disable(); - cpu = task_cpu(p); - if ((cpu != smp_processor_id()) && task_curr(p)) - smp_send_reschedule(cpu); - preempt_enable(); -} - /* * Return a low guess at the load of a migration-source cpu. * @@ -988,7 +747,7 @@ static inline int wake_idle(int cpu, tas * * returns failure only if the task is already active. 
*/ -static int try_to_wake_up(task_t * p, unsigned int state, int sync) +static int ingo_try_to_wake_up(task_t * p, unsigned int state, int sync) { int cpu, this_cpu, success = 0; unsigned long flags; @@ -1001,12 +760,12 @@ static int try_to_wake_up(task_t * p, un #endif old_rq = rq = task_rq_lock(p, &flags); - schedstat_inc(rq, ttwu_cnt); + schedstat_inc(rq->sspcd, ttwu_cnt); old_state = p->state; if (!(old_state & state)) goto out; - if (p->array) + if (p->u.ingosched.array) goto out_running; cpu = task_cpu(p); @@ -1074,10 +833,10 @@ static int try_to_wake_up(task_t * p, un new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ out_set_cpu: - schedstat_inc(rq, ttwu_attempts); + schedstat_inc(rq->sspcd, ttwu_attempts); new_cpu = wake_idle(new_cpu, p); if (new_cpu != cpu) { - schedstat_inc(rq, ttwu_moved); + schedstat_inc(rq->sspcd, ttwu_moved); set_task_cpu(p, new_cpu); task_rq_unlock(rq, &flags); /* might preempt at this point */ @@ -1085,7 +844,7 @@ out_set_cpu: old_state = p->state; if (!(old_state & state)) goto out; - if (p->array) + if (p->u.ingosched.array) goto out_running; this_cpu = smp_processor_id(); @@ -1100,7 +859,7 @@ out_activate: * Tasks on involuntary sleep don't earn * sleep_avg beyond just interactive state. */ - p->activated = -1; + p->u.ingosched.activated = -1; } /* @@ -1126,19 +885,6 @@ out: return success; } -int fastcall wake_up_process(task_t * p) -{ - return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | - TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); -} - -EXPORT_SYMBOL(wake_up_process); - -int fastcall wake_up_state(task_t *p, unsigned int state) -{ - return try_to_wake_up(p, state, 0); -} - #ifdef CONFIG_SMP static int find_idlest_cpu(struct task_struct *p, int this_cpu, struct sched_domain *sd); @@ -1148,7 +894,7 @@ static int find_idlest_cpu(struct task_s * Perform scheduler related setup for a newly forked process p. * p is forked by current. 
*/ -void fastcall sched_fork(task_t *p) +static void ingo_sched_fork(task_t *p) { /* * We mark the process as running here, but have not actually @@ -1157,8 +903,8 @@ void fastcall sched_fork(task_t *p) * event cannot wake it up and insert it on the runqueue either. */ p->state = TASK_RUNNING; - INIT_LIST_HEAD(&p->run_list); - p->array = NULL; + INIT_LIST_HEAD(&p->u.ingosched.run_list); + p->u.ingosched.array = NULL; spin_lock_init(&p->switch_lock); #ifdef CONFIG_SCHEDSTATS memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -1178,21 +924,21 @@ void fastcall sched_fork(task_t *p) * resulting in more scheduling fairness. */ local_irq_disable(); - p->time_slice = (current->time_slice + 1) >> 1; + p->u.ingosched.time_slice = (current->u.ingosched.time_slice + 1) >> 1; /* * The remainder of the first timeslice might be recovered by * the parent if the child exits early enough. */ - p->first_time_slice = 1; - current->time_slice >>= 1; - p->timestamp = sched_clock(); - if (unlikely(!current->time_slice)) { + p->u.ingosched.first_time_slice = 1; + current->u.ingosched.time_slice >>= 1; + p->u.ingosched.timestamp = sched_clock(); + if (unlikely(!current->u.ingosched.time_slice)) { /* * This case is rare, it happens when the parent has only * a single jiffy left from its timeslice. Taking the * runqueue lock is not a problem. */ - current->time_slice = 1; + current->u.ingosched.time_slice = 1; preempt_disable(); scheduler_tick(); local_irq_enable(); @@ -1208,7 +954,7 @@ void fastcall sched_fork(task_t *p) * that must be done for every newly created context, then puts the task * on the runqueue and wakes it. 
*/ -void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) +static void ingo_wake_up_new_task(task_t * p, unsigned long clone_flags) { unsigned long flags; int this_cpu, cpu; @@ -1220,17 +966,17 @@ void fastcall wake_up_new_task(task_t * BUG_ON(p->state != TASK_RUNNING); - schedstat_inc(rq, wunt_cnt); + schedstat_inc(rq->sspcd, wunt_cnt); /* * We decrease the sleep average of forking parents * and children as well, to keep max-interactive tasks * from forking tasks that are max-interactive. The parent * (current) is done further down, under its lock. */ - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * + p->u.ingosched.sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - p->prio = effective_prio(p); + p->u.ingosched.prio = effective_prio(p); if (likely(cpu == this_cpu)) { if (!(clone_flags & CLONE_VM)) { @@ -1239,13 +985,13 @@ void fastcall wake_up_new_task(task_t * * do child-runs-first in anticipation of an exec. This * usually avoids a lot of COW overhead. */ - if (unlikely(!current->array)) + if (unlikely(!current->u.ingosched.array)) __activate_task(p, rq); else { - p->prio = current->prio; - list_add_tail(&p->run_list, &current->run_list); - p->array = current->array; - p->array->nr_active++; + p->u.ingosched.prio = current->u.ingosched.prio; + list_add_tail(&p->u.ingosched.run_list, &current->u.ingosched.run_list); + p->u.ingosched.array = current->u.ingosched.array; + p->u.ingosched.array->nr_active++; rq->nr_running++; } set_need_resched(); @@ -1266,21 +1012,21 @@ void fastcall wake_up_new_task(task_t * * Not the local CPU - must adjust timestamp. This should * get optimised away in the !CONFIG_SMP case. 
*/ - p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) + p->u.ingosched.timestamp = (p->u.ingosched.timestamp - this_rq->timestamp_last_tick) + rq->timestamp_last_tick; __activate_task(p, rq); if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); - schedstat_inc(rq, wunt_moved); + schedstat_inc(rq->sspcd, wunt_moved); /* * Parent and child are on different CPUs, now get the - * parent runqueue to update the parent's ->sleep_avg: + * parent runqueue to update the parent's ->u.ingosched.sleep_avg: */ task_rq_unlock(rq, &flags); this_rq = task_rq_lock(current, &flags); } - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * + current->u.ingosched.sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); task_rq_unlock(this_rq, &flags); } @@ -1294,7 +1040,7 @@ void fastcall wake_up_new_task(task_t * * artificially, because any timeslice recovered here * was given away by the parent in the first place.) */ -void fastcall sched_exit(task_t * p) +static void ingo_sched_exit(task_t * p) { unsigned long flags; runqueue_t *rq; @@ -1304,14 +1050,14 @@ void fastcall sched_exit(task_t * p) * the sleep_avg of the parent as well. 
*/ rq = task_rq_lock(p->parent, &flags); - if (p->first_time_slice) { - p->parent->time_slice += p->time_slice; - if (unlikely(p->parent->time_slice > task_timeslice(p))) - p->parent->time_slice = task_timeslice(p); - } - if (p->sleep_avg < p->parent->sleep_avg) - p->parent->sleep_avg = p->parent->sleep_avg / - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / + if (p->u.ingosched.first_time_slice) { + p->parent->u.ingosched.time_slice += p->u.ingosched.time_slice; + if (unlikely(p->parent->u.ingosched.time_slice > task_timeslice(p))) + p->parent->u.ingosched.time_slice = task_timeslice(p); + } + if (p->u.ingosched.sleep_avg < p->parent->u.ingosched.sleep_avg) + p->parent->u.ingosched.sleep_avg = p->parent->u.ingosched.sleep_avg / + (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->u.ingosched.sleep_avg / (EXIT_WEIGHT + 1); task_rq_unlock(rq, &flags); } @@ -1361,7 +1107,7 @@ static void finish_task_switch(task_t *p * schedule_tail - first thing a freshly forked thread must call. * @prev: the thread we just switched away from. */ -asmlinkage void schedule_tail(task_t *prev) +static void ingo_schedule_tail(task_t *prev) __releases(rq->lock) { finish_task_switch(prev); @@ -1406,7 +1152,7 @@ task_t * context_switch(runqueue_t *rq, * threads, current number of uninterruptible-sleeping threads, total * number of context switches performed since bootup. 
*/ -unsigned long nr_running(void) +static unsigned long ingo_nr_running(void) { unsigned long i, sum = 0; @@ -1416,7 +1162,7 @@ unsigned long nr_running(void) return sum; } -unsigned long nr_uninterruptible(void) +static unsigned long ingo_nr_uninterruptible(void) { unsigned long i, sum = 0; @@ -1426,7 +1172,7 @@ unsigned long nr_uninterruptible(void) return sum; } -unsigned long long nr_context_switches(void) +static unsigned long long ingo_nr_context_switches(void) { unsigned long long i, sum = 0; @@ -1436,7 +1182,7 @@ unsigned long long nr_context_switches(v return sum; } -unsigned long nr_iowait(void) +static unsigned long ingo_nr_iowait(void) { unsigned long i, sum = 0; @@ -1446,6 +1192,11 @@ unsigned long nr_iowait(void) return sum; } +static unsigned long ingo_nr_iowait_task_cpu(const task_t *p) +{ + return atomic_read(&task_rq(p)->nr_iowait); +} + #ifdef CONFIG_SMP /* @@ -1569,7 +1320,7 @@ static void sched_migrate_task(task_t *p || unlikely(cpu_is_offline(dest_cpu))) goto out; - schedstat_inc(rq, smt_cnt); + schedstat_inc(rq->sspcd, smt_cnt); /* force the process onto the specified CPU */ if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread (might exit: take ref). */ @@ -1592,12 +1343,12 @@ out: * execve() is a valuable balancing opportunity, because at this point * the task has the smallest effective memory and cache footprint. 
*/ -void sched_exec(void) +static void ingo_sched_exec(void) { struct sched_domain *tmp, *sd = NULL; int new_cpu, this_cpu = get_cpu(); - schedstat_inc(this_rq(), sbe_cnt); + schedstat_inc(this_rq()->sspcd, sbe_cnt); /* Prefer the current CPU if there's only this task running */ if (this_rq()->nr_running <= 1) goto out; @@ -1633,7 +1384,7 @@ void pull_task(runqueue_t *src_rq, prio_ set_task_cpu(p, this_cpu); this_rq->nr_running++; enqueue_task(p, this_array); - p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + p->u.ingosched.timestamp = (p->u.ingosched.timestamp - src_rq->timestamp_last_tick) + this_rq->timestamp_last_tick; /* * Note that idle threads have a prio of MAX_PRIO, for this test @@ -1729,7 +1480,7 @@ skip_bitmap: head = array->queue + idx; curr = head->prev; skip_queue: - tmp = list_entry(curr, task_t, run_list); + tmp = list_entry(curr, task_t, u.ingosched.run_list); curr = curr->prev; @@ -1745,8 +1496,8 @@ skip_queue: * so we can safely collect pull_task() stats here rather than * inside pull_task(). */ - schedstat_inc(this_rq, pt_gained[idle]); - schedstat_inc(busiest, pt_lost[idle]); + schedstat_inc(this_rq->sspcd, pt_gained[idle]); + schedstat_inc(busiest->sspcd, pt_lost[idle]); pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); pulled++; @@ -2103,7 +1854,7 @@ static void active_load_balance(runqueue cpumask_t visited_cpus; int cpu; - schedstat_inc(busiest_rq, alb_cnt); + schedstat_inc(busiest_rq->sspcd, alb_cnt); /* * Search for suitable CPUs to push tasks to in successively higher * domains with SD_LOAD_BALANCE set. 
@@ -2138,10 +1889,10 @@ static void active_load_balance(runqueue double_lock_balance(busiest_rq, target_rq); if (move_tasks(target_rq, cpu, busiest_rq, 1, sd, SCHED_IDLE)) { - schedstat_inc(busiest_rq, alb_lost); - schedstat_inc(target_rq, alb_gained); + schedstat_inc(busiest_rq->sspcd, alb_lost); + schedstat_inc(target_rq->sspcd, alb_gained); } else { - schedstat_inc(busiest_rq, alb_failed); + schedstat_inc(busiest_rq->sspcd, alb_failed); } spin_unlock(&target_rq->lock); } @@ -2235,10 +1986,6 @@ static inline int wake_priority_sleeper( return ret; } -DEFINE_PER_CPU(struct kernel_stat, kstat); - -EXPORT_PER_CPU_SYMBOL(kstat); - /* * We place interactive tasks back into the active array, if possible. * @@ -2256,155 +2003,13 @@ EXPORT_PER_CPU_SYMBOL(kstat); ((rq)->curr->static_prio > (rq)->best_expired_prio)) /* - * Do the virtual cpu time signal calculations. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in user space since the last update - */ -static inline void account_it_virt(struct task_struct * p, cputime_t cputime) -{ - cputime_t it_virt = p->it_virt_value; - - if (cputime_gt(it_virt, cputime_zero) && - cputime_gt(cputime, cputime_zero)) { - if (cputime_ge(cputime, it_virt)) { - it_virt = cputime_add(it_virt, p->it_virt_incr); - send_sig(SIGVTALRM, p, 1); - } - it_virt = cputime_sub(it_virt, cputime); - p->it_virt_value = it_virt; - } -} - -/* - * Do the virtual profiling signal calculations. 
- * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in user and kernel space since the last update - */ -static void account_it_prof(struct task_struct *p, cputime_t cputime) -{ - cputime_t it_prof = p->it_prof_value; - - if (cputime_gt(it_prof, cputime_zero) && - cputime_gt(cputime, cputime_zero)) { - if (cputime_ge(cputime, it_prof)) { - it_prof = cputime_add(it_prof, p->it_prof_incr); - send_sig(SIGPROF, p, 1); - } - it_prof = cputime_sub(it_prof, cputime); - p->it_prof_value = it_prof; - } -} - -/* - * Check if the process went over its cputime resource limit after - * some cpu time got added to utime/stime. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in user and kernel space since the last update - */ -static void check_rlimit(struct task_struct *p, cputime_t cputime) -{ - cputime_t total, tmp; - - total = cputime_add(p->utime, p->stime); - tmp = jiffies_to_cputime(p->signal->rlim[RLIMIT_CPU].rlim_cur); - if (unlikely(cputime_gt(total, tmp))) { - /* Send SIGXCPU every second. */ - tmp = cputime_sub(total, cputime); - if (cputime_to_secs(tmp) < cputime_to_secs(total)) - send_sig(SIGXCPU, p, 1); - /* and SIGKILL when we go over max.. */ - tmp = jiffies_to_cputime(p->signal->rlim[RLIMIT_CPU].rlim_max); - if (cputime_gt(total, tmp)) - send_sig(SIGKILL, p, 1); - } -} - -/* - * Account user cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @hardirq_offset: the offset to subtract from hardirq_count() - * @cputime: the cpu time spent in user space since the last update - */ -void account_user_time(struct task_struct *p, cputime_t cputime) -{ - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t tmp; - - p->utime = cputime_add(p->utime, cputime); - - /* Check for signals (SIGVTALRM, SIGPROF, SIGXCPU & SIGKILL). 
*/ - if (likely(p->signal)) - check_rlimit(p, cputime); - account_it_virt(p, cputime); - account_it_prof(p, cputime); - - /* Add user time to cpustat. */ - tmp = cputime_to_cputime64(cputime); - if (TASK_NICE(p) > 0) - cpustat->nice = cputime64_add(cpustat->nice, tmp); - else - cpustat->user = cputime64_add(cpustat->user, tmp); -} - -/* - * Account system cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @hardirq_offset: the offset to subtract from hardirq_count() - * @cputime: the cpu time spent in kernel space since the last update - */ -void account_system_time(struct task_struct *p, int hardirq_offset, - cputime_t cputime) -{ - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - runqueue_t *rq = this_rq(); - cputime64_t tmp; - - p->stime = cputime_add(p->stime, cputime); - - /* Check for signals (SIGPROF, SIGXCPU & SIGKILL). */ - if (likely(p->signal)) - check_rlimit(p, cputime); - account_it_prof(p, cputime); - - /* Add system time to cpustat. */ - tmp = cputime_to_cputime64(cputime); - if (hardirq_count() - hardirq_offset) - cpustat->irq = cputime64_add(cpustat->irq, tmp); - else if (softirq_count()) - cpustat->softirq = cputime64_add(cpustat->softirq, tmp); - else if (p != rq->idle) - cpustat->system = cputime64_add(cpustat->system, tmp); - else if (atomic_read(&rq->nr_iowait) > 0) - cpustat->iowait = cputime64_add(cpustat->iowait, tmp); - else - cpustat->idle = cputime64_add(cpustat->idle, tmp); -} - -/* - * Account for involuntary wait time. 
- * @p: the process from which the cpu time has been stolen - * @steal: the cpu time spent in involuntary wait - */ -void account_steal_time(struct task_struct *p, cputime_t steal) -{ - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t steal64 = cputime_to_cputime64(steal); - runqueue_t *rq = this_rq(); - - if (p == rq->idle) - cpustat->system = cputime64_add(cpustat->system, steal64); - else - cpustat->steal = cputime64_add(cpustat->steal, steal64); -} - -/* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. * * It also gets called by the fork code, when changing the parent's * timeslices. */ -void scheduler_tick(void) +static void ingo_scheduler_tick(void) { int cpu = smp_processor_id(); runqueue_t *rq = this_rq(); @@ -2420,7 +2025,7 @@ void scheduler_tick(void) } /* Task might have expired already, but not scheduled off yet */ - if (p->array != rq->active) { + if (p->u.ingosched.array != rq->active) { set_tsk_need_resched(p); goto out; } @@ -2437,9 +2042,9 @@ void scheduler_tick(void) * RR tasks need a special form of timeslice management. * FIFO tasks have no timeslices. 
*/ - if ((p->policy == SCHED_RR) && !--p->time_slice) { - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; + if ((p->policy == SCHED_RR) && !--p->u.ingosched.time_slice) { + p->u.ingosched.time_slice = task_timeslice(p); + p->u.ingosched.first_time_slice = 0; set_tsk_need_resched(p); /* put it at the end of the queue: */ @@ -2447,12 +2052,12 @@ void scheduler_tick(void) } goto out_unlock; } - if (!--p->time_slice) { + if (!--p->u.ingosched.time_slice) { dequeue_task(p, rq->active); set_tsk_need_resched(p); - p->prio = effective_prio(p); - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; + p->u.ingosched.prio = effective_prio(p); + p->u.ingosched.time_slice = task_timeslice(p); + p->u.ingosched.first_time_slice = 0; if (!rq->expired_timestamp) rq->expired_timestamp = jiffies; @@ -2480,9 +2085,9 @@ void scheduler_tick(void) * delta range with at least TIMESLICE_GRANULARITY to requeue. */ if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - - p->time_slice) % TIMESLICE_GRANULARITY(p)) && - (p->time_slice >= TIMESLICE_GRANULARITY(p)) && - (p->array == rq->active)) { + p->u.ingosched.time_slice) % TIMESLICE_GRANULARITY(p)) && + (p->u.ingosched.time_slice >= TIMESLICE_GRANULARITY(p)) && + (p->u.ingosched.array == rq->active)) { requeue_task(p, rq->active); set_tsk_need_resched(p); @@ -2573,7 +2178,7 @@ static inline int dependent_sleeper(int BUG_ON(!array->nr_active); p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, - task_t, run_list); + task_t, u.ingosched.run_list); for_each_cpu_mask(i, sibling_map) { runqueue_t *smt_rq = cpu_rq(i); @@ -2587,7 +2192,7 @@ static inline int dependent_sleeper(int * task from using an unfair proportion of the * physical cpu's resources. 
-ck */ - if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > + if (((smt_curr->u.ingosched.time_slice * (100 - sd->per_cpu_gain) / 100) > task_timeslice(p) || rt_task(smt_curr)) && p->mm && smt_curr->mm && !rt_task(p)) ret = 1; @@ -2597,7 +2202,7 @@ static inline int dependent_sleeper(int * or wake it up if it has been put to sleep for priority * reasons. */ - if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > + if ((((p->u.ingosched.time_slice * (100 - sd->per_cpu_gain) / 100) > task_timeslice(smt_curr) || rt_task(p)) && smt_curr->mm && p->mm && !rt_task(smt_curr)) || (smt_curr == smt_rq->idle && smt_rq->nr_running)) @@ -2619,42 +2224,10 @@ static inline int dependent_sleeper(int } #endif -#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) - -void fastcall add_preempt_count(int val) -{ - /* - * Underflow? - */ - BUG_ON(((int)preempt_count() < 0)); - preempt_count() += val; - /* - * Spinlock count overflowing soon? - */ - BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); -} -EXPORT_SYMBOL(add_preempt_count); - -void fastcall sub_preempt_count(int val) -{ - /* - * Underflow? - */ - BUG_ON(val > preempt_count()); - /* - * Is the spinlock portion underflowing? - */ - BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); - preempt_count() -= val; -} -EXPORT_SYMBOL(sub_preempt_count); - -#endif - /* * schedule() is the main scheduler function. 
*/ -asmlinkage void __sched schedule(void) +static void __sched ingo_schedule(void) { long *switch_count; task_t *prev, *next; @@ -2696,10 +2269,10 @@ need_resched_nonpreemptible: dump_stack(); } - schedstat_inc(rq, sched_cnt); + schedstat_inc(rq->sspcd, sched_cnt); now = sched_clock(); - if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG)) - run_time = now - prev->timestamp; + if (likely(now - prev->u.ingosched.timestamp < NS_MAX_SLEEP_AVG)) + run_time = now - prev->u.ingosched.timestamp; else run_time = NS_MAX_SLEEP_AVG; @@ -2762,46 +2335,46 @@ go_idle: /* * Switch the active and expired arrays. */ - schedstat_inc(rq, sched_switch); + schedstat_inc(rq->sspcd, sched_switch); rq->active = rq->expired; rq->expired = array; array = rq->active; rq->expired_timestamp = 0; rq->best_expired_prio = MAX_PRIO; } else - schedstat_inc(rq, sched_noswitch); + schedstat_inc(rq->sspcd, sched_noswitch); idx = sched_find_first_bit(array->bitmap); queue = array->queue + idx; - next = list_entry(queue->next, task_t, run_list); + next = list_entry(queue->next, task_t, u.ingosched.run_list); - if (!rt_task(next) && next->activated > 0) { - unsigned long long delta = now - next->timestamp; + if (!rt_task(next) && next->u.ingosched.activated > 0) { + unsigned long long delta = now - next->u.ingosched.timestamp; - if (next->activated == 1) + if (next->u.ingosched.activated == 1) delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; - array = next->array; + array = next->u.ingosched.array; dequeue_task(next, array); - recalc_task_prio(next, next->timestamp + delta); + recalc_task_prio(next, next->u.ingosched.timestamp + delta); enqueue_task(next, array); } - next->activated = 0; + next->u.ingosched.activated = 0; switch_tasks: if (next == rq->idle) - schedstat_inc(rq, sched_goidle); + schedstat_inc(rq->sspcd, sched_goidle); prefetch(next); clear_tsk_need_resched(prev); rcu_qsctr_inc(task_cpu(prev)); - prev->sleep_avg -= run_time; - if ((long)prev->sleep_avg <= 0) - prev->sleep_avg = 0; 
- prev->timestamp = prev->last_ran = now; + prev->u.ingosched.sleep_avg -= run_time; + if ((long)prev->u.ingosched.sleep_avg <= 0) + prev->u.ingosched.sleep_avg = 0; + prev->u.ingosched.timestamp = prev->u.ingosched.last_ran = now; sched_info_switch(prev, next); if (likely(prev != next)) { - next->timestamp = now; + next->u.ingosched.timestamp = now; rq->nr_switches++; rq->curr = next; ++*switch_count; @@ -2822,169 +2395,7 @@ switch_tasks: goto need_resched; } -EXPORT_SYMBOL(schedule); - -#ifdef CONFIG_PREEMPT -/* - * this is is the entry point to schedule() from in-kernel preemption - * off of preempt_enable. Kernel preemptions off return from interrupt - * occur there and call schedule directly. - */ -asmlinkage void __sched preempt_schedule(void) -{ - struct thread_info *ti = current_thread_info(); -#ifdef CONFIG_PREEMPT_BKL - struct task_struct *task = current; - int saved_lock_depth; -#endif - /* - * If there is a non-zero preempt_count or interrupts are disabled, - * we do not want to preempt the current task. Just return.. - */ - if (unlikely(ti->preempt_count || irqs_disabled())) - return; - -need_resched: - add_preempt_count(PREEMPT_ACTIVE); - /* - * We keep the big kernel semaphore locked, but we - * clear ->lock_depth so that schedule() doesnt - * auto-release the semaphore: - */ -#ifdef CONFIG_PREEMPT_BKL - saved_lock_depth = task->lock_depth; - task->lock_depth = -1; -#endif - schedule(); -#ifdef CONFIG_PREEMPT_BKL - task->lock_depth = saved_lock_depth; -#endif - sub_preempt_count(PREEMPT_ACTIVE); - - /* we could miss a preemption opportunity between schedule and now */ - barrier(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) - goto need_resched; -} - -EXPORT_SYMBOL(preempt_schedule); -#endif /* CONFIG_PREEMPT */ - -int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) -{ - task_t *p = curr->task; - return try_to_wake_up(p, mode, sync); -} - -EXPORT_SYMBOL(default_wake_function); - -/* - * The core wakeup function. 
Non-exclusive wakeups (nr_exclusive == 0) just - * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve - * number) then we wake all the non-exclusive tasks and one exclusive task. - * - * There are circumstances in which we can try to wake a task which has already - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns - * zero in this (rare) case, and we handle it by continuing to scan the queue. - */ -static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int sync, void *key) -{ - struct list_head *tmp, *next; - - list_for_each_safe(tmp, next, &q->task_list) { - wait_queue_t *curr; - unsigned flags; - curr = list_entry(tmp, wait_queue_t, task_list); - flags = curr->flags; - if (curr->func(curr, mode, sync, key) && - (flags & WQ_FLAG_EXCLUSIVE) && - !--nr_exclusive) - break; - } -} - -/** - * __wake_up - wake up threads blocked on a waitqueue. - * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - */ -void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, void *key) -{ - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, 0, key); - spin_unlock_irqrestore(&q->lock, flags); -} - -EXPORT_SYMBOL(__wake_up); - -/* - * Same as __wake_up but called with the spinlock in wait_queue_head_t held. - */ -void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) -{ - __wake_up_common(q, mode, 1, 0, NULL); -} - -/** - * __wake_up - sync- wake up threads blocked on a waitqueue. - * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - * - * The sync wakeup differs that the waker knows that it will schedule - * away soon, so while the target thread will be woken up, it will not - * be migrated to another CPU - ie. the two threads are 'synchronized' - * with each other. 
This can prevent needless bouncing between CPUs. - * - * On UP it can prevent extra preemption. - */ -void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) -{ - unsigned long flags; - int sync = 1; - - if (unlikely(!q)) - return; - - if (unlikely(!nr_exclusive)) - sync = 0; - - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, sync, NULL); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ - -void fastcall complete(struct completion *x) -{ - unsigned long flags; - - spin_lock_irqsave(&x->wait.lock, flags); - x->done++; - __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, - 1, 0, NULL); - spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete); - -void fastcall complete_all(struct completion *x) -{ - unsigned long flags; - - spin_lock_irqsave(&x->wait.lock, flags); - x->done += UINT_MAX/2; - __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, - 0, 0, NULL); - spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete_all); - -void fastcall __sched wait_for_completion(struct completion *x) +static void __sched ingo_wait_for_completion(struct completion *x) { might_sleep(); spin_lock_irq(&x->wait.lock); @@ -3004,85 +2415,13 @@ void fastcall __sched wait_for_completio x->done--; spin_unlock_irq(&x->wait.lock); } -EXPORT_SYMBOL(wait_for_completion); - -#define SLEEP_ON_VAR \ - unsigned long flags; \ - wait_queue_t wait; \ - init_waitqueue_entry(&wait, current); - -#define SLEEP_ON_HEAD \ - spin_lock_irqsave(&q->lock,flags); \ - __add_wait_queue(q, &wait); \ - spin_unlock(&q->lock); - -#define SLEEP_ON_TAIL \ - spin_lock_irq(&q->lock); \ - __remove_wait_queue(q, &wait); \ - spin_unlock_irqrestore(&q->lock, flags); - -void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) -{ - SLEEP_ON_VAR - - current->state = TASK_INTERRUPTIBLE; - - SLEEP_ON_HEAD - schedule(); - 
SLEEP_ON_TAIL -} - -EXPORT_SYMBOL(interruptible_sleep_on); -long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) +static void ingo_set_user_nice(task_t *p, long nice) { - SLEEP_ON_VAR - - current->state = TASK_INTERRUPTIBLE; - - SLEEP_ON_HEAD - timeout = schedule_timeout(timeout); - SLEEP_ON_TAIL - - return timeout; -} - -EXPORT_SYMBOL(interruptible_sleep_on_timeout); - -void fastcall __sched sleep_on(wait_queue_head_t *q) -{ - SLEEP_ON_VAR - - current->state = TASK_UNINTERRUPTIBLE; - - SLEEP_ON_HEAD - schedule(); - SLEEP_ON_TAIL -} - -EXPORT_SYMBOL(sleep_on); - -long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ - SLEEP_ON_VAR - - current->state = TASK_UNINTERRUPTIBLE; - - SLEEP_ON_HEAD - timeout = schedule_timeout(timeout); - SLEEP_ON_TAIL - - return timeout; -} - -EXPORT_SYMBOL(sleep_on_timeout); - -void set_user_nice(task_t *p, long nice) -{ - unsigned long flags; - prio_array_t *array; - runqueue_t *rq; - int old_prio, new_prio, delta; + unsigned long flags; + prio_array_t *array; + runqueue_t *rq; + int old_prio, new_prio, delta; if (TASK_NICE(p) == nice || nice < -20 || nice > 19) return; @@ -3101,15 +2440,15 @@ void set_user_nice(task_t *p, long nice) p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } - array = p->array; + array = p->u.ingosched.array; if (array) dequeue_task(p, array); - old_prio = p->prio; + old_prio = p->u.ingosched.prio; new_prio = NICE_TO_PRIO(nice); delta = new_prio - old_prio; p->static_prio = NICE_TO_PRIO(nice); - p->prio += delta; + p->u.ingosched.prio += delta; if (array) { enqueue_task(p, array); @@ -3124,59 +2463,13 @@ out_unlock: task_rq_unlock(rq, &flags); } -EXPORT_SYMBOL(set_user_nice); - #ifdef CONFIG_KGDB -struct task_struct *kgdb_get_idle(int this_cpu) +static struct task_struct *ingo_kgdb_get_idle(int this_cpu) { return cpu_rq(this_cpu)->idle; } #endif -#ifdef __ARCH_WANT_SYS_NICE - -/* - * sys_nice - change the priority of the current process. 
- * @increment: priority increment - * - * sys_setpriority is a more generic, but much slower function that - * does similar things. - */ -asmlinkage long sys_nice(int increment) -{ - int retval; - long nice; - - /* - * Setpriority might change our priority at the same moment. - * We don't have to worry. Conceptually one call occurs first - * and we have a single winner. - */ - if (increment < 0) { - if (!capable(CAP_SYS_NICE)) - return -EPERM; - if (increment < -40) - increment = -40; - } - if (increment > 40) - increment = 40; - - nice = PRIO_TO_NICE(current->static_prio) + increment; - if (nice < -20) - nice = -20; - if (nice > 19) - nice = 19; - - retval = security_task_setnice(current, nice); - if (retval) - return retval; - - set_user_nice(current, nice); - return 0; -} - -#endif - /** * task_prio - return the priority value of a given task. * @p: the task in question. @@ -3185,16 +2478,16 @@ asmlinkage long sys_nice(int increment) * RT tasks are offset by -200. Normal tasks are centered * around 0, value goes from -16 to +15. */ -int task_prio(const task_t *p) +static int ingo_task_prio(const task_t *p) { - return p->prio - MAX_RT_PRIO; + return p->u.ingosched.prio - MAX_RT_PRIO; } /** * task_nice - return the nice value of a given task. * @p: the task in question. */ -int task_nice(const task_t *p) +static int ingo_task_nice(const task_t *p) { return TASK_NICE(p); } @@ -3203,38 +2496,27 @@ int task_nice(const task_t *p) * idle_cpu - is a given cpu idle currently? * @cpu: the processor in question. */ -int idle_cpu(int cpu) +static int ingo_idle_cpu(int cpu) { return cpu_curr(cpu) == cpu_rq(cpu)->idle; } -EXPORT_SYMBOL_GPL(idle_cpu); - -/** - * find_process_by_pid - find a process with a matching PID value. - * @pid: the pid in question. - */ -static inline task_t *find_process_by_pid(pid_t pid) -{ - return pid ? find_task_by_pid(pid) : current; -} - /* Actually do priority change: must hold rq lock. 
*/ static void __setscheduler(struct task_struct *p, int policy, int prio) { - BUG_ON(p->array); + BUG_ON(p->u.ingosched.array); p->policy = policy; p->rt_priority = prio; if (policy != SCHED_NORMAL) - p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority; + p->u.ingosched.prio = MAX_USER_RT_PRIO-1 - p->rt_priority; else - p->prio = p->static_prio; + p->u.ingosched.prio = p->static_prio; } /* * setscheduler - change the scheduling policy and/or RT priority of a thread. */ -static int setscheduler(pid_t pid, int policy, struct sched_param __user *param) +static int ingo_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { struct sched_param lp; int retval = -EINVAL; @@ -3303,11 +2585,11 @@ recheck: task_rq_unlock(rq, &flags); goto recheck; } - array = p->array; + array = p->u.ingosched.array; if (array) deactivate_task(p, task_rq(p)); retval = 0; - oldprio = p->prio; + oldprio = p->u.ingosched.prio; __setscheduler(p, policy, lp.sched_priority); if (array) { __activate_task(p, task_rq(p)); @@ -3317,7 +2599,7 @@ recheck: * this runqueue and our priority is higher than the current's */ if (task_running(rq, p)) { - if (p->prio > oldprio) + if (p->u.ingosched.prio > oldprio) resched_task(rq->curr); } else if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); @@ -3330,241 +2612,19 @@ out_nounlock: } /** - * sys_sched_setscheduler - set/change the scheduler policy and RT priority - * @pid: the pid in question. - * @policy: new policy - * @param: structure containing the new RT priority. - */ -asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, - struct sched_param __user *param) -{ - return setscheduler(pid, policy, param); -} - -/** - * sys_sched_setparam - set/change the RT priority of a thread - * @pid: the pid in question. - * @param: structure containing the new RT priority. 
- */ -asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) -{ - return setscheduler(pid, -1, param); -} - -/** - * sys_sched_getscheduler - get the policy (scheduling class) of a thread - * @pid: the pid in question. - */ -asmlinkage long sys_sched_getscheduler(pid_t pid) -{ - int retval = -EINVAL; - task_t *p; - - if (pid < 0) - goto out_nounlock; - - retval = -ESRCH; - read_lock(&tasklist_lock); - p = find_process_by_pid(pid); - if (p) { - retval = security_task_getscheduler(p); - if (!retval) - retval = p->policy; - } - read_unlock(&tasklist_lock); - -out_nounlock: - return retval; -} - -/** - * sys_sched_getscheduler - get the RT priority of a thread - * @pid: the pid in question. - * @param: structure containing the RT priority. - */ -asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) -{ - struct sched_param lp; - int retval = -EINVAL; - task_t *p; - - if (!param || pid < 0) - goto out_nounlock; - - read_lock(&tasklist_lock); - p = find_process_by_pid(pid); - retval = -ESRCH; - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - lp.sched_priority = p->rt_priority; - read_unlock(&tasklist_lock); - - /* - * This one might sleep, we cannot do it with a spinlock held ... - */ - retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; - -out_nounlock: - return retval; - -out_unlock: - read_unlock(&tasklist_lock); - return retval; -} - -long sched_setaffinity(pid_t pid, cpumask_t new_mask) -{ - task_t *p; - int retval; - cpumask_t cpus_allowed; - - lock_cpu_hotplug(); - read_lock(&tasklist_lock); - - p = find_process_by_pid(pid); - if (!p) { - read_unlock(&tasklist_lock); - unlock_cpu_hotplug(); - return -ESRCH; - } - - /* - * It is not safe to call set_cpus_allowed with the - * tasklist_lock held. We will bump the task_struct's - * usage count and then drop tasklist_lock. 
- */ - get_task_struct(p); - read_unlock(&tasklist_lock); - - retval = -EPERM; - if ((current->euid != p->euid) && (current->euid != p->uid) && - !capable(CAP_SYS_NICE)) - goto out_unlock; - - cpus_allowed = cpuset_cpus_allowed(p); - cpus_and(new_mask, new_mask, cpus_allowed); - retval = set_cpus_allowed(p, new_mask); - -out_unlock: - put_task_struct(p); - unlock_cpu_hotplug(); - return retval; -} - -static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, - cpumask_t *new_mask) -{ - if (len < sizeof(cpumask_t)) { - memset(new_mask, 0, sizeof(cpumask_t)); - } else if (len > sizeof(cpumask_t)) { - len = sizeof(cpumask_t); - } - return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; -} - -/** - * sys_sched_setaffinity - set the cpu affinity of a process - * @pid: pid of the process - * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to the new cpu mask - */ -asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, - unsigned long __user *user_mask_ptr) -{ - cpumask_t new_mask; - int retval; - - retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); - if (retval) - return retval; - - return sched_setaffinity(pid, new_mask); -} - -/* - * Represents all cpu's present in the system - * In systems capable of hotplug, this map could dynamically grow - * as new cpu's are detected in the system via any platform specific - * method, such as ACPI for e.g. 
- */ - -cpumask_t cpu_present_map; -EXPORT_SYMBOL(cpu_present_map); - -#ifndef CONFIG_SMP -cpumask_t cpu_online_map = CPU_MASK_ALL; -cpumask_t cpu_possible_map = CPU_MASK_ALL; -#endif - -long sched_getaffinity(pid_t pid, cpumask_t *mask) -{ - int retval; - task_t *p; - - lock_cpu_hotplug(); - read_lock(&tasklist_lock); - - retval = -ESRCH; - p = find_process_by_pid(pid); - if (!p) - goto out_unlock; - - retval = 0; - cpus_and(*mask, p->cpus_allowed, cpu_possible_map); - -out_unlock: - read_unlock(&tasklist_lock); - unlock_cpu_hotplug(); - if (retval) - return retval; - - return 0; -} - -/** - * sys_sched_getaffinity - get the cpu affinity of a process - * @pid: pid of the process - * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to hold the current cpu mask - */ -asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, - unsigned long __user *user_mask_ptr) -{ - int ret; - cpumask_t mask; - - if (len < sizeof(cpumask_t)) - return -EINVAL; - - ret = sched_getaffinity(pid, &mask); - if (ret < 0) - return ret; - - if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) - return -EFAULT; - - return sizeof(cpumask_t); -} - -/** * sys_sched_yield - yield the current processor to other threads. * * this function yields the current CPU by moving the calling thread * to the expired array. If there are no other threads running on this * CPU then this function will return. */ -asmlinkage long sys_sched_yield(void) +static long ingo_sys_sched_yield(void) { runqueue_t *rq = this_rq_lock(); - prio_array_t *array = current->array; + prio_array_t *array = current->u.ingosched.array; prio_array_t *target = rq->expired; - schedstat_inc(rq, yld_cnt); + schedstat_inc(rq->sspcd, yld_cnt); /* * We implement yielding by moving the task into the expired * queue. 
@@ -3575,12 +2635,12 @@ asmlinkage long sys_sched_yield(void) if (rt_task(current)) target = rq->active; - if (current->array->nr_active == 1) { - schedstat_inc(rq, yld_act_empty); + if (current->u.ingosched.array->nr_active == 1) { + schedstat_inc(rq->sspcd, yld_act_empty); if (!rq->expired->nr_active) - schedstat_inc(rq, yld_both_empty); + schedstat_inc(rq->sspcd, yld_both_empty); } else if (!rq->expired->nr_active) - schedstat_inc(rq, yld_exp_empty); + schedstat_inc(rq->sspcd, yld_exp_empty); if (array != target) { dequeue_task(current, array); @@ -3604,86 +2664,6 @@ asmlinkage long sys_sched_yield(void) return 0; } -static inline void __cond_resched(void) -{ - do { - add_preempt_count(PREEMPT_ACTIVE); - schedule(); - sub_preempt_count(PREEMPT_ACTIVE); - } while (need_resched()); -} - -int __sched cond_resched(void) -{ - if (need_resched()) { - __cond_resched(); - return 1; - } - return 0; -} - -EXPORT_SYMBOL(cond_resched); - -/* - * cond_resched_lock() - if a reschedule is pending, drop the given lock, - * call schedule, and on return reacquire the lock. - * - * This works OK both with and without CONFIG_PREEMPT. We do strange low-level - * operations here to prevent schedule() from being called twice (once via - * spin_unlock(), once by hand). - */ -int cond_resched_lock(spinlock_t * lock) -{ -#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) - if (lock->break_lock) { - lock->break_lock = 0; - spin_unlock(lock); - cpu_relax(); - spin_lock(lock); - } -#endif - if (need_resched()) { - _raw_spin_unlock(lock); - preempt_enable_no_resched(); - __cond_resched(); - spin_lock(lock); - return 1; - } - return 0; -} - -EXPORT_SYMBOL(cond_resched_lock); - -int __sched cond_resched_softirq(void) -{ - BUG_ON(!in_softirq()); - - if (need_resched()) { - __local_bh_enable(); - __cond_resched(); - local_bh_disable(); - return 1; - } - return 0; -} - -EXPORT_SYMBOL(cond_resched_softirq); - - -/** - * yield - yield the current processor to other threads. 
- * - * this is a shortcut for kernel-space yielding - it marks the - * thread runnable and calls sys_sched_yield(). - */ -void __sched yield(void) -{ - set_current_state(TASK_RUNNING); - sys_sched_yield(); -} - -EXPORT_SYMBOL(yield); - /* * This task is about to go to sleep on IO. Increment rq->nr_iowait so * that process accounting knows that this is a task in IO wait state. @@ -3691,7 +2671,7 @@ EXPORT_SYMBOL(yield); * But don't do that if it is a deliberate, throttling IO wait (this task * has set its backing_dev_info: the queue against which it should throttle) */ -void __sched io_schedule(void) +static void __sched ingo_io_schedule(void) { struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); @@ -3700,9 +2680,7 @@ void __sched io_schedule(void) atomic_dec(&rq->nr_iowait); } -EXPORT_SYMBOL(io_schedule); - -long __sched io_schedule_timeout(long timeout) +static long __sched ingo_io_schedule_timeout(long timeout) { struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); long ret; @@ -3714,51 +2692,6 @@ long __sched io_schedule_timeout(long ti } /** - * sys_sched_get_priority_max - return maximum RT priority. - * @policy: scheduling class. - * - * this syscall returns the maximum rt_priority that can be used - * by a given scheduling class. - */ -asmlinkage long sys_sched_get_priority_max(int policy) -{ - int ret = -EINVAL; - - switch (policy) { - case SCHED_FIFO: - case SCHED_RR: - ret = MAX_USER_RT_PRIO-1; - break; - case SCHED_NORMAL: - ret = 0; - break; - } - return ret; -} - -/** - * sys_sched_get_priority_min - return minimum RT priority. - * @policy: scheduling class. - * - * this syscall returns the minimum rt_priority that can be used - * by a given scheduling class. 
- */ -asmlinkage long sys_sched_get_priority_min(int policy) -{ - int ret = -EINVAL; - - switch (policy) { - case SCHED_FIFO: - case SCHED_RR: - ret = 1; - break; - case SCHED_NORMAL: - ret = 0; - } - return ret; -} - -/** * sys_sched_rr_get_interval - return the default timeslice of a process. * @pid: pid of the process. * @interval: userspace pointer to the timeslice value. @@ -3766,8 +2699,8 @@ asmlinkage long sys_sched_get_priority_m * this syscall writes the default timeslice value of a given process * into the user-space timespec buffer. A value of '0' means infinity. */ -asmlinkage -long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) +static long +ingo_sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) { int retval = -EINVAL; struct timespec t; @@ -3797,112 +2730,14 @@ out_unlock: return retval; } -static inline struct task_struct *eldest_child(struct task_struct *p) -{ - if (list_empty(&p->children)) return NULL; - return list_entry(p->children.next,struct task_struct,sibling); -} - -static inline struct task_struct *older_sibling(struct task_struct *p) -{ - if (p->sibling.prev==&p->parent->children) return NULL; - return list_entry(p->sibling.prev,struct task_struct,sibling); -} - -static inline struct task_struct *younger_sibling(struct task_struct *p) -{ - if (p->sibling.next==&p->parent->children) return NULL; - return list_entry(p->sibling.next,struct task_struct,sibling); -} - -static void show_task(task_t * p) -{ - task_t *relative; - unsigned state; - unsigned long free = 0; - static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; - - printk("%-13.13s ", p->comm); - state = p->state ? 
__ffs(p->state) + 1 : 0; - if (state < ARRAY_SIZE(stat_nam)) - printk(stat_nam[state]); - else - printk("?"); -#if (BITS_PER_LONG == 32) - if (state == TASK_RUNNING) - printk(" running "); - else - printk(" %08lX ", thread_saved_pc(p)); -#else - if (state == TASK_RUNNING) - printk(" running task "); - else - printk(" %016lx ", thread_saved_pc(p)); -#endif -#ifdef CONFIG_DEBUG_STACK_USAGE - { - unsigned long * n = (unsigned long *) (p->thread_info+1); - while (!*n) - n++; - free = (unsigned long) n - (unsigned long)(p->thread_info+1); - } -#endif - printk("%5lu %5d %6d ", free, p->pid, p->parent->pid); - if ((relative = eldest_child(p))) - printk("%5d ", relative->pid); - else - printk(" "); - if ((relative = younger_sibling(p))) - printk("%7d", relative->pid); - else - printk(" "); - if ((relative = older_sibling(p))) - printk(" %5d", relative->pid); - else - printk(" "); - if (!p->mm) - printk(" (L-TLB)\n"); - else - printk(" (NOTLB)\n"); - - if (state != TASK_RUNNING) - show_stack(p, NULL); -} - -void show_state(void) -{ - task_t *g, *p; - -#if (BITS_PER_LONG == 32) - printk("\n" - " sibling\n"); - printk(" task PC pid father child younger older\n"); -#else - printk("\n" - " sibling\n"); - printk(" task PC pid father child younger older\n"); -#endif - read_lock(&tasklist_lock); - do_each_thread(g, p) { - /* - * reset the NMI-timeout, listing all files on a slow - * console might take alot of time: - */ - touch_nmi_watchdog(); - show_task(p); - } while_each_thread(g, p); - - read_unlock(&tasklist_lock); -} - -void __devinit init_idle(task_t *idle, int cpu) +static void __devinit ingo_init_idle(task_t *idle, int cpu) { runqueue_t *rq = cpu_rq(cpu); unsigned long flags; - idle->sleep_avg = 0; - idle->array = NULL; - idle->prio = MAX_PRIO; + idle->u.ingosched.sleep_avg = 0; + idle->u.ingosched.array = NULL; + idle->u.ingosched.prio = MAX_PRIO; idle->state = TASK_RUNNING; set_task_cpu(idle, cpu); @@ -3919,15 +2754,6 @@ void __devinit init_idle(task_t *idle, i #endif } 
-/* - * In a system that switches off the HZ timer nohz_cpu_mask - * indicates which cpus entered this state. This is used - * in the rcu update to wait only for active cpus. For system - * which do not switch off the HZ timer nohz_cpu_mask should - * always be CPU_MASK_NONE. - */ -cpumask_t nohz_cpu_mask = CPU_MASK_NONE; - #ifdef CONFIG_SMP /* * This is how migration works: @@ -3954,7 +2780,7 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. */ -int set_cpus_allowed(task_t *p, cpumask_t new_mask) +static int ingo_set_cpus_allowed(task_t *p, cpumask_t new_mask) { unsigned long flags; int ret = 0; @@ -3987,8 +2813,6 @@ out: return ret; } -EXPORT_SYMBOL_GPL(set_cpus_allowed); - /* * Move (not current) task off this cpu, onto dest cpu. We're doing * this because either it can't run here any more (set_cpus_allowed() @@ -4017,15 +2841,16 @@ static void __migrate_task(struct task_s goto out; set_task_cpu(p, dest_cpu); - if (p->array) { + if (p->u.ingosched.array) { /* * Sync timestamp with rq_dest's before activating. * The same thing could be achieved by doing this step * afterwards, and pretending it was a local activate. * This way is cleaner and logically correct. */ - p->timestamp = p->timestamp - rq_src->timestamp_last_tick - + rq_dest->timestamp_last_tick; + p->u.ingosched.timestamp = p->u.ingosched.timestamp - + rq_src->timestamp_last_tick + + rq_dest->timestamp_last_tick; deactivate_task(p, rq_src); activate_task(p, rq_dest, 0); if (TASK_PREEMPTS_CURR(p, rq_dest)) @@ -4165,7 +2990,7 @@ static void migrate_live_tasks(int src_c * It does so by boosting its priority to highest possible and adding it to * the _front_ of runqueue. Used by CPU offline code. 
*/ -void sched_idle_next(void) +static void ingo_sched_idle_next(void) { int cpu = smp_processor_id(); runqueue_t *rq = this_rq(); @@ -4223,7 +3048,7 @@ static void migrate_dead_tasks(unsigned while (!list_empty(list)) migrate_dead(dead_cpu, list_entry(list->next, task_t, - run_list)); + u.ingosched.run_list)); } } } @@ -4306,7 +3131,7 @@ static struct notifier_block __devinitda .priority = 10 }; -int __init migration_init(void) +static int __init ingo_migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); /* Start one for boot CPU. */ @@ -4318,11 +3143,38 @@ int __init migration_init(void) #endif #ifdef CONFIG_SMP +#ifdef CONFIG_SCHEDSTATS +static void ingo_show_schedstat_sd(struct seq_file *seq, int cpu) +{ + enum idle_type itype; + struct sched_domain *sd; + int dcnt = 0; + + for_each_domain(cpu, sd) { + char mask_str[NR_CPUS]; + + cpumask_scnprintf(mask_str, NR_CPUS, sd->span); + seq_printf(seq, "domain%d %s", dcnt++, mask_str); + for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; itype++) { + seq_printf(seq, " %lu %lu %lu %lu %lu", + sd->lb_cnt[itype], + sd->lb_failed[itype], + sd->lb_imbalance[itype], + sd->lb_nobusyq[itype], + sd->lb_nobusyg[itype]); + } + seq_printf(seq, " %lu %lu %lu %lu\n", + sd->sbe_pushed, sd->sbe_attempts, + sd->ttwu_wake_affine, sd->ttwu_wake_balance); + } +} +#endif + /* * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. 
*/ -void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) +static void __devinit ingo_cpu_attach_domain(struct sched_domain *sd, int cpu) { migration_req_t req; unsigned long flags; @@ -4349,9 +3201,6 @@ void __devinit cpu_attach_domain(struct } } -/* cpus with isolated domains */ -cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; - /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { @@ -4366,52 +3215,6 @@ static int __init isolated_cpu_setup(cha __setup ("isolcpus=", isolated_cpu_setup); -/* - * init_sched_build_groups takes an array of groups, the cpumask we wish - * to span, and a pointer to a function which identifies what group a CPU - * belongs to. The return value of group_fn must be a valid index into the - * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we - * keep track of groups covered with a cpumask_t). - * - * init_sched_build_groups will build a circular linked list of the groups - * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_power to 0. 
- */ -void __devinit init_sched_build_groups(struct sched_group groups[], - cpumask_t span, int (*group_fn)(int cpu)) -{ - struct sched_group *first = NULL, *last = NULL; - cpumask_t covered = CPU_MASK_NONE; - int i; - - for_each_cpu_mask(i, span) { - int group = group_fn(i); - struct sched_group *sg = &groups[group]; - int j; - - if (cpu_isset(i, covered)) - continue; - - sg->cpumask = CPU_MASK_NONE; - sg->cpu_power = 0; - - for_each_cpu_mask(j, span) { - if (group_fn(j) != group) - continue; - - cpu_set(j, covered); - cpu_set(j, sg->cpumask); - } - if (!first) - first = sg; - if (last) - last->next = sg; - last = sg; - } - last->next = first; -} - - #ifdef ARCH_HAS_SCHED_DOMAIN extern void __devinit arch_init_sched_domains(void); extern void __devinit arch_destroy_sched_domains(void); @@ -4740,7 +3543,7 @@ static int update_sched_domains(struct n } #endif -void __init sched_init_smp(void) +static void __init ingo_sched_init_smp(void) { lock_cpu_hotplug(); arch_init_sched_domains(); @@ -4750,25 +3553,21 @@ void __init sched_init_smp(void) hotcpu_notifier(update_sched_domains, 0); } #else -void __init sched_init_smp(void) +static void __init ingo_sched_init_smp(void) { } #endif /* CONFIG_SMP */ -int in_sched_functions(unsigned long addr) -{ - /* Linker adds these: start and end of __sched functions */ - extern char __sched_text_start[], __sched_text_end[]; - return in_lock_functions(addr) || - (addr >= (unsigned long)__sched_text_start - && addr < (unsigned long)__sched_text_end); -} - -void __init sched_init(void) +static void __init ingo_sched_init(void) { runqueue_t *rq; int i, j, k; + init_task.u.ingosched.prio = MAX_PRIO - 20; + init_task.static_prio = MAX_PRIO - 20; + INIT_LIST_HEAD(&init_task.u.ingosched.run_list); + init_task.u.ingosched.time_slice = HZ; + for (i = 0; i < NR_CPUS; i++) { prio_array_t *array; @@ -4797,6 +3596,9 @@ void __init sched_init(void) // delimiter for bitsearch __set_bit(MAX_PRIO, array->bitmap); } +#ifdef CONFIG_SCHEDSTATS + 
rq->sspcd = cpu_sspcd(i); +#endif } /* @@ -4814,28 +3616,6 @@ void __init sched_init(void) init_idle(current, smp_processor_id()); } -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP -void __might_sleep(char *file, int line) -{ -#if defined(in_atomic) - static unsigned long prev_jiffy; /* ratelimiting */ - - if ((in_atomic() || irqs_disabled()) && - system_state == SYSTEM_RUNNING) { - if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) - return; - prev_jiffy = jiffies; - printk(KERN_ERR "Debug: sleeping function called from invalid" - " context at %s:%d\n", file, line); - printk("in_atomic():%d, irqs_disabled():%d\n", - in_atomic(), irqs_disabled()); - dump_stack(); - } -#endif -} -EXPORT_SYMBOL(__might_sleep); -#endif - #if defined(CONFIG_DEBUG_KERNEL)&&defined(CONFIG_SYSCTL)&&defined(CONFIG_SMP) static struct ctl_table sd_ctl_dir[] = { {1, "sched_domain", NULL, 0, 0755, NULL, }, @@ -4925,7 +3705,7 @@ static ctl_table *sd_alloc_ctl_cpu_table } static struct ctl_table_header *sd_sysctl_header; -void init_sched_domain_sysctl() +void ingo_init_sched_domain_sysctl(void) { int i, cpu_num = num_online_cpus(); char buf[32]; @@ -4943,7 +3723,7 @@ void init_sched_domain_sysctl() sd_sysctl_header = register_sysctl_table(sd_ctl_root, 0); } -void destroy_sched_domain_sysctl() +static void ingo_destroy_sched_domain_sysctl(void) { int cpu, cpu_num = num_online_cpus(); struct sched_domain *sd; @@ -4965,16 +3745,21 @@ void destroy_sched_domain_sysctl() kfree(root); } #else -void init_sched_domain_sysctl() +static void ingo_init_sched_domain_sysctl(void) { } -void destroy_sched_domain_sysctl() +static void ingo_destroy_sched_domain_sysctl(void) { } #endif +static int ingo_is_idle_task(const task_t *p) +{ + return p == task_rq(p)->idle; +} + #ifdef CONFIG_MAGIC_SYSRQ -void normalize_rt_tasks(void) +void ingo_normalize_rt_tasks(void) { struct task_struct *p; prio_array_t *array; @@ -4988,7 +3773,7 @@ void normalize_rt_tasks(void) rq = task_rq_lock(p, &flags); - array = p->array; + array = 
p->u.ingosched.array; if (array) deactivate_task(p, task_rq(p)); __setscheduler(p, SCHED_NORMAL, 0); @@ -5003,3 +3788,59 @@ void normalize_rt_tasks(void) } #endif /* CONFIG_MAGIC_SYSRQ */ + +struct sched_drv ingo_sched_drv = { + .task_cpu = common_task_cpu, + .set_task_cpu = common_set_task_cpu, + .init_sched_domain_sysctl = ingo_init_sched_domain_sysctl, + .destroy_sched_domain_sysctl = ingo_destroy_sched_domain_sysctl, + .cpusched_name = "ingosched", + .rt_task = ingo_rt_task, + .wait_for_completion = ingo_wait_for_completion, + .io_schedule = ingo_io_schedule, + .io_schedule_timeout = ingo_io_schedule_timeout, + .set_oom_timeslice = ingo_set_oom_timeslice, + .nr_running = ingo_nr_running, + .nr_uninterruptible = ingo_nr_uninterruptible, + .nr_context_switches = ingo_nr_context_switches, + .nr_iowait = ingo_nr_iowait, + .nr_iowait_task_cpu = ingo_nr_iowait_task_cpu, + .idle_cpu = ingo_idle_cpu, + .init_idle = ingo_init_idle, + .exit = ingo_sched_exit, + .fork = ingo_sched_fork, + .init = ingo_sched_init, + .init_smp = ingo_sched_init_smp, + .schedule = ingo_schedule, + .tick = ingo_scheduler_tick, + .tail = ingo_schedule_tail, + .setscheduler = ingo_setscheduler, + .set_user_nice = ingo_set_user_nice, + .rr_get_interval = ingo_sys_sched_rr_get_interval, + .yield = ingo_sys_sched_yield, + .is_idle_task = ingo_is_idle_task, + .task_curr = ingo_task_curr, + .task_nice = ingo_task_nice, + .task_prio = ingo_task_prio, + .try_to_wake_up = ingo_try_to_wake_up, + .wake_up_new_task = ingo_wake_up_new_task, +#ifdef CONFIG_SMP + .migration_init = ingo_migration_init, + .exec = ingo_sched_exec, + .set_cpus_allowed = ingo_set_cpus_allowed, + .wait_task_inactive = ingo_wait_task_inactive, + .cpu_attach_domain = ingo_cpu_attach_domain, +#ifdef CONFIG_HOTPLUG_CPU + .sched_idle_next = ingo_sched_idle_next, +#endif +#ifdef CONFIG_SCHEDSTATS + .show_schedstat_sd = ingo_show_schedstat_sd, +#endif +#endif +#ifdef CONFIG_MAGIC_SYSRQ + .normalize_rt_tasks = ingo_normalize_rt_tasks, 
+#endif +#ifdef CONFIG_KGDB + .kgdb_get_idle = ingo_kgdb_get_idle, +#endif +}; Index: linux-2.6.10-rc1-mm5/kernel/scheduler.c =================================================================== --- linux-2.6.10-rc1-mm5.orig/kernel/scheduler.c 2003-03-27 19:01:40.000000000 +1100 +++ linux-2.6.10-rc1-mm5/kernel/scheduler.c 2004-11-11 22:08:35.000000000 +1100 @@ -0,0 +1,1552 @@ +/* + * kernel/scheduler.c + * + * Kernel scheduler and related syscalls + * + * Copyright (C) 1991-2002 Linus Torvalds + * + * Modular cpu scheduler infrastructure by Con Kolivas based on + * work by William Lee Irwin III. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +DEFINE_PER_CPU(struct kernel_stat, kstat); +EXPORT_PER_CPU_SYMBOL(kstat); + +unsigned long nr_iowait_task_cpu(const task_t *p) +{ + return scheduler->nr_iowait_task_cpu(p); +} + +/* + * Do the virtual cpu time signal calculations. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in user space since the last update + */ +static inline void account_it_virt(struct task_struct * p, cputime_t cputime) +{ + cputime_t it_virt = p->it_virt_value; + + if (cputime_gt(it_virt, cputime_zero) && + cputime_gt(cputime, cputime_zero)) { + if (cputime_ge(cputime, it_virt)) { + it_virt = cputime_add(it_virt, p->it_virt_incr); + send_sig(SIGVTALRM, p, 1); + } + it_virt = cputime_sub(it_virt, cputime); + p->it_virt_value = it_virt; + } +} + +/* + * Do the virtual profiling signal calculations. 
+ * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in user and kernel space since the last update + */ +static void account_it_prof(struct task_struct *p, cputime_t cputime) +{ + cputime_t it_prof = p->it_prof_value; + + if (cputime_gt(it_prof, cputime_zero) && + cputime_gt(cputime, cputime_zero)) { + if (cputime_ge(cputime, it_prof)) { + it_prof = cputime_add(it_prof, p->it_prof_incr); + send_sig(SIGPROF, p, 1); + } + it_prof = cputime_sub(it_prof, cputime); + p->it_prof_value = it_prof; + } +} + +/* + * Check if the process went over its cputime resource limit after + * some cpu time got added to utime/stime. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in user and kernel space since the last update + */ +static void check_rlimit(struct task_struct *p, cputime_t cputime) +{ + cputime_t total, tmp; + + total = cputime_add(p->utime, p->stime); + tmp = jiffies_to_cputime(p->signal->rlim[RLIMIT_CPU].rlim_cur); + if (unlikely(cputime_gt(total, tmp))) { + /* Send SIGXCPU every second. */ + tmp = cputime_sub(total, cputime); + if (cputime_to_secs(tmp) < cputime_to_secs(total)) + send_sig(SIGXCPU, p, 1); + /* and SIGKILL when we go over max.. */ + tmp = jiffies_to_cputime(p->signal->rlim[RLIMIT_CPU].rlim_max); + if (cputime_gt(total, tmp)) + send_sig(SIGKILL, p, 1); + } +} + +/* + * Account user cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() + * @cputime: the cpu time spent in user space since the last update + */ +void account_user_time(struct task_struct *p, cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t tmp; + + p->utime = cputime_add(p->utime, cputime); + + /* Check for signals (SIGVTALRM, SIGPROF, SIGXCPU & SIGKILL). 
*/ + if (likely(p->signal)) + check_rlimit(p, cputime); + account_it_virt(p, cputime); + account_it_prof(p, cputime); + + /* Add user time to cpustat. */ + tmp = cputime_to_cputime64(cputime); + if (task_nice(p) > 0) + cpustat->nice = cputime64_add(cpustat->nice, tmp); + else + cpustat->user = cputime64_add(cpustat->user, tmp); +} + +/* + * Account system cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() + * @cputime: the cpu time spent in kernel space since the last update + */ +void account_system_time(struct task_struct *p, int hardirq_offset, + cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t tmp; + + p->stime = cputime_add(p->stime, cputime); + + /* Check for signals (SIGPROF, SIGXCPU & SIGKILL). */ + if (likely(p->signal)) + check_rlimit(p, cputime); + account_it_prof(p, cputime); + + /* Add system time to cpustat. */ + tmp = cputime_to_cputime64(cputime); + if (hardirq_count() - hardirq_offset) + cpustat->irq = cputime64_add(cpustat->irq, tmp); + else if (softirq_count()) + cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + else if (!is_idle_task(p)) + cpustat->system = cputime64_add(cpustat->system, tmp); + else if (nr_iowait_task_cpu(p) > 0) + cpustat->iowait = cputime64_add(cpustat->iowait, tmp); + else + cpustat->idle = cputime64_add(cpustat->idle, tmp); +} + +/* + * Account for involuntary wait time. 
+ * @p: the process from which the cpu time has been stolen + * @steal: the cpu time spent in involuntary wait + */ +void account_steal_time(struct task_struct *p, cputime_t steal) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t steal64 = cputime_to_cputime64(steal); + + if (is_idle_task(p)) + cpustat->system = cputime64_add(cpustat->system, steal64); + else + cpustat->steal = cputime64_add(cpustat->steal, steal64); +} + +unsigned int task_cpu(const struct task_struct *p); + +void set_task_cpu(struct task_struct *p, unsigned int cpu); + +#ifdef CONFIG_SMP +/*** + * kick_process - kick a running thread to enter/exit the kernel + * @p: the to-be-kicked thread + * + * Cause a process which is running on another CPU to enter + * kernel-mode, without any delay. (to get signals handled.) + */ +void kick_process(task_t *p) +{ + int cpu; + + preempt_disable(); + cpu = task_cpu(p); + if ((cpu != smp_processor_id()) && task_curr(p)) + smp_send_reschedule(cpu); + preempt_enable(); +} + +/* + * Wrappers for p->thread_info->cpu access. No-op on UP. + */ +unsigned int common_task_cpu(const struct task_struct *p) +{ + return p->thread_info->cpu; +} + +void common_set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + p->thread_info->cpu = cpu; +} + +#else + +unsigned int common_task_cpu(const struct task_struct *p) +{ + return 0; +} + +void common_set_task_cpu(struct task_struct *p, unsigned int cpu) +{ +} +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_PREEMPT +#ifdef CONFIG_DEBUG_PREEMPT + +void fastcall add_preempt_count(int val) +{ + /* + * Underflow? + */ + BUG_ON(((int)preempt_count() < 0)); + preempt_count() += val; + /* + * Spinlock count overflowing soon? + */ + BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); +} +EXPORT_SYMBOL(add_preempt_count); + +void fastcall sub_preempt_count(int val) +{ + /* + * Underflow? + */ + BUG_ON(val > preempt_count()); + /* + * Is the spinlock portion underflowing? 
+ */ + BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); + preempt_count() -= val; +} +EXPORT_SYMBOL(sub_preempt_count); + +#endif + +/* + * this is the entry point to schedule() from in-kernel preemption + * off of preempt_enable. Kernel preemptions off return from interrupt + * occur there and call schedule directly. + */ +asmlinkage void __sched preempt_schedule(void) +{ + struct thread_info *ti = current_thread_info(); +#ifdef CONFIG_PREEMPT_BKL + struct task_struct *task = current; + int saved_lock_depth; +#endif + /* + * If there is a non-zero preempt_count or interrupts are disabled, + * we do not want to preempt the current task. Just return.. + */ + if (unlikely(ti->preempt_count || irqs_disabled())) + return; + +need_resched: + add_preempt_count(PREEMPT_ACTIVE); + /* + * We keep the big kernel semaphore locked, but we + * clear ->lock_depth so that schedule() doesn't + * auto-release the semaphore: + */ +#ifdef CONFIG_PREEMPT_BKL + saved_lock_depth = task->lock_depth; + task->lock_depth = -1; +#endif + schedule(); +#ifdef CONFIG_PREEMPT_BKL + task->lock_depth = saved_lock_depth; +#endif + sub_preempt_count(PREEMPT_ACTIVE); + + /* we could miss a preemption opportunity between schedule and now */ + barrier(); + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + goto need_resched; +} + +EXPORT_SYMBOL(preempt_schedule); +#endif /* CONFIG_PREEMPT */ + +#define SLEEP_ON_VAR \ + unsigned long flags; \ + wait_queue_t wait; \ + init_waitqueue_entry(&wait, current); + +#define SLEEP_ON_HEAD \ + spin_lock_irqsave(&q->lock,flags); \ + __add_wait_queue(q, &wait); \ + spin_unlock(&q->lock); + +#define SLEEP_ON_TAIL \ + spin_lock_irq(&q->lock); \ + __remove_wait_queue(q, &wait); \ + spin_unlock_irqrestore(&q->lock, flags); + +void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) +{ + SLEEP_ON_VAR + + current->state = TASK_INTERRUPTIBLE; + + SLEEP_ON_HEAD + schedule(); + SLEEP_ON_TAIL +} + +EXPORT_SYMBOL(interruptible_sleep_on); + +long
fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) +{ + SLEEP_ON_VAR + + current->state = TASK_INTERRUPTIBLE; + + SLEEP_ON_HEAD + timeout = schedule_timeout(timeout); + SLEEP_ON_TAIL + + return timeout; +} + +EXPORT_SYMBOL(interruptible_sleep_on_timeout); + +void fastcall __sched sleep_on(wait_queue_head_t *q) +{ + SLEEP_ON_VAR + + current->state = TASK_UNINTERRUPTIBLE; + + SLEEP_ON_HEAD + schedule(); + SLEEP_ON_TAIL +} + +EXPORT_SYMBOL(sleep_on); + +long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) +{ + SLEEP_ON_VAR + + current->state =