From c206fa5c4e6f7d2c1ac63a9816cae23f9df04564 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Thu, 20 Oct 2016 15:34:43 +1100 Subject: [PATCH 60/80] 4.8-sched-bfs-512.patch --- Documentation/sysctl/kernel.txt | 26 +++++++++++ arch/powerpc/platforms/cell/spufs/sched.c | 5 --- arch/x86/Kconfig | 22 +++++++-- drivers/cpufreq/cpufreq_conservative.c | 4 +- drivers/cpufreq/cpufreq_ondemand.c | 4 +- fs/proc/base.c | 2 +- include/linux/init_task.h | 75 +++++++++++++++++++++++++++++-- include/linux/ioprio.h | 2 + include/linux/jiffies.h | 2 +- include/linux/sched.h | 68 ++++++++++++++++++++++++---- include/linux/sched/prio.h | 12 +++++ include/uapi/linux/sched.h | 9 +++- init/Kconfig | 29 ++++++++++-- init/main.c | 3 +- kernel/Makefile | 2 +- kernel/delayacct.c | 2 +- kernel/exit.c | 2 +- kernel/sched/Makefile | 11 +++-- kernel/sched/cpufreq.c | 4 ++ kernel/sched/cpufreq_schedutil.c | 4 ++ kernel/sched/idle.c | 4 ++ kernel/sched/stats.c | 4 ++ kernel/smpboot.c | 2 +- kernel/sysctl.c | 42 +++++++++++++++-- kernel/time/Kconfig | 2 +- kernel/time/posix-cpu-timers.c | 10 ++--- kernel/trace/trace_selftest.c | 5 +++ lib/Kconfig.debug | 2 +- 28 files changed, 312 insertions(+), 47 deletions(-) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index ffab8b5..f54ade2 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -39,6 +39,7 @@ show up in /proc/sys/kernel: - hung_task_timeout_secs - hung_task_warnings - kexec_load_disabled +- iso_cpu - kptr_restrict - kstack_depth_to_print [ X86 only ] - l2cr [ PPC only ] @@ -73,6 +74,7 @@ show up in /proc/sys/kernel: - randomize_va_space - real-root-dev ==> Documentation/initrd.txt - reboot-cmd [ SPARC only ] +- rr_interval - rtsig-max - rtsig-nr - sem @@ -402,6 +404,16 @@ kernel stack. ============================================================== +iso_cpu: (BFS CPU scheduler only). + +This sets the percentage cpu that the unprivileged SCHED_ISO tasks can +run effectively at realtime priority, averaged over a rolling five +seconds over the -whole- system, meaning all cpus. + +Set to 70 (percent) by default. + +============================================================== + l2cr: (PPC only) This flag controls the L2 cache of G3 processor boards. If @@ -818,6 +830,20 @@ rebooting. ??? ============================================================== +rr_interval: (BFS CPU scheduler only) + +This is the smallest duration that any cpu process scheduling unit +will run for. Increasing this value can increase throughput of cpu +bound tasks substantially but at the expense of increased latencies +overall. Conversely decreasing it will decrease average and maximum +latencies but at the expense of throughput. This value is in +milliseconds and the default value chosen depends on the number of +cpus available at scheduler initialisation with a minimum of 6. + +Valid values are from 1-1000. + +============================================================== + rtsig-max & rtsig-nr: The file rtsig-max can be used to tune the maximum number diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index 460f5f3..eeb3e32 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -64,11 +64,6 @@ static struct timer_list spusched_timer; static struct timer_list spuloadavg_timer; /* - * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). - */ -#define NORMAL_PRIO 120 - -/* * Frequency of the spu scheduler tick. By default we do one SPU scheduler * tick for every 10 CPU scheduler ticks. */ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2a1f0ce..4d25df5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -914,10 +914,26 @@ config SCHED_SMT depends on SMP ---help--- SMT scheduler support improves the CPU scheduler's decision making - when dealing with Intel Pentium 4 chips with HyperThreading at a + when dealing with Intel P4/Core 2 chips with HyperThreading at a cost of slightly increased overhead in some places. If unsure say N here. +config SMT_NICE + bool "SMT (Hyperthreading) aware nice priority and policy support" + depends on SCHED_BFS && SCHED_SMT + default y + ---help--- + Enabling Hyperthreading on Intel CPUs decreases the effectiveness + of the use of 'nice' levels and different scheduling policies + (e.g. realtime) due to sharing of CPU power between hyperthreads. + SMT nice support makes each logical CPU aware of what is running on + its hyperthread siblings, maintaining appropriate distribution of + CPU according to nice levels and scheduling policies at the expense + of slightly increased overhead. + + If unsure say Y here. + + config SCHED_MC def_bool y prompt "Multi-core scheduler support" @@ -2036,7 +2052,7 @@ config HOTPLUG_CPU config BOOTPARAM_HOTPLUG_CPU0 bool "Set default setting of cpu0_hotpluggable" default n - depends on HOTPLUG_CPU + depends on HOTPLUG_CPU && !SCHED_BFS ---help--- Set whether default state of cpu0_hotpluggable is on or off. @@ -2065,7 +2081,7 @@ config BOOTPARAM_HOTPLUG_CPU0 config DEBUG_HOTPLUG_CPU0 def_bool n prompt "Debug CPU0 hotplug" - depends on HOTPLUG_CPU + depends on HOTPLUG_CPU && !SCHED_BFS ---help--- Enabling this option offlines CPU0 (if CPU0 can be offlined) as soon as possible and boots up userspace with CPU0 offlined. User diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 18da4f8..460efa2 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -30,8 +30,8 @@ struct cs_dbs_tuners { }; /* Conservative governor macros */ -#define DEF_FREQUENCY_UP_THRESHOLD (80) -#define DEF_FREQUENCY_DOWN_THRESHOLD (20) +#define DEF_FREQUENCY_UP_THRESHOLD (63) +#define DEF_FREQUENCY_DOWN_THRESHOLD (26) #define DEF_FREQUENCY_STEP (5) #define DEF_SAMPLING_DOWN_FACTOR (1) #define MAX_SAMPLING_DOWN_FACTOR (10) diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 3a1f49f..7b3187a 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -20,7 +20,7 @@ #include "cpufreq_ondemand.h" /* On-demand governor macros */ -#define DEF_FREQUENCY_UP_THRESHOLD (80) +#define DEF_FREQUENCY_UP_THRESHOLD (63) #define DEF_SAMPLING_DOWN_FACTOR (1) #define MAX_SAMPLING_DOWN_FACTOR (100000) #define MICRO_FREQUENCY_UP_THRESHOLD (95) @@ -129,7 +129,7 @@ static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq) } /* - * Every sampling_rate, we check, if current idle time is less than 20% + * Every sampling_rate, we check, if current idle time is less than 37% * (default), then we try to increase frequency. Else, we adjust the frequency * proportional to load. */ diff --git a/fs/proc/base.c b/fs/proc/base.c index ac0df4d..0ac4c28 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -505,7 +505,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "0 0 0\n"); else seq_printf(m, "%llu %llu %lu\n", - (unsigned long long)task->se.sum_exec_runtime, + (unsigned long long)tsk_seruntime(task), (unsigned long long)task->sched_info.run_delay, task->sched_info.pcount); diff --git a/include/linux/init_task.h b/include/linux/init_task.h index f8834f8..2f40a51 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -157,8 +157,6 @@ extern struct task_group root_task_group; # define INIT_VTIME(tsk) #endif -#define INIT_TASK_COMM "swapper" - #ifdef CONFIG_RT_MUTEXES # define INIT_RT_MUTEXES(tsk) \ .pi_waiters = RB_ROOT, \ @@ -187,6 +185,77 @@ extern struct task_group root_task_group; * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) */ +#ifdef CONFIG_SCHED_BFS +#define INIT_TASK_COMM "BFS" +#define INIT_TASK(tsk) \ +{ \ + .state = 0, \ + .stack = &init_thread_info, \ + .usage = ATOMIC_INIT(2), \ + .flags = PF_KTHREAD, \ + .prio = NORMAL_PRIO, \ + .static_prio = MAX_PRIO-20, \ + .normal_prio = NORMAL_PRIO, \ + .deadline = 0, \ + .policy = SCHED_NORMAL, \ + .cpus_allowed = CPU_MASK_ALL, \ + .mm = NULL, \ + .active_mm = &init_mm, \ + .restart_block = { \ + .fn = do_no_restart_syscall, \ + }, \ + .time_slice = 1000000, \ + .tasks = LIST_HEAD_INIT(tsk.tasks), \ + INIT_PUSHABLE_TASKS(tsk) \ + .ptraced = LIST_HEAD_INIT(tsk.ptraced), \ + .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \ + .real_parent = &tsk, \ + .parent = &tsk, \ + .children = LIST_HEAD_INIT(tsk.children), \ + .sibling = LIST_HEAD_INIT(tsk.sibling), \ + .group_leader = &tsk, \ + RCU_POINTER_INITIALIZER(real_cred, &init_cred), \ + RCU_POINTER_INITIALIZER(cred, &init_cred), \ + .comm = INIT_TASK_COMM, \ + .thread = INIT_THREAD, \ + .fs = &init_fs, \ + .files = &init_files, \ + .signal = &init_signals, \ + .sighand = &init_sighand, \ + .nsproxy = &init_nsproxy, \ + .pending = { \ + .list = LIST_HEAD_INIT(tsk.pending.list), \ + .signal = {{0}}}, \ + .blocked = {{0}}, \ + .alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \ + .journal_info = NULL, \ + .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ + .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ + .timer_slack_ns = 50000, /* 50 usec default slack */ \ + .pids = { \ + [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \ + [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \ + [PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \ + }, \ + .thread_group = LIST_HEAD_INIT(tsk.thread_group), \ + .thread_node = LIST_HEAD_INIT(init_signals.thread_head), \ + INIT_IDS \ + INIT_PERF_EVENTS(tsk) \ + INIT_TRACE_IRQFLAGS \ + INIT_LOCKDEP \ + INIT_FTRACE_GRAPH \ + INIT_TRACE_RECURSION \ + INIT_TASK_RCU_PREEMPT(tsk) \ + INIT_TASK_RCU_TASKS(tsk) \ + INIT_CPUSET_SEQ(tsk) \ + INIT_RT_MUTEXES(tsk) \ + INIT_PREV_CPUTIME(tsk) \ + INIT_VTIME(tsk) \ + INIT_NUMA_BALANCING(tsk) \ + INIT_KASAN(tsk) \ +} +#else /* CONFIG_SCHED_BFS */ +#define INIT_TASK_COMM "swapper" #define INIT_TASK(tsk) \ { \ .state = 0, \ @@ -261,7 +330,7 @@ extern struct task_group root_task_group; INIT_NUMA_BALANCING(tsk) \ INIT_KASAN(tsk) \ } - +#endif /* CONFIG_SCHED_BFS */ #define INIT_CPU_TIMERS(cpu_timers) \ { \ diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index beb9ce1..ce2fc3c 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -52,6 +52,8 @@ enum { */ static inline int task_nice_ioprio(struct task_struct *task) { + if (iso_task(task)) + return 0; return (task_nice(task) + 20) / 5; } diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 5fdc553..6e54b8e 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -164,7 +164,7 @@ static inline u64 get_jiffies_64(void) * Have the 32 bit jiffies value wrap 5 minutes after boot * so jiffies wrap bugs show up earlier. */ -#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) +#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ)) /* * Change timeval to jiffies, trying to avoid the diff --git a/include/linux/sched.h b/include/linux/sched.h index 62c68e5..e57e2c1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -59,6 +59,7 @@ struct sched_param { #include #include #include +#include #include @@ -176,7 +177,7 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); extern void calc_global_load(unsigned long ticks); -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_BFS) extern void cpu_load_update_nohz_start(void); extern void cpu_load_update_nohz_stop(void); #else @@ -340,8 +341,6 @@ extern void init_idle_bootup_task(struct task_struct *idle); extern cpumask_var_t cpu_isolated_map; -extern int runqueue_is_locked(int cpu); - #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) extern void nohz_balance_enter_idle(int cpu); extern void set_cpu_sd_state_idle(void); @@ -1464,9 +1463,11 @@ struct task_struct { unsigned int flags; /* per process flags, defined below */ unsigned int ptrace; +#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_BFS) + int on_cpu; +#endif #ifdef CONFIG_SMP struct llist_node wake_entry; - int on_cpu; unsigned int wakee_flips; unsigned long wakee_flip_decay_ts; struct task_struct *last_wakee; @@ -1474,12 +1475,26 @@ struct task_struct { int wake_cpu; #endif int on_rq; - int prio, static_prio, normal_prio; unsigned int rt_priority; +#ifdef CONFIG_SCHED_BFS + int time_slice; + u64 deadline; + skiplist_node node; /* Skip list node */ + u64 last_ran; + u64 sched_time; /* sched_clock time spent running */ +#ifdef CONFIG_SMT_NICE + int smt_bias; /* Policy/nice level bias across smt siblings */ +#endif +#ifdef CONFIG_HOTPLUG_CPU + bool zerobound; /* Bound to CPU0 for hotplug */ +#endif + unsigned long rt_timeout; +#else /* CONFIG_SCHED_BFS */ const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; +#endif #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif @@ -1603,6 +1618,9 @@ struct task_struct { int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ cputime_t utime, stime, utimescaled, stimescaled; +#ifdef CONFIG_SCHED_BFS + unsigned long utime_pc, stime_pc; +#endif cputime_t gtime; struct prev_cputime prev_cputime; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN @@ -1939,6 +1957,40 @@ extern int arch_task_struct_size __read_mostly; # define arch_task_struct_size (sizeof(struct task_struct)) #endif +#ifdef CONFIG_SCHED_BFS +#define tsk_seruntime(t) ((t)->sched_time) +#define tsk_rttimeout(t) ((t)->rt_timeout) + +static inline void tsk_cpus_current(struct task_struct *p) +{ +} + +void print_scheduler_version(void); + +static inline bool iso_task(struct task_struct *p) +{ + return (p->policy == SCHED_ISO); +} +#else /* CFS */ +#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) +#define tsk_rttimeout(t) ((t)->rt.timeout) + +static inline void tsk_cpus_current(struct task_struct *p) +{ + p->nr_cpus_allowed = current->nr_cpus_allowed; +} + +static inline void print_scheduler_version(void) +{ + printk(KERN_INFO"CFS CPU scheduler.\n"); +} + +static inline bool iso_task(struct task_struct *p) +{ + return false; +} +#endif /* CONFIG_SCHED_BFS */ + /* Future-safe accessor for struct task_struct's cpus_allowed. */ #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) @@ -2392,7 +2444,7 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, } #endif -#ifdef CONFIG_NO_HZ_COMMON +#if defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_BFS) void calc_load_enter_idle(void); void calc_load_exit_idle(void); #else @@ -2493,7 +2545,7 @@ extern unsigned long long task_sched_runtime(struct task_struct *task); /* sched_exec is called by processes performing an exec */ -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_BFS) extern void sched_exec(void); #else #define sched_exec() {} @@ -3381,7 +3433,7 @@ static inline unsigned int task_cpu(const struct task_struct *p) return 0; } -static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) +static inline void set_task_cpu(struct task_struct *p, int cpu) { } diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h index d9cf5a5..7d5d0b8 100644 --- a/include/linux/sched/prio.h +++ b/include/linux/sched/prio.h @@ -19,8 +19,20 @@ */ #define MAX_USER_RT_PRIO 100 + +#ifdef CONFIG_SCHED_BFS +/* Note different MAX_RT_PRIO */ +#define MAX_RT_PRIO (MAX_USER_RT_PRIO + 1) + +#define ISO_PRIO (MAX_RT_PRIO) +#define NORMAL_PRIO (MAX_RT_PRIO + 1) +#define IDLE_PRIO (MAX_RT_PRIO + 2) +#define PRIO_LIMIT ((IDLE_PRIO) + 1) +#else /* CONFIG_SCHED_BFS */ #define MAX_RT_PRIO MAX_USER_RT_PRIO +#endif /* CONFIG_SCHED_BFS */ + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 5f0fe01..f170d61 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -36,9 +36,16 @@ #define SCHED_FIFO 1 #define SCHED_RR 2 #define SCHED_BATCH 3 -/* SCHED_ISO: reserved but not implemented yet */ +/* SCHED_ISO: Implemented on BFS only */ #define SCHED_IDLE 5 +#ifdef CONFIG_SCHED_BFS +#define SCHED_ISO 4 +#define SCHED_IDLEPRIO SCHED_IDLE +#define SCHED_MAX (SCHED_IDLEPRIO) +#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) +#else /* CONFIG_SCHED_BFS */ #define SCHED_DEADLINE 6 +#endif /* CONFIG_SCHED_BFS */ /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ #define SCHED_RESET_ON_FORK 0x40000000 diff --git a/init/Kconfig b/init/Kconfig index cac3f09..44efa80 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -28,6 +28,20 @@ config BUILDTIME_EXTABLE_SORT menu "General setup" +config SCHED_BFS + bool "BFS cpu scheduler" + ---help--- + The Brain Fuck CPU Scheduler for excellent interactivity and + responsiveness on the desktop and solid scalability on normal + hardware and commodity servers. Not recommended for 4096 CPUs. + + Currently incompatible with the Group CPU scheduler, and RCU TORTURE + TEST so these options are disabled. + + Say Y here. + default y + + config BROKEN bool @@ -338,7 +352,7 @@ choice # Kind of a stub config for the pure tick based cputime accounting config TICK_CPU_ACCOUNTING bool "Simple tick based cputime accounting" - depends on !S390 && !NO_HZ_FULL + depends on !S390 && !NO_HZ_FULL && !SCHED_BFS help This is the basic tick based cputime accounting that maintains statistics about user, system and idle time spent on per jiffies @@ -363,6 +377,7 @@ config VIRT_CPU_ACCOUNTING_GEN bool "Full dynticks CPU time accounting" depends on HAVE_CONTEXT_TRACKING depends on HAVE_VIRT_CPU_ACCOUNTING_GEN + depends on !SCHED_BFS select VIRT_CPU_ACCOUNTING select CONTEXT_TRACKING help @@ -698,6 +713,7 @@ config RCU_NOCB_CPU bool "Offload RCU callback processing from boot-selected CPUs" depends on TREE_RCU || PREEMPT_RCU depends on RCU_EXPERT || NO_HZ_FULL + depends on !SCHED_BFS default n help Use this option to reduce OS jitter for aggressive HPC or @@ -930,6 +946,7 @@ config NUMA_BALANCING depends on ARCH_SUPPORTS_NUMA_BALANCING depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY depends on SMP && NUMA && MIGRATION + depends on !SCHED_BFS help This option adds support for automatic NUMA aware memory/task placement. The mechanism is quite primitive and is based on migrating memory when @@ -1032,9 +1049,13 @@ menuconfig CGROUP_SCHED help This feature lets CPU scheduler recognize task groups and control CPU bandwidth allocation to such task groups. It uses cgroups to group - tasks. + tasks. In combination with BFS this is purely a STUB to create the + files associated with the CPU controller cgroup but most of the + controls do nothing. This is useful for working in environments and + with applications that will only work if this control group is + present. -if CGROUP_SCHED +if CGROUP_SCHED && !SCHED_BFS config FAIR_GROUP_SCHED bool "Group scheduling for SCHED_OTHER" depends on CGROUP_SCHED @@ -1130,6 +1151,7 @@ config CGROUP_DEVICE config CGROUP_CPUACCT bool "Simple CPU accounting controller" + depends on !SCHED_BFS help Provides a simple controller for monitoring the total CPU consumed by the tasks in a cgroup. @@ -1228,6 +1250,7 @@ endif # NAMESPACES config SCHED_AUTOGROUP bool "Automatic process group scheduling" + depends on !SCHED_BFS select CGROUPS select CGROUP_SCHED select FAIR_GROUP_SCHED diff --git a/init/main.c b/init/main.c index a8a58e2..76a2d14 100644 --- a/init/main.c +++ b/init/main.c @@ -792,7 +792,6 @@ int __init_or_module do_one_initcall(initcall_t fn) return ret; } - extern initcall_t __initcall_start[]; extern initcall_t __initcall0_start[]; extern initcall_t __initcall1_start[]; @@ -951,6 +950,8 @@ static int __ref kernel_init(void *unused) rcu_end_inkernel_boot(); + print_scheduler_version(); + if (ramdisk_execute_command) { ret = run_init_process(ramdisk_execute_command); if (!ret) diff --git a/kernel/Makefile b/kernel/Makefile index e2ec54e..dd84575 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,7 +9,7 @@ obj-y = fork.o exec_domain.o panic.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ - async.o range.o smpboot.o + async.o range.o smpboot.o skip_lists.o obj-$(CONFIG_MULTIUSER) += groups.o diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 435c14a..a80d56dc 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -104,7 +104,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) */ t1 = tsk->sched_info.pcount; t2 = tsk->sched_info.run_delay; - t3 = tsk->se.sum_exec_runtime; + t3 = tsk_seruntime(tsk); d->cpu_count += t1; diff --git a/kernel/exit.c b/kernel/exit.c index 091a78b..c22b37f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -134,7 +134,7 @@ static void __exit_signal(struct task_struct *tsk) sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); - sig->sum_sched_runtime += tsk->se.sum_exec_runtime; + sig->sum_sched_runtime += tsk_seruntime(tsk); sig->nr_threads--; __unhash_process(tsk, group_dead); write_sequnlock(&sig->stats_lock); diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 5e59b83..8c1204f 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -15,13 +15,18 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif +ifdef CONFIG_SCHED_BFS +obj-y += bfs.o clock.o +else obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o -obj-y += wait.o swait.o completion.o idle.o -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o +obj-$(CONFIG_SMP) += cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o -obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +endif +obj-y += wait.o swait.o completion.o idle.o +obj-$(CONFIG_SMP) += cpupri.o +obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 1141954..2ebd7b0 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -9,7 +9,11 @@ * published by the Free Software Foundation. */ +#ifdef CONFIG_SCHED_BFS +#include "bfs_sched.h" +#else #include "sched.h" +#endif DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index a84641b..eba226d 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -16,7 +16,11 @@ #include #include +#ifdef CONFIG_SCHED_BFS +#include "bfs_sched.h" +#else #include "sched.h" +#endif struct sugov_tunables { struct gov_attr_set attr_set; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 9fb873c..1e855dc 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -14,7 +14,11 @@ #include +#ifdef CONFIG_SCHED_BFS +#include "bfs_sched.h" +#else #include "sched.h" +#endif /** * sched_idle_set_state - Record idle state for the current CPU. diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 87e2c9f..7466a0b 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -4,7 +4,11 @@ #include #include +#ifndef CONFIG_SCHED_BFS #include "sched.h" +#else +#include "bfs_sched.h" +#endif /* * bump this up when changing the output format or the meaning of an existing diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 13bc43d..fc0d8270 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -122,12 +122,12 @@ static int smpboot_thread_fn(void *data) if (kthread_should_park()) { __set_current_state(TASK_RUNNING); - preempt_enable(); if (ht->park && td->status == HP_THREAD_ACTIVE) { BUG_ON(td->cpu != smp_processor_id()); ht->park(td->cpu); td->status = HP_THREAD_PARKED; } + preempt_enable(); kthread_parkme(); /* We might have been woken for stop */ continue; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a13bbda..ca8093e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -125,8 +125,13 @@ static int __maybe_unused one = 1; static int __maybe_unused two = 2; static int __maybe_unused four = 4; static unsigned long one_ul = 1; -static int one_hundred = 100; -static int one_thousand = 1000; +static int __read_mostly one_hundred = 100; +static int __read_mostly one_thousand = 1000; +#ifdef CONFIG_SCHED_BFS +extern int rr_interval; +extern int sched_interactive; +extern int sched_iso_cpu; +#endif #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif @@ -264,7 +269,7 @@ static struct ctl_table sysctl_base_table[] = { { } }; -#ifdef CONFIG_SCHED_DEBUG +#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_BFS) static int min_sched_granularity_ns = 100000; /* 100 usecs */ static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_wakeup_granularity_ns; /* 0 usecs */ @@ -281,6 +286,7 @@ static int max_extfrag_threshold = 1000; #endif static struct ctl_table kern_table[] = { +#ifndef CONFIG_SCHED_BFS { .procname = "sched_child_runs_first", .data = &sysctl_sched_child_runs_first, @@ -449,6 +455,7 @@ static struct ctl_table kern_table[] = { .extra1 = &one, }, #endif +#endif /* !CONFIG_SCHED_BFS */ #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", @@ -1013,6 +1020,35 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif +#ifdef CONFIG_SCHED_BFS + { + .procname = "rr_interval", + .data = &rr_interval, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &one_thousand, + }, + { + .procname = "interactive", + .data = &sched_interactive, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "iso_cpu", + .data = &sched_iso_cpu, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one_hundred, + }, +#endif #if defined(CONFIG_S390) && defined(CONFIG_SMP) { .procname = "spin_retry", diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 4008d9f..6931b6e 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -89,7 +89,7 @@ config NO_HZ_IDLE config NO_HZ_FULL bool "Full dynticks system (tickless)" # NO_HZ_COMMON dependency - depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS && !SCHED_BFS # We need at least one periodic CPU for timekeeping depends on SMP depends on HAVE_CONTEXT_TRACKING diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 39008d7..784f3a1 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -447,7 +447,7 @@ static void cleanup_timers(struct list_head *head) */ void posix_cpu_timers_exit(struct task_struct *tsk) { - add_device_randomness((const void*) &tsk->se.sum_exec_runtime, + add_device_randomness((const void*) &tsk_seruntime(tsk), sizeof(unsigned long long)); cleanup_timers(tsk->cpu_timers); @@ -848,7 +848,7 @@ static void check_thread_timers(struct task_struct *tsk, tsk_expires->virt_exp = expires_to_cputime(expires); tsk_expires->sched_exp = check_timers_list(++timers, firing, - tsk->se.sum_exec_runtime); + tsk_seruntime(tsk)); /* * Check for the special case thread timers. @@ -859,7 +859,7 @@ static void check_thread_timers(struct task_struct *tsk, READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); if (hard != RLIM_INFINITY && - tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { + tsk_rttimeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { /* * At the hard limit, we just die. * No need to calculate anything else now. @@ -867,7 +867,7 @@ static void check_thread_timers(struct task_struct *tsk, __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } - if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { + if (tsk_rttimeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { /* * At the soft limit, send a SIGXCPU every second. */ @@ -1115,7 +1115,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk) struct task_cputime task_sample; task_cputime(tsk, &task_sample.utime, &task_sample.stime); - task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime; + task_sample.sum_exec_runtime = tsk_seruntime(tsk); if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) return 1; } diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index b0f86ea..287cf72 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1039,10 +1039,15 @@ static int trace_wakeup_test_thread(void *data) { /* Make this a -deadline thread */ static const struct sched_attr attr = { +#ifdef CONFIG_SCHED_BFS + /* No deadline on BFS, use RR */ + .sched_policy = SCHED_RR, +#else .sched_policy = SCHED_DEADLINE, .sched_runtime = 100000ULL, .sched_deadline = 10000000ULL, .sched_period = 10000000ULL +#endif }; struct wakeup_test_data *x = data; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index cab7405..4f7faee 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1323,7 +1323,7 @@ config RCU_PERF_TEST config RCU_TORTURE_TEST tristate "torture tests for RCU" - depends on DEBUG_KERNEL + depends on DEBUG_KERNEL && !SCHED_BFS select TORTURE_TEST select SRCU select TASKS_RCU -- 2.7.4