From c206fa5c4e6f7d2c1ac63a9816cae23f9df04564 Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Thu, 20 Oct 2016 15:34:43 +1100
Subject: [PATCH 60/80] 4.8-sched-bfs-512.patch

---
 Documentation/sysctl/kernel.txt           | 26 +++++++++++
 arch/powerpc/platforms/cell/spufs/sched.c |  5 ---
 arch/x86/Kconfig                          | 22 +++++++--
 drivers/cpufreq/cpufreq_conservative.c    |  4 +-
 drivers/cpufreq/cpufreq_ondemand.c        |  4 +-
 fs/proc/base.c                            |  2 +-
 include/linux/init_task.h                 | 75 +++++++++++++++++++++++++++++--
 include/linux/ioprio.h                    |  2 +
 include/linux/jiffies.h                   |  2 +-
 include/linux/sched.h                     | 68 ++++++++++++++++++++++++----
 include/linux/sched/prio.h                | 12 +++++
 include/uapi/linux/sched.h                |  9 +++-
 init/Kconfig                              | 29 ++++++++++--
 init/main.c                               |  3 +-
 kernel/Makefile                           |  2 +-
 kernel/delayacct.c                        |  2 +-
 kernel/exit.c                             |  2 +-
 kernel/sched/Makefile                     | 11 +++--
 kernel/sched/cpufreq.c                    |  4 ++
 kernel/sched/cpufreq_schedutil.c          |  4 ++
 kernel/sched/idle.c                       |  4 ++
 kernel/sched/stats.c                      |  4 ++
 kernel/smpboot.c                          |  2 +-
 kernel/sysctl.c                           | 42 +++++++++++++++--
 kernel/time/Kconfig                       |  2 +-
 kernel/time/posix-cpu-timers.c            | 10 ++---
 kernel/trace/trace_selftest.c             |  5 +++
 lib/Kconfig.debug                         |  2 +-
 28 files changed, 312 insertions(+), 47 deletions(-)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index ffab8b5..f54ade2 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -39,6 +39,7 @@ show up in /proc/sys/kernel:
 - hung_task_timeout_secs
 - hung_task_warnings
 - kexec_load_disabled
+- iso_cpu
 - kptr_restrict
 - kstack_depth_to_print       [ X86 only ]
 - l2cr                        [ PPC only ]
@@ -73,6 +74,7 @@ show up in /proc/sys/kernel:
 - randomize_va_space
 - real-root-dev               ==> Documentation/initrd.txt
 - reboot-cmd                  [ SPARC only ]
+- rr_interval
 - rtsig-max
 - rtsig-nr
 - sem
@@ -402,6 +404,16 @@ kernel stack.
 
 ==============================================================
 
+iso_cpu: (BFS CPU scheduler only).
+
+This sets the percentage cpu that the unprivileged SCHED_ISO tasks can
+run effectively at realtime priority, averaged over a rolling five
+seconds over the -whole- system, meaning all cpus.
+
+Set to 70 (percent) by default.
+
+==============================================================
+
 l2cr: (PPC only)
 
 This flag controls the L2 cache of G3 processor boards. If
@@ -818,6 +830,20 @@ rebooting. ???
 
 ==============================================================
 
+rr_interval: (BFS CPU scheduler only)
+
+This is the smallest duration that any cpu process scheduling unit
+will run for. Increasing this value can increase throughput of cpu
+bound tasks substantially but at the expense of increased latencies
+overall. Conversely decreasing it will decrease average and maximum
+latencies but at the expense of throughput. This value is in
+milliseconds and the default value chosen depends on the number of
+cpus available at scheduler initialisation with a minimum of 6.
+
+Valid values are from 1-1000.
+
+==============================================================
+
 rtsig-max & rtsig-nr:
 
 The file rtsig-max can be used to tune the maximum number
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 460f5f3..eeb3e32 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -64,11 +64,6 @@ static struct timer_list spusched_timer;
 static struct timer_list spuloadavg_timer;
 
 /*
- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0).
- */
-#define NORMAL_PRIO		120
-
-/*
  * Frequency of the spu scheduler tick.  By default we do one SPU scheduler
  * tick for every 10 CPU scheduler ticks.
  */
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2a1f0ce..4d25df5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -914,10 +914,26 @@ config SCHED_SMT
 	depends on SMP
 	---help---
 	  SMT scheduler support improves the CPU scheduler's decision making
-	  when dealing with Intel Pentium 4 chips with HyperThreading at a
+	  when dealing with Intel P4/Core 2 chips with HyperThreading at a
 	  cost of slightly increased overhead in some places. If unsure say
 	  N here.
 
+config SMT_NICE
+	bool "SMT (Hyperthreading) aware nice priority and policy support"
+	depends on SCHED_BFS && SCHED_SMT
+	default y
+	---help---
+	  Enabling Hyperthreading on Intel CPUs decreases the effectiveness
+	  of the use of 'nice' levels and different scheduling policies
+	  (e.g. realtime) due to sharing of CPU power between hyperthreads.
+	  SMT nice support makes each logical CPU aware of what is running on
+	  its hyperthread siblings, maintaining appropriate distribution of
+	  CPU according to nice levels and scheduling policies at the expense
+	  of slightly increased overhead.
+
+	  If unsure say Y here.
+
+
 config SCHED_MC
 	def_bool y
 	prompt "Multi-core scheduler support"
@@ -2036,7 +2052,7 @@ config HOTPLUG_CPU
 config BOOTPARAM_HOTPLUG_CPU0
 	bool "Set default setting of cpu0_hotpluggable"
 	default n
-	depends on HOTPLUG_CPU
+	depends on HOTPLUG_CPU && !SCHED_BFS
 	---help---
 	  Set whether default state of cpu0_hotpluggable is on or off.
 
@@ -2065,7 +2081,7 @@ config BOOTPARAM_HOTPLUG_CPU0
 config DEBUG_HOTPLUG_CPU0
 	def_bool n
 	prompt "Debug CPU0 hotplug"
-	depends on HOTPLUG_CPU
+	depends on HOTPLUG_CPU && !SCHED_BFS
 	---help---
 	  Enabling this option offlines CPU0 (if CPU0 can be offlined) as
 	  soon as possible and boots up userspace with CPU0 offlined. User
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 18da4f8..460efa2 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -30,8 +30,8 @@ struct cs_dbs_tuners {
 };
 
 /* Conservative governor macros */
-#define DEF_FREQUENCY_UP_THRESHOLD		(80)
-#define DEF_FREQUENCY_DOWN_THRESHOLD		(20)
+#define DEF_FREQUENCY_UP_THRESHOLD		(63)
+#define DEF_FREQUENCY_DOWN_THRESHOLD		(26)
 #define DEF_FREQUENCY_STEP			(5)
 #define DEF_SAMPLING_DOWN_FACTOR		(1)
 #define MAX_SAMPLING_DOWN_FACTOR		(10)
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 3a1f49f..7b3187a 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -20,7 +20,7 @@
 #include "cpufreq_ondemand.h"
 
 /* On-demand governor macros */
-#define DEF_FREQUENCY_UP_THRESHOLD		(80)
+#define DEF_FREQUENCY_UP_THRESHOLD		(63)
 #define DEF_SAMPLING_DOWN_FACTOR		(1)
 #define MAX_SAMPLING_DOWN_FACTOR		(100000)
 #define MICRO_FREQUENCY_UP_THRESHOLD		(95)
@@ -129,7 +129,7 @@ static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq)
 }
 
 /*
- * Every sampling_rate, we check, if current idle time is less than 20%
+ * Every sampling_rate, we check, if current idle time is less than 37%
  * (default), then we try to increase frequency. Else, we adjust the frequency
  * proportional to load.
  */
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ac0df4d..0ac4c28 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -505,7 +505,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
 		seq_printf(m, "0 0 0\n");
 	else
 		seq_printf(m, "%llu %llu %lu\n",
-		   (unsigned long long)task->se.sum_exec_runtime,
+		   (unsigned long long)tsk_seruntime(task),
 		   (unsigned long long)task->sched_info.run_delay,
 		   task->sched_info.pcount);
 
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index f8834f8..2f40a51 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -157,8 +157,6 @@ extern struct task_group root_task_group;
 # define INIT_VTIME(tsk)
 #endif
 
-#define INIT_TASK_COMM "swapper"
-
 #ifdef CONFIG_RT_MUTEXES
 # define INIT_RT_MUTEXES(tsk)						\
 	.pi_waiters = RB_ROOT,						\
@@ -187,6 +185,77 @@ extern struct task_group root_task_group;
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
  */
+#ifdef CONFIG_SCHED_BFS
+#define INIT_TASK_COMM "BFS"
+#define INIT_TASK(tsk)	\
+{									\
+	.state		= 0,						\
+	.stack		= &init_thread_info,				\
+	.usage		= ATOMIC_INIT(2),				\
+	.flags		= PF_KTHREAD,					\
+	.prio		= NORMAL_PRIO,					\
+	.static_prio	= MAX_PRIO-20,					\
+	.normal_prio	= NORMAL_PRIO,					\
+	.deadline	= 0,						\
+	.policy		= SCHED_NORMAL,					\
+	.cpus_allowed	= CPU_MASK_ALL,					\
+	.mm		= NULL,						\
+	.active_mm	= &init_mm,					\
+	.restart_block = {						\
+		.fn = do_no_restart_syscall,				\
+	},								\
+	.time_slice	= 1000000,					\
+	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
+	INIT_PUSHABLE_TASKS(tsk)					\
+	.ptraced	= LIST_HEAD_INIT(tsk.ptraced),			\
+	.ptrace_entry	= LIST_HEAD_INIT(tsk.ptrace_entry),		\
+	.real_parent	= &tsk,						\
+	.parent		= &tsk,						\
+	.children	= LIST_HEAD_INIT(tsk.children),			\
+	.sibling	= LIST_HEAD_INIT(tsk.sibling),			\
+	.group_leader	= &tsk,						\
+	RCU_POINTER_INITIALIZER(real_cred, &init_cred),			\
+	RCU_POINTER_INITIALIZER(cred, &init_cred),			\
+	.comm		= INIT_TASK_COMM,				\
+	.thread		= INIT_THREAD,					\
+	.fs		= &init_fs,					\
+	.files		= &init_files,					\
+	.signal		= &init_signals,				\
+	.sighand	= &init_sighand,				\
+	.nsproxy	= &init_nsproxy,				\
+	.pending	= {						\
+		.list = LIST_HEAD_INIT(tsk.pending.list),		\
+		.signal = {{0}}},					\
+	.blocked	= {{0}},					\
+	.alloc_lock	= __SPIN_LOCK_UNLOCKED(tsk.alloc_lock),		\
+	.journal_info	= NULL,						\
+	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),		\
+	.pi_lock	= __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),		\
+	.timer_slack_ns = 50000, /* 50 usec default slack */		\
+	.pids = {							\
+		[PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),		\
+		[PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),		\
+		[PIDTYPE_SID]  = INIT_PID_LINK(PIDTYPE_SID),		\
+	},								\
+	.thread_group	= LIST_HEAD_INIT(tsk.thread_group),		\
+	.thread_node	= LIST_HEAD_INIT(init_signals.thread_head),	\
+	INIT_IDS							\
+	INIT_PERF_EVENTS(tsk)						\
+	INIT_TRACE_IRQFLAGS						\
+	INIT_LOCKDEP							\
+	INIT_FTRACE_GRAPH						\
+	INIT_TRACE_RECURSION						\
+	INIT_TASK_RCU_PREEMPT(tsk)					\
+	INIT_TASK_RCU_TASKS(tsk)					\
+	INIT_CPUSET_SEQ(tsk)						\
+	INIT_RT_MUTEXES(tsk)						\
+	INIT_PREV_CPUTIME(tsk)						\
+	INIT_VTIME(tsk)							\
+	INIT_NUMA_BALANCING(tsk)					\
+	INIT_KASAN(tsk)							\
+}
+#else /* CONFIG_SCHED_BFS */
+#define INIT_TASK_COMM "swapper"
 #define INIT_TASK(tsk)	\
 {									\
 	.state		= 0,						\
@@ -261,7 +330,7 @@ extern struct task_group root_task_group;
 	INIT_NUMA_BALANCING(tsk)					\
 	INIT_KASAN(tsk)							\
 }
-
+#endif /* CONFIG_SCHED_BFS */
 
 #define INIT_CPU_TIMERS(cpu_timers)					\
 {									\
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index beb9ce1..ce2fc3c 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -52,6 +52,8 @@ enum {
  */
 static inline int task_nice_ioprio(struct task_struct *task)
 {
+	if (iso_task(task))
+		return 0;
 	return (task_nice(task) + 20) / 5;
 }
 
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 5fdc553..6e54b8e 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -164,7 +164,7 @@ static inline u64 get_jiffies_64(void)
  * Have the 32 bit jiffies value wrap 5 minutes after boot
  * so jiffies wrap bugs show up earlier.
  */
-#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ))
+#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ))
 
 /*
  * Change timeval to jiffies, trying to avoid the
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 62c68e5..e57e2c1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -59,6 +59,7 @@ struct sched_param {
 #include <linux/gfp.h>
 #include <linux/magic.h>
 #include <linux/cgroup-defs.h>
+#include <linux/skip_lists.h>
 
 #include <asm/processor.h>
 
@@ -176,7 +177,7 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
 
 extern void calc_global_load(unsigned long ticks);
 
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_BFS)
 extern void cpu_load_update_nohz_start(void);
 extern void cpu_load_update_nohz_stop(void);
 #else
@@ -340,8 +341,6 @@ extern void init_idle_bootup_task(struct task_struct *idle);
 
 extern cpumask_var_t cpu_isolated_map;
 
-extern int runqueue_is_locked(int cpu);
-
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 extern void nohz_balance_enter_idle(int cpu);
 extern void set_cpu_sd_state_idle(void);
@@ -1464,9 +1463,11 @@ struct task_struct {
 	unsigned int flags;	/* per process flags, defined below */
 	unsigned int ptrace;
 
+#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_BFS)
+	int on_cpu;
+#endif
 #ifdef CONFIG_SMP
 	struct llist_node wake_entry;
-	int on_cpu;
 	unsigned int wakee_flips;
 	unsigned long wakee_flip_decay_ts;
 	struct task_struct *last_wakee;
@@ -1474,12 +1475,26 @@ struct task_struct {
 	int wake_cpu;
 #endif
 	int on_rq;
-
 	int prio, static_prio, normal_prio;
 	unsigned int rt_priority;
+#ifdef CONFIG_SCHED_BFS
+	int time_slice;
+	u64 deadline;
+	skiplist_node node; /* Skip list node */
+	u64 last_ran;
+	u64 sched_time; /* sched_clock time spent running */
+#ifdef CONFIG_SMT_NICE
+	int smt_bias; /* Policy/nice level bias across smt siblings */
+#endif
+#ifdef CONFIG_HOTPLUG_CPU
+	bool zerobound; /* Bound to CPU0 for hotplug */
+#endif
+	unsigned long rt_timeout;
+#else /* CONFIG_SCHED_BFS */
 	const struct sched_class *sched_class;
 	struct sched_entity se;
 	struct sched_rt_entity rt;
+#endif
 #ifdef CONFIG_CGROUP_SCHED
 	struct task_group *sched_task_group;
 #endif
@@ -1603,6 +1618,9 @@ struct task_struct {
 	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
 
 	cputime_t utime, stime, utimescaled, stimescaled;
+#ifdef CONFIG_SCHED_BFS
+	unsigned long utime_pc, stime_pc;
+#endif
 	cputime_t gtime;
 	struct prev_cputime prev_cputime;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
@@ -1939,6 +1957,40 @@ extern int arch_task_struct_size __read_mostly;
 # define arch_task_struct_size (sizeof(struct task_struct))
 #endif
 
+#ifdef CONFIG_SCHED_BFS
+#define tsk_seruntime(t)		((t)->sched_time)
+#define tsk_rttimeout(t)		((t)->rt_timeout)
+
+static inline void tsk_cpus_current(struct task_struct *p)
+{
+}
+
+void print_scheduler_version(void);
+
+static inline bool iso_task(struct task_struct *p)
+{
+	return (p->policy == SCHED_ISO);
+}
+#else /* CFS */
+#define tsk_seruntime(t)	((t)->se.sum_exec_runtime)
+#define tsk_rttimeout(t)	((t)->rt.timeout)
+
+static inline void tsk_cpus_current(struct task_struct *p)
+{
+	p->nr_cpus_allowed = current->nr_cpus_allowed;
+}
+
+static inline void print_scheduler_version(void)
+{
+	printk(KERN_INFO"CFS CPU scheduler.\n");
+}
+
+static inline bool iso_task(struct task_struct *p)
+{
+	return false;
+}
+#endif /* CONFIG_SCHED_BFS */
+
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
@@ -2392,7 +2444,7 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
 }
 #endif
 
-#ifdef CONFIG_NO_HZ_COMMON
+#if defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_BFS)
 void calc_load_enter_idle(void);
 void calc_load_exit_idle(void);
 #else
@@ -2493,7 +2545,7 @@ extern unsigned long long
 task_sched_runtime(struct task_struct *task);
 
 /* sched_exec is called by processes performing an exec */
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_BFS)
 extern void sched_exec(void);
 #else
 #define sched_exec()   {}
@@ -3381,7 +3433,7 @@ static inline unsigned int task_cpu(const struct task_struct *p)
 	return 0;
 }
 
-static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
+static inline void set_task_cpu(struct task_struct *p, int cpu)
 {
 }
 
diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
index d9cf5a5..7d5d0b8 100644
--- a/include/linux/sched/prio.h
+++ b/include/linux/sched/prio.h
@@ -19,8 +19,20 @@
  */
 
 #define MAX_USER_RT_PRIO	100
+
+#ifdef CONFIG_SCHED_BFS
+/* Note different MAX_RT_PRIO */
+#define MAX_RT_PRIO		(MAX_USER_RT_PRIO + 1)
+
+#define ISO_PRIO		(MAX_RT_PRIO)
+#define NORMAL_PRIO		(MAX_RT_PRIO + 1)
+#define IDLE_PRIO		(MAX_RT_PRIO + 2)
+#define PRIO_LIMIT		((IDLE_PRIO) + 1)
+#else /* CONFIG_SCHED_BFS */
 #define MAX_RT_PRIO		MAX_USER_RT_PRIO
 
+#endif /* CONFIG_SCHED_BFS */
+
 #define MAX_PRIO		(MAX_RT_PRIO + NICE_WIDTH)
 #define DEFAULT_PRIO		(MAX_RT_PRIO + NICE_WIDTH / 2)
 
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 5f0fe01..f170d61 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -36,9 +36,16 @@
 #define SCHED_FIFO		1
 #define SCHED_RR		2
 #define SCHED_BATCH		3
-/* SCHED_ISO: reserved but not implemented yet */
+/* SCHED_ISO: Implemented on BFS only */
 #define SCHED_IDLE		5
+#ifdef CONFIG_SCHED_BFS
+#define SCHED_ISO		4
+#define SCHED_IDLEPRIO		SCHED_IDLE
+#define SCHED_MAX		(SCHED_IDLEPRIO)
+#define SCHED_RANGE(policy)	((policy) <= SCHED_MAX)
+#else /* CONFIG_SCHED_BFS */
 #define SCHED_DEADLINE		6
+#endif /* CONFIG_SCHED_BFS */
 
 /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
 #define SCHED_RESET_ON_FORK     0x40000000
diff --git a/init/Kconfig b/init/Kconfig
index cac3f09..44efa80 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -28,6 +28,20 @@ config BUILDTIME_EXTABLE_SORT
 
 menu "General setup"
 
+config SCHED_BFS
+	bool "BFS cpu scheduler"
+	---help---
+	  The Brain Fuck CPU Scheduler for excellent interactivity and
+	  responsiveness on the desktop and solid scalability on normal
+          hardware and commodity servers. Not recommended for 4096 CPUs.
+
+	  Currently incompatible with the Group CPU scheduler, and RCU TORTURE
+          TEST so these options are disabled.
+
+          Say Y here.
+	default y
+
+
 config BROKEN
 	bool
 
@@ -338,7 +352,7 @@ choice
 # Kind of a stub config for the pure tick based cputime accounting
 config TICK_CPU_ACCOUNTING
 	bool "Simple tick based cputime accounting"
-	depends on !S390 && !NO_HZ_FULL
+	depends on !S390 && !NO_HZ_FULL && !SCHED_BFS
 	help
 	  This is the basic tick based cputime accounting that maintains
 	  statistics about user, system and idle time spent on per jiffies
@@ -363,6 +377,7 @@ config VIRT_CPU_ACCOUNTING_GEN
 	bool "Full dynticks CPU time accounting"
 	depends on HAVE_CONTEXT_TRACKING
 	depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
+	depends on !SCHED_BFS
 	select VIRT_CPU_ACCOUNTING
 	select CONTEXT_TRACKING
 	help
@@ -698,6 +713,7 @@ config RCU_NOCB_CPU
 	bool "Offload RCU callback processing from boot-selected CPUs"
 	depends on TREE_RCU || PREEMPT_RCU
 	depends on RCU_EXPERT || NO_HZ_FULL
+	depends on !SCHED_BFS
 	default n
 	help
 	  Use this option to reduce OS jitter for aggressive HPC or
@@ -930,6 +946,7 @@ config NUMA_BALANCING
 	depends on ARCH_SUPPORTS_NUMA_BALANCING
 	depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY
 	depends on SMP && NUMA && MIGRATION
+	depends on !SCHED_BFS
 	help
 	  This option adds support for automatic NUMA aware memory/task placement.
 	  The mechanism is quite primitive and is based on migrating memory when
@@ -1032,9 +1049,13 @@ menuconfig CGROUP_SCHED
 	help
 	  This feature lets CPU scheduler recognize task groups and control CPU
 	  bandwidth allocation to such task groups. It uses cgroups to group
-	  tasks.
+	  tasks. In combination with BFS this is purely a STUB to create the
+	  files associated with the CPU controller cgroup but most of the
+	  controls do nothing. This is useful for working in environments and
+	  with applications that will only work if this control group is
+	  present.
 
-if CGROUP_SCHED
+if CGROUP_SCHED && !SCHED_BFS
 config FAIR_GROUP_SCHED
 	bool "Group scheduling for SCHED_OTHER"
 	depends on CGROUP_SCHED
@@ -1130,6 +1151,7 @@ config CGROUP_DEVICE
 
 config CGROUP_CPUACCT
 	bool "Simple CPU accounting controller"
+	depends on !SCHED_BFS
 	help
 	  Provides a simple controller for monitoring the
 	  total CPU consumed by the tasks in a cgroup.
@@ -1228,6 +1250,7 @@ endif # NAMESPACES
 
 config SCHED_AUTOGROUP
 	bool "Automatic process group scheduling"
+	depends on !SCHED_BFS
 	select CGROUPS
 	select CGROUP_SCHED
 	select FAIR_GROUP_SCHED
diff --git a/init/main.c b/init/main.c
index a8a58e2..76a2d14 100644
--- a/init/main.c
+++ b/init/main.c
@@ -792,7 +792,6 @@ int __init_or_module do_one_initcall(initcall_t fn)
 	return ret;
 }
 
-
 extern initcall_t __initcall_start[];
 extern initcall_t __initcall0_start[];
 extern initcall_t __initcall1_start[];
@@ -951,6 +950,8 @@ static int __ref kernel_init(void *unused)
 
 	rcu_end_inkernel_boot();
 
+	print_scheduler_version();
+
 	if (ramdisk_execute_command) {
 		ret = run_init_process(ramdisk_execute_command);
 		if (!ret)
diff --git a/kernel/Makefile b/kernel/Makefile
index e2ec54e..dd84575 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y     = fork.o exec_domain.o panic.o \
 	    extable.o params.o \
 	    kthread.o sys_ni.o nsproxy.o \
 	    notifier.o ksysfs.o cred.o reboot.o \
-	    async.o range.o smpboot.o
+	    async.o range.o smpboot.o skip_lists.o
 
 obj-$(CONFIG_MULTIUSER) += groups.o
 
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 435c14a..a80d56dc 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -104,7 +104,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	 */
 	t1 = tsk->sched_info.pcount;
 	t2 = tsk->sched_info.run_delay;
-	t3 = tsk->se.sum_exec_runtime;
+	t3 = tsk_seruntime(tsk);
 
 	d->cpu_count += t1;
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 091a78b..c22b37f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -134,7 +134,7 @@ static void __exit_signal(struct task_struct *tsk)
 	sig->inblock += task_io_get_inblock(tsk);
 	sig->oublock += task_io_get_oublock(tsk);
 	task_io_accounting_add(&sig->ioac, &tsk->ioac);
-	sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
+	sig->sum_sched_runtime += tsk_seruntime(tsk);
 	sig->nr_threads--;
 	__unhash_process(tsk, group_dead);
 	write_sequnlock(&sig->stats_lock);
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 5e59b83..8c1204f 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -15,13 +15,18 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
+ifdef CONFIG_SCHED_BFS
+obj-y += bfs.o clock.o
+else
 obj-y += core.o loadavg.o clock.o cputime.o
 obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
-obj-y += wait.o swait.o completion.o idle.o
-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
+obj-$(CONFIG_SMP) += cpudeadline.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
-obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+endif
+obj-y += wait.o swait.o completion.o idle.o
+obj-$(CONFIG_SMP) += cpupri.o
+obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_CPU_FREQ) += cpufreq.o
 obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 1141954..2ebd7b0 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -9,7 +9,11 @@
  * published by the Free Software Foundation.
  */
 
+#ifdef CONFIG_SCHED_BFS
+#include "bfs_sched.h"
+#else
 #include "sched.h"
+#endif
 
 DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
 
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index a84641b..eba226d 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -16,7 +16,11 @@
 #include <linux/slab.h>
 #include <trace/events/power.h>
 
+#ifdef CONFIG_SCHED_BFS
+#include "bfs_sched.h"
+#else
 #include "sched.h"
+#endif
 
 struct sugov_tunables {
 	struct gov_attr_set attr_set;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 9fb873c..1e855dc 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -14,7 +14,11 @@
 
 #include <trace/events/power.h>
 
+#ifdef CONFIG_SCHED_BFS
+#include "bfs_sched.h"
+#else
 #include "sched.h"
+#endif
 
 /**
  * sched_idle_set_state - Record idle state for the current CPU.
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 87e2c9f..7466a0b 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -4,7 +4,11 @@
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
 
+#ifndef CONFIG_SCHED_BFS
 #include "sched.h"
+#else
+#include "bfs_sched.h"
+#endif
 
 /*
  * bump this up when changing the output format or the meaning of an existing
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 13bc43d..fc0d8270 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -122,12 +122,12 @@ static int smpboot_thread_fn(void *data)
 
 		if (kthread_should_park()) {
 			__set_current_state(TASK_RUNNING);
-			preempt_enable();
 			if (ht->park && td->status == HP_THREAD_ACTIVE) {
 				BUG_ON(td->cpu != smp_processor_id());
 				ht->park(td->cpu);
 				td->status = HP_THREAD_PARKED;
 			}
+			preempt_enable();
 			kthread_parkme();
 			/* We might have been woken for stop */
 			continue;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index a13bbda..ca8093e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -125,8 +125,13 @@ static int __maybe_unused one = 1;
 static int __maybe_unused two = 2;
 static int __maybe_unused four = 4;
 static unsigned long one_ul = 1;
-static int one_hundred = 100;
-static int one_thousand = 1000;
+static int __read_mostly one_hundred = 100;
+static int __read_mostly one_thousand = 1000;
+#ifdef CONFIG_SCHED_BFS
+extern int rr_interval;
+extern int sched_interactive;
+extern int sched_iso_cpu;
+#endif
 #ifdef CONFIG_PRINTK
 static int ten_thousand = 10000;
 #endif
@@ -264,7 +269,7 @@ static struct ctl_table sysctl_base_table[] = {
 	{ }
 };
 
-#ifdef CONFIG_SCHED_DEBUG
+#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_BFS)
 static int min_sched_granularity_ns = 100000;		/* 100 usecs */
 static int max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 static int min_wakeup_granularity_ns;			/* 0 usecs */
@@ -281,6 +286,7 @@ static int max_extfrag_threshold = 1000;
 #endif
 
 static struct ctl_table kern_table[] = {
+#ifndef CONFIG_SCHED_BFS
 	{
 		.procname	= "sched_child_runs_first",
 		.data		= &sysctl_sched_child_runs_first,
@@ -449,6 +455,7 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &one,
 	},
 #endif
+#endif /* !CONFIG_SCHED_BFS */
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname	= "prove_locking",
@@ -1013,6 +1020,35 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
+#ifdef CONFIG_SCHED_BFS
+	{
+		.procname	= "rr_interval",
+		.data		= &rr_interval,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &one,
+		.extra2		= &one_thousand,
+	},
+	{
+		.procname	= "interactive",
+		.data		= &sched_interactive,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+	{
+		.procname	= "iso_cpu",
+		.data		= &sched_iso_cpu,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
+#endif
 #if defined(CONFIG_S390) && defined(CONFIG_SMP)
 	{
 		.procname	= "spin_retry",
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 4008d9f..6931b6e 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -89,7 +89,7 @@ config NO_HZ_IDLE
 config NO_HZ_FULL
 	bool "Full dynticks system (tickless)"
 	# NO_HZ_COMMON dependency
-	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
+	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS && !SCHED_BFS
 	# We need at least one periodic CPU for timekeeping
 	depends on SMP
 	depends on HAVE_CONTEXT_TRACKING
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 39008d7..784f3a1 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -447,7 +447,7 @@ static void cleanup_timers(struct list_head *head)
  */
 void posix_cpu_timers_exit(struct task_struct *tsk)
 {
-	add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
+	add_device_randomness((const void*) &tsk_seruntime(tsk),
 						sizeof(unsigned long long));
 	cleanup_timers(tsk->cpu_timers);
 
@@ -848,7 +848,7 @@ static void check_thread_timers(struct task_struct *tsk,
 	tsk_expires->virt_exp = expires_to_cputime(expires);
 
 	tsk_expires->sched_exp = check_timers_list(++timers, firing,
-						   tsk->se.sum_exec_runtime);
+						   tsk_seruntime(tsk));
 
 	/*
 	 * Check for the special case thread timers.
@@ -859,7 +859,7 @@ static void check_thread_timers(struct task_struct *tsk,
 			READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
 
 		if (hard != RLIM_INFINITY &&
-		    tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
+		    tsk_rttimeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
 			/*
 			 * At the hard limit, we just die.
 			 * No need to calculate anything else now.
@@ -867,7 +867,7 @@ static void check_thread_timers(struct task_struct *tsk,
 			__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
 			return;
 		}
-		if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
+		if (tsk_rttimeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
 			/*
 			 * At the soft limit, send a SIGXCPU every second.
 			 */
@@ -1115,7 +1115,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 		struct task_cputime task_sample;
 
 		task_cputime(tsk, &task_sample.utime, &task_sample.stime);
-		task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime;
+		task_sample.sum_exec_runtime = tsk_seruntime(tsk);
 		if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
 			return 1;
 	}
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index b0f86ea..287cf72 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1039,10 +1039,15 @@ static int trace_wakeup_test_thread(void *data)
 {
 	/* Make this a -deadline thread */
 	static const struct sched_attr attr = {
+#ifdef CONFIG_SCHED_BFS
+		/* No deadline on BFS, use RR */
+		.sched_policy = SCHED_RR,
+#else
 		.sched_policy = SCHED_DEADLINE,
 		.sched_runtime = 100000ULL,
 		.sched_deadline = 10000000ULL,
 		.sched_period = 10000000ULL
+#endif
 	};
 	struct wakeup_test_data *x = data;
 
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index cab7405..4f7faee 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1323,7 +1323,7 @@ config RCU_PERF_TEST
 
 config RCU_TORTURE_TEST
 	tristate "torture tests for RCU"
-	depends on DEBUG_KERNEL
+	depends on DEBUG_KERNEL && !SCHED_BFS
 	select TORTURE_TEST
 	select SRCU
 	select TASKS_RCU
-- 
2.7.4