Implement no-idle-hz aka dynticks arch independent infrastructure.

Original code by Tony Lindgren <tony@atomide.com> and
Tuukka Tikkanen <tuukka.tikkanen@elektrobit.com>.

Rewritten and updated by Con Kolivas <kernel@kolivas.org>

Create the function timer_dyn_reprogram which should be called from the idle
loop. This function determines the next timer interrupt due for the current
cpu and if it is more than a minimum specified by the set_dyntick_limits
function it will call arch_reprogram for that cpu, passing the ticks to skip.
If the above function detects that all cpus are idle it calls
arch_all_cpus_idle and passes the ticks that all cpus are expecting to skip.

Signed-off-by: Con Kolivas <kernel@kolivas.org>

 include/linux/dyntick.h |  158 +++++++++++++++++++++++
 include/linux/timer.h   |    1 
 kernel/Makefile         |    1 
 kernel/dyntick.c        |  322 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/timer.c          |    7 +
 5 files changed, 489 insertions(+)

Index: linux-2.6.16-rc5-dt/include/linux/timer.h
===================================================================
--- linux-2.6.16-rc5-dt.orig/include/linux/timer.h	2006-02-27 16:40:27.000000000 +1100
+++ linux-2.6.16-rc5-dt/include/linux/timer.h	2006-02-27 20:30:47.000000000 +1100
@@ -97,5 +97,6 @@ static inline void add_timer(struct time
 extern void init_timers(void);
 extern void run_local_timers(void);
 extern int it_real_fn(void *);
+extern void conditional_run_local_timers(void);
 
 #endif
Index: linux-2.6.16-rc5-dt/kernel/Makefile
===================================================================
--- linux-2.6.16-rc5-dt.orig/kernel/Makefile	2006-02-27 16:40:27.000000000 +1100
+++ linux-2.6.16-rc5-dt/kernel/Makefile	2006-02-27 20:30:47.000000000 +1100
@@ -34,6 +34,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softl
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_NO_IDLE_HZ) += dyntick.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
Index: linux-2.6.16-rc5-dt/kernel/timer.c
===================================================================
--- linux-2.6.16-rc5-dt.orig/kernel/timer.c	2006-02-27 16:40:27.000000000 +1100
+++ linux-2.6.16-rc5-dt/kernel/timer.c	2006-02-27 20:30:47.000000000 +1100
@@ -900,6 +900,13 @@ void run_local_timers(void)
 	raise_softirq(TIMER_SOFTIRQ);
 }
 
+/* Call run_local_timers() only if this cpu's timer_jiffies != jiffies. */
+void conditional_run_local_timers(void)
+{
+	if (__get_cpu_var(tvec_bases).timer_jiffies != jiffies)
+		run_local_timers();
+}
+
 /*
  * Called by the timer interrupt. xtime_lock must already be taken
  * by the timer IRQ!
Index: linux-2.6.16-rc5-dt/include/linux/dyntick.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.16-rc5-dt/include/linux/dyntick.h	2006-02-27 20:30:47.000000000 +1100
@@ -0,0 +1,158 @@
+/*
+ * linux/include/linux/dyntick.h
+ *
+ * Copyright (C) 2004 Nokia Corporation
+ * Written by Tony Lindgren <tony@atomide.com> and
+ * Tuukka Tikkanen <tuukka.tikkanen@elektrobit.com>
+ * Rewritten by Con Kolivas <kernel@kolivas.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _dyntick_TIMER_H
+#define _dyntick_TIMER_H
+
+#include <linux/interrupt.h>
+
+#define dyntick_SKIPPING	(1 << 2)
+#define dyntick_ENABLED		(1 << 1)
+#define dyntick_SUITABLE	(1 << 0)
+
+/* Don't skip longer than NMI  */
+#define dyntick_MAX_SKIP	(HZ * 5)
+
+struct dyntick_timer {
+	spinlock_t lock;
+
+	/* dyntick init */
+	int (*arch_init)(void);
+	/* Enables dynamic tick */
+	int (*arch_enable)(void);
+	/* Disables dynamic tick */
+	int (*arch_disable)(void);
+	/* Reprograms the timer */
+	void (*arch_reprogram)(unsigned long);
+	/* Function called when all cpus are idle, passing the idle duration */
+	void (*arch_all_cpus_idle)(unsigned int);
+
+	unsigned short state;		/* Current state */
+	unsigned short min_skip;	/* Min number of ticks to skip */
+	unsigned int max_skip;		/* Max number of ticks to skip */
+	unsigned long tick;		/* The next earliest tick */
+};
+
+typedef struct {
+	unsigned long next_tick;	/* Next tick we're skipping to */
+	unsigned long skip;		/* Ticks we're currently skipping */
+	unsigned int nohz_cpu;		/* This cpu is idle */
+} dyntick_data;
+
+extern struct dyntick_timer *dyntick;
+extern spinlock_t *dyntick_lock;
+
+extern void dyntick_register(struct dyntick_timer *new_timer);
+
+#ifdef CONFIG_NO_IDLE_HZ
+/* Per-cpu dyntick state: only reachable through the per-cpu accessors */
+DECLARE_PER_CPU(dyntick_data, dyn_cpu);
+extern int dyntick_enabled(void);
+extern int dyntick_skipping(void);
+extern int dyntick_allcpus_skipping(void);
+extern int dyntick_current_skip(void);
+extern unsigned long dyntick_next_tick(void);
+extern void timer_dyn_reprogram(void);
+extern void dyn_early_reprogram(unsigned int delta);
+extern void set_dyntick_limits(unsigned int max_skip, unsigned int min_skip);
+
+/*
+ * The apparently redundant per_cpu nohz_cpu value is tested in this
+ * function and this is where we can avoid the cache thrashing of testing
+ * nohz_cpu_mask when possible. Preempt must be disabled already.
+ */
+static inline int test_nohz_cpu(void)
+{
+	return __get_cpu_var(dyn_cpu).nohz_cpu;
+}
+
+/*
+ * This cpu is busy, clear the nohz_cpu value, test to see if all were idle
+ * till now. Returns whether all cpus were idle or not. Preempt should be
+ * disabled.
+ */
+static inline int clear_nohz_cpu(int cpu)
+{
+	int ret = 0;
+
+	if (!test_nohz_cpu())
+		return ret;
+	dyntick->tick = 0;
+	if (cpus_equal(nohz_cpu_mask, cpu_online_map)) {
+		dyntick->state &= ~dyntick_SKIPPING;
+		ret = 1;
+	}
+	__get_cpu_var(dyn_cpu).next_tick = 0;
+	__get_cpu_var(dyn_cpu).nohz_cpu = 0;
+	cpu_clear(cpu, nohz_cpu_mask);
+	return ret;
+}
+
+/*
+ * This cpu has fallen idle: adopt its next_tick as dyntick->tick if that is
+ * stale or later, set the nohz_cpu value and, if all online cpus are now
+ * idle, call dyntick->arch_all_cpus_idle(). Preempt should be disabled.
+ */
+static inline void set_nohz_cpu(int cpu)
+{
+	if (dyntick->tick <= jiffies ||
+		__get_cpu_var(dyn_cpu).next_tick < dyntick->tick)
+			dyntick->tick = __get_cpu_var(dyn_cpu).next_tick;
+
+	if (!test_nohz_cpu()) {
+		__get_cpu_var(dyn_cpu).nohz_cpu = 1;
+		cpu_set(cpu, nohz_cpu_mask);
+	}
+	if (cpus_equal(nohz_cpu_mask, cpu_online_map)) {
+		dyntick->state |= dyntick_SKIPPING;
+		dyntick->arch_all_cpus_idle(dyntick->tick - jiffies);
+	}
+}
+
+#else	/* CONFIG_NO_IDLE_HZ */
+static inline int dyntick_enabled(void)
+{
+	return 0;
+}
+
+static inline int dyntick_skipping(void)
+{
+	return 0;
+}
+
+static inline int dyntick_allcpus_skipping(void)
+{
+	return 0;
+}
+
+static inline int dyntick_current_skip(void)
+{
+	return 0;
+}
+
+static inline unsigned long dyntick_next_tick(void)
+{
+	return 0;
+}
+
+static inline void set_dyntick_limits(unsigned int max_skip,
+	unsigned int min_skip)
+{
+}
+
+static inline void dyn_early_reprogram(unsigned int delta)
+{
+}
+#endif	/* CONFIG_NO_IDLE_HZ */
+
+#endif	/* _dyntick_TIMER_H */
Index: linux-2.6.16-rc5-dt/kernel/dyntick.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.16-rc5-dt/kernel/dyntick.c	2006-02-27 20:30:47.000000000 +1100
@@ -0,0 +1,322 @@
+/*
+ * linux/kernel/dyntick.c
+ *
+ * Generic dynamic tick timer support
+ *
+ * Copyright (C) 2004 Nokia Corporation
+ * Written by Tony Lindgren <tony@atomide.com> and
+ * Tuukka Tikkanen <tuukka.tikkanen@elektrobit.com>
+ * Rewritten by Con Kolivas <kernel@kolivas.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/version.h>
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/sysdev.h>
+#include <linux/interrupt.h>
+#include <linux/cpumask.h>
+#include <linux/pm.h>
+#include <linux/dyntick.h>
+#include <linux/rcupdate.h>
+#include <asm/dyntick.h>
+
+#define dyntick_VERSION	"060227"
+
+DEFINE_PER_CPU(dyntick_data, dyn_cpu);
+
+inline int dyntick_enabled(void)
+{
+	return !!(dyntick->state & dyntick_ENABLED);
+}
+
+EXPORT_SYMBOL(dyntick_enabled);
+
+
+/*
+ * True while this cpu's next_tick is still after jiffies, i.e. it is
+ * skipping ticks. Cheaper than locking dyntick->lock to check dyntick->state.
+ */
+static int __dyntick_skipping(void)
+{
+	return time_after(__get_cpu_var(dyn_cpu).next_tick, jiffies);
+}
+
+int dyntick_skipping(void)
+{
+	int ret;
+
+	preempt_disable();
+	ret = __dyntick_skipping();
+	preempt_enable_no_resched();
+	return ret;
+}
+
+EXPORT_SYMBOL(dyntick_skipping);
+
+inline int dyntick_allcpus_skipping(void)
+{
+	return !!(dyntick->state & dyntick_SKIPPING);
+}
+
+EXPORT_SYMBOL(dyntick_allcpus_skipping);
+
+/*
+ * Number of ticks this cpu is currently skipping, or 0 when it is not
+ * skipping at all. Preemption is disabled around the per-cpu access.
+ */
+int dyntick_current_skip(void)
+{
+	int ticks;
+
+	preempt_disable();
+	ticks = __dyntick_skipping() ? __get_cpu_var(dyn_cpu).skip : 0;
+	preempt_enable_no_resched();
+	return ticks;
+}
+
+EXPORT_SYMBOL(dyntick_current_skip);
+
+/*
+ * Tick the current cpu is skipping to, or 0 when it is not skipping.
+ * Preemption is disabled while the per-cpu data is read.
+ */
+unsigned long dyntick_next_tick(void)
+{
+	unsigned long tick;
+
+	preempt_disable();
+	tick = __dyntick_skipping() ? __get_cpu_var(dyn_cpu).next_tick : 0;
+	preempt_enable_no_resched();
+	return tick;
+}
+
+EXPORT_SYMBOL(dyntick_next_tick);
+
+static inline int dyntick_suitable(void)
+{
+	return !!(dyntick->state & dyntick_SUITABLE);
+}
+
+/*
+ * Record next_tick/skip for this cpu and reprogram the timer. Callers hold
+ * xtime_lock and dyntick->lock, irqs off, with delta already validated
+ * (suitable, jiffies + delta does not overflow). NOTE(review): we pass the
+ * absolute tick to arch_reprogram(), not a tick count - confirm arch wants it.
+ */
+static void do_dyn_reprogram(long delta)
+{
+	unsigned long next = jiffies + delta;
+
+	__get_cpu_var(dyn_cpu).next_tick = next;
+	__get_cpu_var(dyn_cpu).skip = delta;
+	dyntick->arch_reprogram(next);
+	/* We have to update the idle_timestamp */
+	set_irq_idle_timestamp(next);
+}
+
+/*
+ * Arch independent code needed to reprogram next timer interrupt.
+ * Gets called, with IRQs disabled, from cpu_idle() before entering idle loop
+ * If we are to acquire the xtime_lock we must acquire it before
+ * dyntick->lock
+ */
+void timer_dyn_reprogram(void)
+{
+	int cpu;
+	long delta;
+
+	if (!dyntick_enabled())
+		return;
+
+	cpu = smp_processor_id();
+	if (rcu_pending(cpu) || local_softirq_pending()) {
+		spin_lock(&dyntick->lock);
+		clear_nohz_cpu(cpu);
+		spin_unlock(&dyntick->lock);
+		return;
+	}
+
+	write_seqlock(&xtime_lock);
+	delta = next_timer_interrupt() - jiffies;
+	/*
+	 * If delta < 0 we have jiffy wrap so it would be a bad time to
+	 * reprogram without overflowing variables elsewhere. The check for
+	 * > min_skip also effectively prevents this.
+	 */
+	if (delta > dyntick->min_skip) {
+		if (delta > dyntick->max_skip)
+			delta = dyntick->max_skip;
+		spin_lock(&dyntick->lock);
+		do_dyn_reprogram(delta);
+		set_nohz_cpu(cpu);
+		spin_unlock(&dyntick->lock);
+	}
+	write_sequnlock(&xtime_lock);
+}
+
+/*
+ * dyn_early_reprogram allows other code such as the acpi idle code to
+ * program an earlier tick than the one already chosen by timer_dyn_reprogram.
+ * It only reprograms it if the tick is earlier than the next one planned.
+ * Other code ensures we won't be skipping over jiffy wrap so it doesn't
+ * matter if jiffies + delta overflows as it will never be within the skipping
+ * period.
+ */
+void dyn_early_reprogram(unsigned int delta)
+{
+	unsigned long flags, tick = jiffies + delta;
+
+	preempt_disable();
+	if (!__dyntick_skipping() || !time_after(tick, jiffies) ||
+		time_after_eq(tick, __get_cpu_var(dyn_cpu).next_tick))
+			goto put_out;
+
+	write_seqlock_irqsave(&xtime_lock, flags);
+	spin_lock(&dyntick->lock);
+	do_dyn_reprogram(delta);
+	spin_unlock(&dyntick->lock);
+	write_sequnlock_irqrestore(&xtime_lock, flags);
+put_out:
+	preempt_enable_no_resched();
+}
+
+EXPORT_SYMBOL(dyn_early_reprogram);
+
+/*
+ * Set limits on minimum and maximum number of ticks to skip. The minimum
+ * may want to be set by other code but is at least one tick.
+ */
+void set_dyntick_limits(unsigned int max_skip, unsigned int min_skip)
+{
+	if (max_skip > dyntick_MAX_SKIP)
+		max_skip = dyntick_MAX_SKIP;
+	if (!dyntick->max_skip || max_skip < dyntick->max_skip)
+		dyntick->max_skip = max_skip;
+	if (min_skip < 1)
+		min_skip = 1;
+	if (min_skip > dyntick->min_skip)
+		dyntick->min_skip = min_skip;
+}
+
+void __init dyntick_register(struct dyntick_timer *arch_timer)
+{
+	dyntick = arch_timer;
+}
+
+/* Default to enabled */
+static int __initdata dyntick_autoenable = 1;
+
+/*
+ * Command line option dyntick=[enable|disable]. The whole keyword must
+ * match (anything after it is ignored). Returns 1 so the option counts as
+ * handled and is not passed on to init.
+ */
+static int __init dyntick_setup(char *options)
+{
+	if (!options)
+		return 0;
+
+	if (!strncmp(options, "enable", 6))
+		dyntick_autoenable = 1;
+	if (!strncmp(options, "disable", 7))
+		dyntick_autoenable = 0;
+
+	return 1;
+}
+
+__setup("dyntick=", dyntick_setup);
+
+/*
+ * Sysfs interface, usually situated at
+ *  /sys/devices/system/timer/timer0/dyntick
+ * Reads give "1" when dyntick is enabled, else "0" (also when no timer
+ * was ever registered, so the NULL dyntick pointer is never followed).
+ */
+extern struct sys_device device_timer;
+
+static ssize_t timer_show_dyntick(struct sys_device *dev, char *buf)
+{
+	return sprintf(buf, "%i\n", dyntick && dyntick_enabled());
+}
+
+/* sysfs store: non-zero (parsed base 2) enables, zero disables dyntick. */
+static ssize_t timer_set_dyntick(struct sys_device *dev, const char *buf,
+				  size_t count)
+{
+	unsigned long flags;
+	unsigned int enable = simple_strtoul(buf, NULL, 2);
+	int ret;
+
+	/* dyntick_late_init() tolerates no registered timer; so must we */
+	if (dyntick == NULL)
+		return -ENODEV;
+
+	write_seqlock_irqsave(&xtime_lock, flags);
+	ret = enable ? dyntick->arch_enable() : dyntick->arch_disable();
+	if (ret == 0) {
+		spin_lock(&dyntick->lock);
+		if (enable)
+			dyntick->state |= dyntick_ENABLED;
+		else
+			dyntick->state &= ~dyntick_ENABLED;
+		spin_unlock(&dyntick->lock);
+		printk(KERN_INFO "dyntick: %s dynamic tick timer \n",
+			enable ? "Enabling" : "Disabling");
+	}
+	write_sequnlock_irqrestore(&xtime_lock, flags);
+
+	/* Report an arch failure to the writer instead of claiming success */
+	if (ret)
+		return ret < 0 ? ret : -ENODEV;
+	return count;
+}
+
+static SYSDEV_ATTR(dyntick, 0644, timer_show_dyntick, timer_set_dyntick);
+
+static int __init init_dyntick_sysfs(void)
+{
+	int ret = sysdev_create_file(&device_timer, &attr_dyntick);
+
+	return ret;
+}
+
+device_initcall(init_dyntick_sysfs);
+
+/*
+ * Init functions
+ *
+ * We need to initialise dynamic tick after calibrate delay
+ */
+static int __init dyntick_late_init(void)
+{
+	if (!dyntick || !dyntick->arch_init || !dyntick_suitable()) {
+		printk(KERN_ERR "dyntick: No suitable timer found\n");
+		return -ENODEV;
+	}
+
+	/* Any arch_init failure is reported to the caller as -ENODEV */
+	if (dyntick->arch_init() != 0) {
+		printk(KERN_ERR "dyntick: Init failed\n");
+		return -ENODEV;
+	}
+
+	if (!dyntick_autoenable) {
+		printk(KERN_INFO "dyntick: Dynamic tick timer v%s disabled\n",
+			dyntick_VERSION);
+		return 0;
+	}
+
+	/* Auto-enable requested (the default): mark the timer enabled */
+	dyntick->state |= dyntick_ENABLED;
+	printk(KERN_INFO "dyntick: Enabling dynamic tick timer v%s\n",
+		dyntick_VERSION);
+	return 0;
+}
+
+late_initcall(dyntick_late_init);
