---
 include/linux/swap-prefetch.h |    3 +
 kernel/sysctl.c               |   16 +++++++
 mm/swap_prefetch.c            |   88 +++++++++++++++++++++++-------------------
 3 files changed, 68 insertions(+), 39 deletions(-)

Index: linux-2.6.21-ck3/include/linux/swap-prefetch.h
===================================================================
--- linux-2.6.21-ck3.orig/include/linux/swap-prefetch.h	2007-05-25 16:22:02.000000000 +1000
+++ linux-2.6.21-ck3/include/linux/swap-prefetch.h	2007-05-25 16:27:06.000000000 +1000
@@ -4,6 +4,9 @@
 #ifdef CONFIG_SWAP_PREFETCH
 /* mm/swap_prefetch.c */
 extern int swap_prefetch;
+extern int swap_prefetch_delay;
+extern int swap_prefetch_sleep;
+
 struct swapped_entry {
 	swp_entry_t		swp_entry;	/* The actual swap entry */
 	struct list_head	swapped_list;	/* Linked list of entries */
Index: linux-2.6.21-ck3/kernel/sysctl.c
===================================================================
--- linux-2.6.21-ck3.orig/kernel/sysctl.c	2007-05-25 16:22:08.000000000 +1000
+++ linux-2.6.21-ck3/kernel/sysctl.c	2007-05-25 16:28:36.000000000 +1000
@@ -933,6 +933,22 @@
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "swap_prefetch_delay",
+		.data		= &swap_prefetch_delay,
+		.maxlen		= sizeof(swap_prefetch_delay),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "swap_prefetch_sleep",
+		.data		= &swap_prefetch_sleep,
+		.maxlen		= sizeof(swap_prefetch_sleep),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #endif
 	{ .ctl_name = 0 }
 };
Index: linux-2.6.21-ck3/mm/swap_prefetch.c
===================================================================
--- linux-2.6.21-ck3.orig/mm/swap_prefetch.c	2007-05-25 16:22:02.000000000 +1000
+++ linux-2.6.21-ck3/mm/swap_prefetch.c	2007-05-25 19:12:21.000000000 +1000
@@ -23,15 +23,23 @@
 #include <linux/freezer.h>
 
 /*
- * Time to delay prefetching if vm is busy or prefetching unsuccessful. There
- * needs to be at least this duration of idle time meaning in practice it can
- * be much longer
- */
-#define PREFETCH_DELAY		(HZ * 5)
-#define DISABLED_PREFETCH_DELAY	(HZ * 60)
-
-/* sysctl - enable/disable swap prefetching */
+ * sysctls:
+ * swap_prefetch:	0. Disable swap prefetching
+ *			1. Prefetch only when idle and not with laptop_mode
+ *			2. Prefetch when idle and with laptop_mode
+ *			3. Prefetch at all times.
+ * swap_prefetch_delay:	Number of seconds to delay prefetching when system
+ *			is not idle.
+ * swap_prefetch_sleep:	Number of seconds to put kprefetchd to sleep when
+ *			unable to prefetch.
+ */
 int swap_prefetch __read_mostly = 1;
+int swap_prefetch_delay __read_mostly = 1;
+int swap_prefetch_sleep __read_mostly = 5;
+
+/* Time to delay prefetching if vm is busy or prefetching unsuccessful. */
+#define PREFETCH_DELAY		(HZ * swap_prefetch_delay)
+#define PREFETCH_SLEEP		((HZ * swap_prefetch_sleep) ? : 1)
 
 struct swapped_root {
 	unsigned long		busy;		/* vm busy */
@@ -73,7 +81,6 @@
 	return 1;
 }
 
-static int wakeup_kprefetchd;
 
 /*
  * Drop behind accounting which keeps a list of the most recently used swap
@@ -81,6 +88,7 @@
  */
 void add_to_swapped_list(struct page *page)
 {
+	static int wakeup_kprefetchd = 1;
 	struct swapped_entry *entry;
 	unsigned long index, flags;
 
@@ -123,8 +131,10 @@
 out_locked:
 	spin_unlock_irqrestore(&swapped.lock, flags);
 out:
-	if (wakeup_kprefetchd)
+	if (unlikely(wakeup_kprefetchd)) {
+		wakeup_kprefetchd = 0;
 		wake_up_process(kprefetchd_task);
+	}
 	return;
 }
 
@@ -284,7 +294,7 @@
 		if (!populated_zone(z))
 			continue;
 
-		ns = &sp_stat.node[z->zone_pgdat->node_id];
+		ns = &sp_stat.node[zone_to_nid(z)];
 		idx = zone_idx(z);
 		ns->lowfree[idx] = z->pages_high * 3;
 		ns->highfree[idx] = ns->lowfree[idx] + z->pages_high;
@@ -303,11 +313,19 @@
 /*
  * We want to be absolutely certain it's ok to start prefetching.
  */
-static int prefetch_suitable(void)
+static enum trickle_return prefetch_suitable(void)
 {
+	enum trickle_return ret = TRICKLE_DELAY;
+	int node, test_pagestate = 0;
 	unsigned long limit;
 	struct zone *z;
-	int node, ret = 0, test_pagestate = 0;
+
+	/*
+	 * swap_prefetch is set to a high value implying we ignore load
+	 * and prefetch whenever we can.
+	 */
+	if (swap_prefetch > 2)
+		goto ignore_busy;
 
 	/* Purposefully racy */
 	if (test_bit(0, &swapped.busy)) {
@@ -328,6 +346,7 @@
 		test_pagestate = 1;
 	}
 
+ignore_busy:
 	clear_current_prefetch_free();
 
 	/*
@@ -342,7 +361,7 @@
 		if (!populated_zone(z))
 			continue;
 
-		node = z->zone_pgdat->node_id;
+		node = zone_to_nid(z);
 		ns = &sp_stat.node[node];
 		idx = zone_idx(z);
 
@@ -408,11 +427,12 @@
 		}
 	}
 
+	/* Nothing suitable, put kprefetchd back to sleep */
 	if (nodes_empty(sp_stat.prefetch_nodes))
-		goto out;
+		return TRICKLE_FAILED;
 
 	/* Survived all that? Hooray we can prefetch! */
-	ret = 1;
+	ret = TRICKLE_SUCCESS;
 out:
 	return ret;
 }
@@ -427,7 +447,7 @@
  */
 static enum trickle_return trickle_swap(void)
 {
-	enum trickle_return ret = TRICKLE_DELAY;
+	enum trickle_return suitable, ret = TRICKLE_DELAY;
 	struct swapped_entry *pos, *n;
 	unsigned long flags;
 
@@ -435,8 +455,9 @@
 		return ret;
 
 	examine_free_limits();
-	if (!prefetch_suitable())
-		return ret;
+	suitable = prefetch_suitable();
+	if (suitable != TRICKLE_SUCCESS)
+		return suitable;
 	if (list_empty(&swapped.list))
 		return TRICKLE_FAILED;
 
@@ -446,8 +467,8 @@
 		int node;
 
 		spin_unlock_irqrestore(&swapped.lock, flags);
-		/* Yield to anything else running */
-		if (cond_resched() || !prefetch_suitable())
+		cond_resched();	/* yield, then re-check */
+		if (prefetch_suitable() != TRICKLE_SUCCESS)
 			goto out_unlocked;
 
 		spin_lock_irqsave(&swapped.lock, flags);
@@ -488,28 +509,17 @@
 	sys_ioprio_set(IOPRIO_WHO_PROCESS, IOPRIO_BE_NR - 1, IOPRIO_CLASS_BE);
 
 	/* kprefetchd has nothing to do until it is woken up the first time */
-	wakeup_kprefetchd = 1;
 	set_current_state(TASK_INTERRUPTIBLE);
 	schedule();
 
 	while (!kthread_should_stop()) {
 		try_to_freeze();
 
-		/*
-		 * TRICKLE_FAILED implies no entries left - we do not schedule
-		 * a wakeup, and further delay the next one.
-		 */
-		if (trickle_swap() == TRICKLE_FAILED) {
-			wakeup_kprefetchd = 1;
-			set_current_state(TASK_INTERRUPTIBLE);
-			schedule();
-		} else
-			wakeup_kprefetchd = 0;
-		clear_last_prefetch_free();
-		if (!prefetch_enabled())
-			schedule_timeout_interruptible(DISABLED_PREFETCH_DELAY);
+		if (trickle_swap() == TRICKLE_FAILED || !prefetch_enabled())
+			schedule_timeout_interruptible(PREFETCH_SLEEP);
 		else
 			schedule_timeout_interruptible(PREFETCH_DELAY);
+		clear_last_prefetch_free();
 	}
 	return 0;
 }
@@ -525,10 +535,10 @@
 		sizeof(struct swapped_entry), 0, SLAB_PANIC, NULL, NULL);
 
 	/*
-	 * Set max number of entries to 2/3 the size of physical ram  as we
-	 * only ever prefetch to consume 2/3 of the ram.
+	 * We set the limit to more entries than the physical ram, because
+	 * we remove entries lazily and so need some headroom.
 	 */
-	swapped.maxcount = nr_free_pagecache_pages() / 3 * 2;
+	swapped.maxcount = nr_free_pagecache_pages() * 2;
 
 	for_each_zone(zone) {
 		unsigned long present;
@@ -539,7 +549,7 @@
 		if (!present)
 			continue;
 
-		ns = &sp_stat.node[zone->zone_pgdat->node_id];
+		ns = &sp_stat.node[zone_to_nid(zone)];
 		ns->prefetch_watermark += present / 3 * 2;
 		idx = zone_idx(zone);
 		ns->pointfree[idx] = &ns->highfree[idx];
