Index: linux-2.6.21-sp/include/linux/sched.h
===================================================================
--- linux-2.6.21-sp.orig/include/linux/sched.h	2007-05-01 14:52:06.000000000 +1000
+++ linux-2.6.21-sp/include/linux/sched.h	2007-05-14 10:57:50.000000000 +1000
@@ -740,6 +740,22 @@ extern unsigned int max_cache_size;
 
 #endif	/* CONFIG_SMP */
 
+/*
+ * A runqueue laden with a single nice 0 task scores a weighted_cpuload of
+ * SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a
+ * task of nice 0 or enough lower priority tasks to bring up the
+ * weighted_cpuload
+ */
+static inline int above_background_load(void)
+{
+	unsigned long cpu;
+
+	for_each_online_cpu(cpu) {
+		if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE)
+			return 1;
+	}
+	return 0;
+}
 
 struct io_context;			/* See blkdev.h */
 struct cpuset;
Index: linux-2.6.21-sp/Documentation/sysctl/vm.txt
===================================================================
--- linux-2.6.21-sp.orig/Documentation/sysctl/vm.txt	2007-02-14 09:09:32.000000000 +1100
+++ linux-2.6.21-sp/Documentation/sysctl/vm.txt	2007-05-14 10:59:14.000000000 +1000
@@ -31,6 +31,7 @@ Currently, these files are in /proc/sys/
 - min_unmapped_ratio
 - min_slab_ratio
 - panic_on_oom
+- swap_prefetch
 
 ==============================================================
 
@@ -205,3 +206,16 @@ rather than killing rogue processes, set
 
 The default value is 0.
 
+==============================================================
+
+swap_prefetch
+
+This enables or disables the swap prefetching feature. When the virtual
+memory subsystem has been extremely idle for at least 5 seconds it will start
+copying back pages from swap into the swapcache and keep a copy in swap. In
+practice it can take many minutes before the vm is idle enough. A value of 0
+disables swap prefetching, 1 enables it unless laptop_mode is enabled, and 2
+enables it even in the presence of laptop_mode.
+
+The default value is 1.
+
Index: linux-2.6.21-sp/include/linux/swap.h
===================================================================
--- linux-2.6.21-sp.orig/include/linux/swap.h	2007-05-01 14:52:06.000000000 +1000
+++ linux-2.6.21-sp/include/linux/swap.h	2007-05-14 10:58:04.000000000 +1000
@@ -180,6 +180,7 @@ extern unsigned int nr_free_pagecache_pa
 /* linux/mm/swap.c */
 extern void FASTCALL(lru_cache_add(struct page *));
 extern void FASTCALL(lru_cache_add_active(struct page *));
+extern void FASTCALL(lru_cache_add_tail(struct page *));
 extern void FASTCALL(activate_page(struct page *));
 extern void FASTCALL(mark_page_accessed(struct page *));
 extern void lru_add_drain(void);
@@ -237,6 +238,7 @@ extern void free_pages_and_swap_cache(st
 extern struct page * lookup_swap_cache(swp_entry_t);
 extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma,
 					   unsigned long addr);
+extern int add_to_swap_cache(struct page *page, swp_entry_t entry);
 /* linux/mm/swapfile.c */
 extern long total_swap_pages;
 extern unsigned int nr_swapfiles;
Index: linux-2.6.21-sp/init/Kconfig
===================================================================
--- linux-2.6.21-sp.orig/init/Kconfig	2007-05-01 14:52:06.000000000 +1000
+++ linux-2.6.21-sp/init/Kconfig	2007-05-14 10:58:04.000000000 +1000
@@ -101,6 +101,28 @@ config SWAP
 	  used to provide more virtual memory than the actual RAM present
 	  in your computer.  If unsure say Y.
 
+config SWAP_PREFETCH
+	bool "Support for prefetching swapped memory"
+	depends on SWAP
+	default y
+	---help---
+	  This option will allow the kernel to prefetch swapped memory pages
+	  when idle. The pages will be kept on both swap and in swap_cache
+	  thus avoiding the need for further I/O if either ram or swap space
+	  is required.
+
+	  What this will do on workstations is slowly bring back applications
+	  that have swapped out after memory intensive workloads back into
+	  physical ram if you have free ram at a later stage and the machine
+	  is relatively idle. This means that when you come back to your
+	  computer after leaving it idle for a while, applications will come
+	  to life faster. Note that your swap usage will appear to increase
+	  but these are cached pages, can be dropped freely by the vm, and it
+	  should stabilise around 50% swap usage maximum.
+
+	  Workstations and multiuser workstation servers will most likely want
+	  to say Y.
+
 config SYSVIPC
 	bool "System V IPC"
 	---help---
Index: linux-2.6.21-sp/kernel/sysctl.c
===================================================================
--- linux-2.6.21-sp.orig/kernel/sysctl.c	2007-05-01 14:52:06.000000000 +1000
+++ linux-2.6.21-sp/kernel/sysctl.c	2007-05-14 10:58:04.000000000 +1000
@@ -22,6 +22,7 @@
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/slab.h>
+#include <linux/swap-prefetch.h>
 #include <linux/sysctl.h>
 #include <linux/proc_fs.h>
 #include <linux/capability.h>
@@ -859,6 +860,16 @@ static ctl_table vm_table[] = {
 		.extra1		= &zero,
 	},
 #endif
+#ifdef CONFIG_SWAP_PREFETCH
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "swap_prefetch",
+		.data		= &swap_prefetch,
+		.maxlen		= sizeof(swap_prefetch),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 	{ .ctl_name = 0 }
 };
 
Index: linux-2.6.21-sp/mm/Makefile
===================================================================
--- linux-2.6.21-sp.orig/mm/Makefile	2006-11-30 08:57:37.000000000 +1100
+++ linux-2.6.21-sp/mm/Makefile	2007-05-14 10:58:04.000000000 +1000
@@ -17,6 +17,7 @@ ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy)
 obj-y			+= bounce.o
 endif
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
+obj-$(CONFIG_SWAP_PREFETCH) += swap_prefetch.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
 obj-$(CONFIG_SPARSEMEM)	+= sparse.o
Index: linux-2.6.21-sp/mm/swap.c
===================================================================
--- linux-2.6.21-sp.orig/mm/swap.c	2007-02-14 09:09:44.000000000 +1100
+++ linux-2.6.21-sp/mm/swap.c	2007-05-14 10:58:04.000000000 +1000
@@ -17,6 +17,7 @@
 #include <linux/sched.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
+#include <linux/swap-prefetch.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
 #include <linux/pagevec.h>
@@ -176,6 +177,7 @@ EXPORT_SYMBOL(mark_page_accessed);
  */
 static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
 static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
+static DEFINE_PER_CPU(struct pagevec, lru_add_tail_pvecs) = { 0, };
 
 void fastcall lru_cache_add(struct page *page)
 {
@@ -197,6 +199,31 @@ void fastcall lru_cache_add_active(struc
 	put_cpu_var(lru_add_active_pvecs);
 }
 
+static void __pagevec_lru_add_tail(struct pagevec *pvec)
+{
+	int i;
+	struct zone *zone = NULL;
+
+	for (i = 0; i < pagevec_count(pvec); i++) {
+		struct page *page = pvec->pages[i];
+		struct zone *pagezone = page_zone(page);
+
+		if (pagezone != zone) {
+			if (zone)
+				spin_unlock_irq(&zone->lru_lock);
+			zone = pagezone;
+			spin_lock_irq(&zone->lru_lock);
+		}
+		BUG_ON(PageLRU(page));
+		SetPageLRU(page);
+		add_page_to_inactive_list_tail(zone, page);
+	}
+	if (zone)
+		spin_unlock_irq(&zone->lru_lock);
+	release_pages(pvec->pages, pvec->nr, pvec->cold);
+	pagevec_reinit(pvec);
+}
+
 static void __lru_add_drain(int cpu)
 {
 	struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
@@ -207,6 +234,9 @@ static void __lru_add_drain(int cpu)
 	pvec = &per_cpu(lru_add_active_pvecs, cpu);
 	if (pagevec_count(pvec))
 		__pagevec_lru_add_active(pvec);
+	pvec = &per_cpu(lru_add_tail_pvecs, cpu);
+	if (pagevec_count(pvec))
+		__pagevec_lru_add_tail(pvec);
 }
 
 void lru_add_drain(void)
@@ -403,6 +433,21 @@ void __pagevec_lru_add_active(struct pag
 }
 
 /*
+ * Function used uniquely to put pages back to the lru at the end of the
+ * inactive list to preserve the lru order. Currently only used by swap
+ * prefetch.
+ */
+void fastcall lru_cache_add_tail(struct page *page)
+{
+	struct pagevec *pvec = &get_cpu_var(lru_add_tail_pvecs);
+
+	page_cache_get(page);
+	if (!pagevec_add(pvec, page))
+		__pagevec_lru_add_tail(pvec);
+	put_cpu_var(lru_add_pvecs);
+}
+
+/*
  * Try to drop buffers from the pages in a pagevec
  */
 void pagevec_strip(struct pagevec *pvec)
@@ -514,6 +559,9 @@ void __init swap_setup(void)
 	 * Right now other parts of the system means that we
 	 * _really_ don't want to cluster much more
 	 */
+
+	prepare_swap_prefetch();
+
 #ifdef CONFIG_HOTPLUG_CPU
 	hotcpu_notifier(cpu_swap_callback, 0);
 #endif
Index: linux-2.6.21-sp/mm/swap_prefetch.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.21-sp/mm/swap_prefetch.c	2007-05-14 10:59:31.000000000 +1000
@@ -0,0 +1,562 @@
+/*
+ * linux/mm/swap_prefetch.c
+ *
+ * Copyright (C) 2005-2006 Con Kolivas
+ *
+ * Written by Con Kolivas <kernel@kolivas.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/swap-prefetch.h>
+#include <linux/ioprio.h>
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+#include <linux/syscalls.h>
+#include <linux/writeback.h>
+#include <linux/vmstat.h>
+#include <linux/freezer.h>
+
+/*
+ * Time to delay prefetching if vm is busy or prefetching unsuccessful. There
+ * needs to be at least this duration of idle time meaning in practice it can
+ * be much longer
+ */
+#define PREFETCH_DELAY		(HZ * 5)
+#define DISABLED_PREFETCH_DELAY	(HZ * 60)
+
+/* sysctl - enable/disable swap prefetching */
+int swap_prefetch __read_mostly = 1;
+
+struct swapped_root {
+	unsigned long		busy;		/* vm busy */
+	spinlock_t		lock;		/* protects all data */
+	struct list_head	list;		/* MRU list of swapped pages */
+	struct radix_tree_root	swap_tree;	/* Lookup tree of pages */
+	unsigned int		count;		/* Number of entries */
+	unsigned int		maxcount;	/* Maximum entries allowed */
+	struct kmem_cache	*cache;		/* Of struct swapped_entry */
+};
+
+static struct swapped_root swapped = {
+	.lock		= SPIN_LOCK_UNLOCKED,
+	.list  		= LIST_HEAD_INIT(swapped.list),
+	.swap_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
+};
+
+static struct task_struct *kprefetchd_task;
+
+/*
+ * We check to see no part of the vm is busy. If it is this will interrupt
+ * trickle_swap and wait another PREFETCH_DELAY. Purposefully racy.
+ */
+inline void delay_swap_prefetch(void)
+{
+	if (!test_bit(0, &swapped.busy))
+		__set_bit(0, &swapped.busy);
+}
+
+/*
+ * If laptop_mode is enabled don't prefetch to avoid hard drives
+ * doing unnecessary spin-ups unless swap_prefetch is explicitly
+ * set to a higher value.
+ */
+static inline int prefetch_enabled(void)
+{
+	if (swap_prefetch <= laptop_mode)
+		return 0;
+	return 1;
+}
+
+static int wakeup_kprefetchd;
+
+/*
+ * Drop behind accounting which keeps a list of the most recently used swap
+ * entries. Entries are removed lazily by kprefetchd.
+ */
+void add_to_swapped_list(struct page *page)
+{
+	struct swapped_entry *entry;
+	unsigned long index, flags;
+
+	if (!prefetch_enabled())
+		goto out;
+
+	spin_lock_irqsave(&swapped.lock, flags);
+	if (swapped.count >= swapped.maxcount) {
+		/*
+		 * We limit the number of entries to 2/3 of physical ram.
+		 * Once the number of entries exceeds this we start removing
+		 * the least recently used entries.
+		 */
+		entry = list_entry(swapped.list.next,
+			struct swapped_entry, swapped_list);
+		radix_tree_delete(&swapped.swap_tree, entry->swp_entry.val);
+		list_del(&entry->swapped_list);
+		swapped.count--;
+	} else {
+		entry = kmem_cache_alloc(swapped.cache, GFP_ATOMIC);
+		if (unlikely(!entry))
+			/* bad, can't allocate more mem */
+			goto out_locked;
+	}
+
+	index = page_private(page);
+	entry->swp_entry.val = index;
+	/*
+	 * On numa we need to store the node id to ensure that we prefetch to
+	 * the same node it came from.
+	 */
+	store_swap_entry_node(entry, page);
+
+	if (likely(!radix_tree_insert(&swapped.swap_tree, index, entry))) {
+		list_add(&entry->swapped_list, &swapped.list);
+		swapped.count++;
+	} else
+		kmem_cache_free(swapped.cache, entry);
+
+out_locked:
+	spin_unlock_irqrestore(&swapped.lock, flags);
+out:
+	if (wakeup_kprefetchd)
+		wake_up_process(kprefetchd_task);
+	return;
+}
+
+/*
+ * Removes entries from the swapped_list. The radix tree allows us to quickly
+ * look up the entry from the index without having to iterate over the whole
+ * list.
+ */
+void remove_from_swapped_list(const unsigned long index)
+{
+	struct swapped_entry *entry;
+	unsigned long flags;
+
+	if (list_empty(&swapped.list))
+		return;
+
+	spin_lock_irqsave(&swapped.lock, flags);
+	entry = radix_tree_delete(&swapped.swap_tree, index);
+	if (likely(entry)) {
+		list_del(&entry->swapped_list);
+		swapped.count--;
+		kmem_cache_free(swapped.cache, entry);
+	}
+	spin_unlock_irqrestore(&swapped.lock, flags);
+}
+
+enum trickle_return {
+	TRICKLE_SUCCESS,
+	TRICKLE_FAILED,
+	TRICKLE_DELAY,
+};
+
+struct node_stats {
+	/* Free ram after a cycle of prefetching */
+	unsigned long	last_free;
+	/* Free ram on this cycle of checking prefetch_suitable */
+	unsigned long	current_free;
+	/* Maximum amount we will prefetch to */
+	unsigned long	prefetch_watermark;
+	/* The amount of free ram before we start prefetching */
+	unsigned long	highfree[MAX_NR_ZONES];
+	/* The amount of free ram where we will stop prefetching */
+	unsigned long	lowfree[MAX_NR_ZONES];
+	/* highfree or lowfree depending on whether we've hit a watermark */
+	unsigned long	*pointfree[MAX_NR_ZONES];
+};
+
+/*
+ * prefetch_stats stores the free ram data of each node and this is used to
+ * determine if a node is suitable for prefetching into.
+ */
+struct prefetch_stats {
+	/* Which nodes are currently suited to prefetching */
+	nodemask_t	prefetch_nodes;
+	/* Total pages we've prefetched on this wakeup of kprefetchd */
+	unsigned long	prefetched_pages;
+	struct node_stats node[MAX_NUMNODES];
+};
+
+static struct prefetch_stats sp_stat;
+
+/*
+ * This tries to read a swp_entry_t into swap cache for swap prefetching.
+ * If it returns TRICKLE_DELAY we should delay further prefetching.
+ */
+static enum trickle_return trickle_swap_cache_async(const swp_entry_t entry,
+	const int node)
+{
+	enum trickle_return ret = TRICKLE_FAILED;
+	unsigned long flags;
+	struct page *page;
+
+	read_lock_irqsave(&swapper_space.tree_lock, flags);
+	/* Entry may already exist */
+	page = radix_tree_lookup(&swapper_space.page_tree, entry.val);
+	read_unlock_irqrestore(&swapper_space.tree_lock, flags);
+	if (page)
+		goto out;
+
+	/*
+	 * Get a new page to read from swap. We have already checked the
+	 * watermarks so __alloc_pages will not call on reclaim.
+	 */
+	page = alloc_pages_node(node, GFP_HIGHUSER & ~__GFP_WAIT, 0);
+	if (unlikely(!page)) {
+		ret = TRICKLE_DELAY;
+		goto out;
+	}
+
+	if (add_to_swap_cache(page, entry)) {
+		/* Failed to add to swap cache */
+		goto out_release;
+	}
+
+	/* Add them to the tail of the inactive list to preserve LRU order */
+	lru_cache_add_tail(page);
+	if (unlikely(swap_readpage(NULL, page)))
+		goto out_release;
+
+	sp_stat.prefetched_pages++;
+	sp_stat.node[node].last_free--;
+
+	ret = TRICKLE_SUCCESS;
+out_release:
+	page_cache_release(page);
+out:
+	/*
+	 * All entries are removed here lazily. This avoids the cost of
+	 * remove_from_swapped_list during normal swapin. Thus there are
+	 * usually many stale entries.
+	 */
+	remove_from_swapped_list(entry.val);
+	return ret;
+}
+
+static void clear_last_prefetch_free(void)
+{
+	int node;
+
+	/*
+	 * Reset the nodes suitable for prefetching to all nodes. We could
+	 * update the data to take into account memory hotplug if desired..
+	 */
+	sp_stat.prefetch_nodes = node_online_map;
+	for_each_node_mask(node, sp_stat.prefetch_nodes) {
+		struct node_stats *ns = &sp_stat.node[node];
+
+		ns->last_free = 0;
+	}
+}
+
+static void clear_current_prefetch_free(void)
+{
+	int node;
+
+	sp_stat.prefetch_nodes = node_online_map;
+	for_each_node_mask(node, sp_stat.prefetch_nodes) {
+		struct node_stats *ns = &sp_stat.node[node];
+
+		ns->current_free = 0;
+	}
+}
+
+/*
+ * This updates the high and low watermarks of amount of free ram in each
+ * node used to start and stop prefetching. We prefetch from pages_high * 4
+ * down to pages_high * 3.
+ */
+static void examine_free_limits(void)
+{
+	struct zone *z;
+
+	for_each_zone(z) {
+		struct node_stats *ns;
+		int idx;
+
+		if (!populated_zone(z))
+			continue;
+
+		ns = &sp_stat.node[z->zone_pgdat->node_id];
+		idx = zone_idx(z);
+		ns->lowfree[idx] = z->pages_high * 3;
+		ns->highfree[idx] = ns->lowfree[idx] + z->pages_high;
+
+		if (zone_page_state(z, NR_FREE_PAGES) > ns->highfree[idx]) {
+			/*
+			 * We've gotten above the high watermark of free pages
+			 * so we can start prefetching till we get to the low
+			 * watermark.
+			 */
+			ns->pointfree[idx] = &ns->lowfree[idx];
+		}
+	}
+}
+
+/*
+ * We want to be absolutely certain it's ok to start prefetching.
+ */
+static int prefetch_suitable(void)
+{
+	unsigned long limit;
+	struct zone *z;
+	int node, ret = 0, test_pagestate = 0;
+
+	/* Purposefully racy */
+	if (test_bit(0, &swapped.busy)) {
+		__clear_bit(0, &swapped.busy);
+		goto out;
+	}
+
+	/*
+	 * get_page_state and above_background_load are expensive so we only
+	 * perform them every SWAP_CLUSTER_MAX prefetched_pages.
+	 * We test to see if we're above_background_load as disk activity
+	 * even at low priority can cause interrupt induced scheduling
+	 * latencies.
+	 */
+	if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
+		if (above_background_load())
+			goto out;
+		test_pagestate = 1;
+	}
+
+	clear_current_prefetch_free();
+
+	/*
+	 * Have some hysteresis between where page reclaiming and prefetching
+	 * will occur to prevent ping-ponging between them.
+	 */
+	for_each_zone(z) {
+		struct node_stats *ns;
+		unsigned long free;
+		int idx;
+
+		if (!populated_zone(z))
+			continue;
+
+		node = z->zone_pgdat->node_id;
+		ns = &sp_stat.node[node];
+		idx = zone_idx(z);
+
+		free = zone_page_state(z, NR_FREE_PAGES);
+		if (free < *ns->pointfree[idx]) {
+			/*
+			 * Free pages have dropped below the low watermark so
+			 * we won't start prefetching again till we hit the
+			 * high watermark of free pages.
+			 */
+			ns->pointfree[idx] = &ns->highfree[idx];
+			node_clear(node, sp_stat.prefetch_nodes);
+			continue;
+		}
+		ns->current_free += free;
+	}
+
+	/*
+	 * We iterate over each node testing to see if it is suitable for
+	 * prefetching and clear the nodemask if it is not.
+	 */
+	for_each_node_mask(node, sp_stat.prefetch_nodes) {
+		struct node_stats *ns = &sp_stat.node[node];
+
+		/*
+		 * We check to see that pages are not being allocated
+		 * elsewhere at any significant rate implying any
+		 * degree of memory pressure (eg during file reads)
+		 */
+		if (ns->last_free) {
+			if (ns->current_free + SWAP_CLUSTER_MAX <
+			    ns->last_free) {
+				ns->last_free = ns->current_free;
+				node_clear(node,
+					sp_stat.prefetch_nodes);
+				continue;
+			}
+		} else
+			ns->last_free = ns->current_free;
+
+		if (!test_pagestate)
+			continue;
+
+		/* We shouldn't prefetch when we are doing writeback */
+		if (node_page_state(node, NR_WRITEBACK)) {
+			node_clear(node, sp_stat.prefetch_nodes);
+			continue;
+		}
+
+		/*
+		 * >2/3 of the ram on this node is mapped, slab, swapcache or
+		 * dirty, we need to leave some free for pagecache.
+		 */
+		limit = node_page_state(node, NR_FILE_PAGES);
+		limit += node_page_state(node, NR_SLAB_RECLAIMABLE);
+		limit += node_page_state(node, NR_SLAB_UNRECLAIMABLE);
+		limit += node_page_state(node, NR_FILE_DIRTY);
+		limit += node_page_state(node, NR_UNSTABLE_NFS);
+		limit += total_swapcache_pages;
+		if (limit > ns->prefetch_watermark) {
+			node_clear(node, sp_stat.prefetch_nodes);
+			continue;
+		}
+	}
+
+	if (nodes_empty(sp_stat.prefetch_nodes))
+		goto out;
+
+	/* Survived all that? Hooray we can prefetch! */
+	ret = 1;
+out:
+	return ret;
+}
+
+/*
+ * trickle_swap is the main function that initiates the swap prefetching. It
+ * first checks to see if the busy flag is set, and does not prefetch if it
+ * is, as the flag implied we are low on memory or swapping in currently.
+ * Otherwise it runs until prefetch_suitable fails which occurs when the
+ * vm is busy, we prefetch to the watermark, or the list is empty or we have
+ * iterated over all entries
+ */
+static enum trickle_return trickle_swap(void)
+{
+	enum trickle_return ret = TRICKLE_DELAY;
+	struct swapped_entry *pos, *n;
+	unsigned long flags;
+
+	if (!prefetch_enabled())
+		return ret;
+
+	examine_free_limits();
+	if (!prefetch_suitable())
+		return ret;
+	if (list_empty(&swapped.list))
+		return TRICKLE_FAILED;
+
+	spin_lock_irqsave(&swapped.lock, flags);
+	list_for_each_entry_safe_reverse(pos, n, &swapped.list, swapped_list) {
+		swp_entry_t swp_entry;
+		int node;
+
+		spin_unlock_irqrestore(&swapped.lock, flags);
+		/* Yield to anything else running */
+		if (cond_resched() || !prefetch_suitable())
+			goto out_unlocked;
+
+		spin_lock_irqsave(&swapped.lock, flags);
+		if (unlikely(!pos))
+			continue;
+		node = get_swap_entry_node(pos);
+		if (!node_isset(node, sp_stat.prefetch_nodes)) {
+			/*
+			 * We found an entry that belongs to a node that is
+			 * not suitable for prefetching so skip it.
+			 */
+			continue;
+		}
+		swp_entry = pos->swp_entry;
+		spin_unlock_irqrestore(&swapped.lock, flags);
+
+		if (trickle_swap_cache_async(swp_entry, node) == TRICKLE_DELAY)
+			goto out_unlocked;
+		spin_lock_irqsave(&swapped.lock, flags);
+	}
+	spin_unlock_irqrestore(&swapped.lock, flags);
+
+out_unlocked:
+	if (sp_stat.prefetched_pages) {
+		lru_add_drain();
+		sp_stat.prefetched_pages = 0;
+	}
+	return ret;
+}
+
+static int kprefetchd(void *__unused)
+{
+	struct sched_param param = { .sched_priority = 0 };
+
+	sched_setscheduler(current, SCHED_BATCH, &param);
+	set_user_nice(current, 19);
+	/* Set ioprio to lowest if supported by i/o scheduler */
+	sys_ioprio_set(IOPRIO_WHO_PROCESS, IOPRIO_BE_NR - 1, IOPRIO_CLASS_BE);
+
+	/* kprefetchd has nothing to do until it is woken up the first time */
+	wakeup_kprefetchd = 1;
+	set_current_state(TASK_INTERRUPTIBLE);
+	schedule();
+
+	while (!kthread_should_stop()) {
+		try_to_freeze();
+
+		/*
+		 * TRICKLE_FAILED implies no entries left - we do not schedule
+		 * a wakeup, and further delay the next one.
+		 */
+		if (trickle_swap() == TRICKLE_FAILED) {
+			wakeup_kprefetchd = 1;
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule();
+		} else
+			wakeup_kprefetchd = 0;
+		clear_last_prefetch_free();
+		if (!prefetch_enabled())
+			schedule_timeout_interruptible(DISABLED_PREFETCH_DELAY);
+		else
+			schedule_timeout_interruptible(PREFETCH_DELAY);
+	}
+	return 0;
+}
+
+/*
+ * Create kmem cache for swapped entries
+ */
+void __init prepare_swap_prefetch(void)
+{
+	struct zone *zone;
+
+	swapped.cache = kmem_cache_create("swapped_entry",
+		sizeof(struct swapped_entry), 0, SLAB_PANIC, NULL, NULL);
+
+	/*
+	 * Set max number of entries to 2/3 the size of physical ram  as we
+	 * only ever prefetch to consume 2/3 of the ram.
+	 */
+	swapped.maxcount = nr_free_pagecache_pages() / 3 * 2;
+
+	for_each_zone(zone) {
+		unsigned long present;
+		struct node_stats *ns;
+		int idx;
+
+		present = zone->present_pages;
+		if (!present)
+			continue;
+
+		ns = &sp_stat.node[zone->zone_pgdat->node_id];
+		ns->prefetch_watermark += present / 3 * 2;
+		idx = zone_idx(zone);
+		ns->pointfree[idx] = &ns->highfree[idx];
+	}
+}
+
+static int __init kprefetchd_init(void)
+{
+	kprefetchd_task = kthread_run(kprefetchd, NULL, "kprefetchd");
+
+	return 0;
+}
+
+static void __exit kprefetchd_exit(void)
+{
+	kthread_stop(kprefetchd_task);
+}
+
+module_init(kprefetchd_init);
+module_exit(kprefetchd_exit);
Index: linux-2.6.21-sp/mm/swap_state.c
===================================================================
--- linux-2.6.21-sp.orig/mm/swap_state.c	2006-11-30 08:57:37.000000000 +1100
+++ linux-2.6.21-sp/mm/swap_state.c	2007-05-14 10:59:14.000000000 +1000
@@ -10,6 +10,7 @@
 #include <linux/mm.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
+#include <linux/swap-prefetch.h>
 #include <linux/init.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
@@ -95,7 +96,7 @@ static int __add_to_swap_cache(struct pa
 	return error;
 }
 
-static int add_to_swap_cache(struct page *page, swp_entry_t entry)
+int add_to_swap_cache(struct page *page, swp_entry_t entry)
 {
 	int error;
 
@@ -148,6 +149,9 @@ int add_to_swap(struct page * page, gfp_
 	swp_entry_t entry;
 	int err;
 
+	/* Swap prefetching is delayed if we're swapping pages */
+	delay_swap_prefetch();
+
 	BUG_ON(!PageLocked(page));
 
 	for (;;) {
@@ -320,6 +324,9 @@ struct page *read_swap_cache_async(swp_e
 	struct page *found_page, *new_page = NULL;
 	int err;
 
+	/* Swap prefetching is delayed if we're already reading from swap */
+	delay_swap_prefetch();
+
 	do {
 		/*
 		 * First check the swap cache.  Since this is normally
Index: linux-2.6.21-sp/mm/vmscan.c
===================================================================
--- linux-2.6.21-sp.orig/mm/vmscan.c	2007-05-01 14:52:06.000000000 +1000
+++ linux-2.6.21-sp/mm/vmscan.c	2007-05-14 10:59:14.000000000 +1000
@@ -16,6 +16,7 @@
 #include <linux/slab.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
+#include <linux/swap-prefetch.h>
 #include <linux/pagemap.h>
 #include <linux/init.h>
 #include <linux/highmem.h>
@@ -1032,6 +1033,8 @@ unsigned long try_to_free_pages(struct z
 		.swappiness = vm_swappiness,
 	};
 
+	delay_swap_prefetch();
+
 	count_vm_event(ALLOCSTALL);
 
 	for (i = 0; zones[i] != NULL; i++) {
@@ -1381,6 +1384,8 @@ static unsigned long shrink_all_zones(un
 	struct zone *zone;
 	unsigned long nr_to_scan, ret = 0;
 
+	delay_swap_prefetch();
+
 	for_each_zone(zone) {
 
 		if (!populated_zone(zone))
Index: linux-2.6.21-sp/include/linux/mm_inline.h
===================================================================
--- linux-2.6.21-sp.orig/include/linux/mm_inline.h	2007-05-01 14:52:06.000000000 +1000
+++ linux-2.6.21-sp/include/linux/mm_inline.h	2007-05-14 10:58:04.000000000 +1000
@@ -13,6 +13,13 @@ add_page_to_inactive_list(struct zone *z
 }
 
 static inline void
+add_page_to_inactive_list_tail(struct zone *zone, struct page *page)
+{
+	list_add_tail(&page->lru, &zone->inactive_list);
+	__inc_zone_state(zone, NR_INACTIVE);
+}
+
+static inline void
 del_page_from_active_list(struct zone *zone, struct page *page)
 {
 	list_del(&page->lru);
Index: linux-2.6.21-sp/include/linux/swap-prefetch.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.21-sp/include/linux/swap-prefetch.h	2007-05-14 10:58:04.000000000 +1000
@@ -0,0 +1,55 @@
+#ifndef SWAP_PREFETCH_H_INCLUDED
+#define SWAP_PREFETCH_H_INCLUDED
+
+#ifdef CONFIG_SWAP_PREFETCH
+/* mm/swap_prefetch.c */
+extern int swap_prefetch;
+struct swapped_entry {
+	swp_entry_t		swp_entry;	/* The actual swap entry */
+	struct list_head	swapped_list;	/* Linked list of entries */
+#if MAX_NUMNODES > 1
+	int			node;		/* Node id */
+#endif
+} __attribute__((packed));
+
+static inline void store_swap_entry_node(struct swapped_entry *entry,
+	struct page *page)
+{
+#if MAX_NUMNODES > 1
+	entry->node = page_to_nid(page);
+#endif
+}
+
+static inline int get_swap_entry_node(struct swapped_entry *entry)
+{
+#if MAX_NUMNODES > 1
+	return entry->node;
+#else
+	return 0;
+#endif
+}
+
+extern void add_to_swapped_list(struct page *page);
+extern void remove_from_swapped_list(const unsigned long index);
+extern void delay_swap_prefetch(void);
+extern void prepare_swap_prefetch(void);
+
+#else	/* CONFIG_SWAP_PREFETCH */
+static inline void add_to_swapped_list(struct page *__unused)
+{
+}
+
+static inline void prepare_swap_prefetch(void)
+{
+}
+
+static inline void remove_from_swapped_list(const unsigned long __unused)
+{
+}
+
+static inline void delay_swap_prefetch(void)
+{
+}
+#endif	/* CONFIG_SWAP_PREFETCH */
+
+#endif		/* SWAP_PREFETCH_H_INCLUDED */
Index: linux-2.6.21-sp/mm/page_io.c
===================================================================
--- linux-2.6.21-sp.orig/mm/page_io.c	2007-02-14 09:09:44.000000000 +1100
+++ linux-2.6.21-sp/mm/page_io.c	2007-05-14 10:59:14.000000000 +1000
@@ -17,6 +17,7 @@
 #include <linux/bio.h>
 #include <linux/swapops.h>
 #include <linux/writeback.h>
+#include <linux/swap-prefetch.h>
 #include <asm/pgtable.h>
 
 static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index,
@@ -118,6 +119,7 @@ int swap_writepage(struct page *page, st
 		ret = -ENOMEM;
 		goto out;
 	}
+	add_to_swapped_list(page);
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		rw |= (1 << BIO_RW_SYNC);
 	count_vm_event(PSWPOUT);
