Index: linux-2.6.13-ck7/include/linux/swap.h
===================================================================
--- linux-2.6.13-ck7.orig/include/linux/swap.h	2005-10-05 21:54:27.000000000 +1000
+++ linux-2.6.13-ck7/include/linux/swap.h	2005-10-05 21:54:28.000000000 +1000
@@ -197,6 +197,7 @@ extern void delay_prefetch(void);
 extern struct page *
 buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags);
 extern void zone_statistics(struct zonelist *zonelist, struct zone *z);
+extern int swap_prefetch;
 
 #else	/* CONFIG_SWAP_PREFETCH */
 static inline void add_to_swapped_list(unsigned long index)
Index: linux-2.6.13-ck7/include/linux/sysctl.h
===================================================================
--- linux-2.6.13-ck7.orig/include/linux/sysctl.h	2005-10-05 21:54:18.000000000 +1000
+++ linux-2.6.13-ck7/include/linux/sysctl.h	2005-10-05 21:54:28.000000000 +1000
@@ -184,6 +184,7 @@ enum
 	VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */
 	VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */
 	VM_HARDMAPLIMIT=29,	/* Make mapped a hard limit */
+	VM_SWAP_PREFETCH=30,	/* int: amount to swap prefetch */
 };
 
 
Index: linux-2.6.13-ck7/init/Kconfig
===================================================================
--- linux-2.6.13-ck7.orig/init/Kconfig	2005-10-05 21:54:27.000000000 +1000
+++ linux-2.6.13-ck7/init/Kconfig	2005-10-05 21:54:28.000000000 +1000
@@ -89,7 +89,7 @@ config SWAP
 
 config SWAP_PREFETCH
 	bool "Support for prefetching swapped memory"
-	depends on SWAP && EXPERIMENTAL
+	depends on SWAP
 	default n
 	---help---
 	  This option will allow the kernel to prefetch swapped memory pages
Index: linux-2.6.13-ck7/kernel/sysctl.c
===================================================================
--- linux-2.6.13-ck7.orig/kernel/sysctl.c	2005-10-05 21:54:18.000000000 +1000
+++ linux-2.6.13-ck7/kernel/sysctl.c	2005-10-05 21:54:28.000000000 +1000
@@ -884,6 +884,18 @@ static ctl_table vm_table[] = {
 		.proc_handler	= &proc_dointvec_jiffies,
 		.strategy	= &sysctl_jiffies,
 	},
+#ifdef CONFIG_SWAP_PREFETCH
+	{
+		.ctl_name	= VM_SWAP_PREFETCH,
+		.procname	= "swap_prefetch",
+		.data		= &swap_prefetch,
+		.maxlen		= sizeof(swap_prefetch),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,	/* reject negative values */
+	},
+#endif
 #endif
 	{ .ctl_name = 0 }
 };
Index: linux-2.6.13-ck7/mm/swap_prefetch.c
===================================================================
--- linux-2.6.13-ck7.orig/mm/swap_prefetch.c	2005-10-05 21:54:27.000000000 +1000
+++ linux-2.6.13-ck7/mm/swap_prefetch.c	2005-10-05 21:54:28.000000000 +1000
@@ -13,18 +13,23 @@
 #include <linux/swap.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
+#include <linux/syscalls.h>
+#include <linux/ioprio.h>
 
 /* Time to delay prefetching if vm is busy or prefetching unsuccessful */
 #define PREFETCH_DELAY	(HZ * 5)
 /* Time between attempting prefetching when vm is idle */
 #define PREFETCH_INTERVAL (HZ)
 
+int swap_prefetch = 2;	/* sysctl - if/how much to prefetch at a time */
+
 struct swapped_root_t {
-	unsigned long		busy;
-	spinlock_t		lock;
-	struct list_head	list;
-	unsigned int		count;
-	unsigned int		maxcount;
+	unsigned long		busy;		/* vm busy */
+	spinlock_t		lock;		/* protects all data */
+	struct list_head	list;		/* MRU list of swapped pages */
+	struct radix_tree_root	swap_tree;	/* Lookup tree of pages */
+	unsigned int		count;		/* Number of entries */
+	unsigned int		maxcount;	/* Maximum entries allowed */
 	kmem_cache_t		*cache;
 };
 
@@ -33,37 +38,39 @@ struct swapped_entry_t {
 	struct list_head	swapped_list;
 };
 
-static struct swapped_root_t swapped_root = {
-	.list  = LIST_HEAD_INIT(swapped_root.list),
-	.count = 0,
+static struct swapped_root_t swapped = {
+	.busy 		= 0,
+	.list  		= LIST_HEAD_INIT(swapped.list),
+	.swap_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
+	.count 		= 0,
 };
 
 static struct timer_list prefetch_timer;
 
 static DECLARE_WAIT_QUEUE_HEAD(kprefetchd_wait);
 
-static unsigned long mapped_limit;
+static unsigned long mapped_limit;	/* Max mapped we will prefetch to */
+static unsigned long last_free = 0;	/* Last total free pages */
+static unsigned long temp_free = 0;
 
 /*
  * Create kmem cache for swapped entries
  */
-void prepare_prefetch(void)
+void __init prepare_prefetch(void)
 {
 	long total_memory = nr_free_pagecache_pages();
-	long se_size = sizeof(struct swapped_entry_t);
 
-	swapped_root.cache = kmem_cache_create("swapped_entry", se_size,
-		0, 0, NULL, NULL);
-	if (unlikely(!swapped_root.cache))
+	swapped.cache = kmem_cache_create("swapped_entry",
+		sizeof(struct swapped_entry_t), 0, 0, NULL, NULL);
+	if (unlikely(!swapped.cache))
 		panic("prepare_prefetch(): cannot create swapped_entry SLAB cache");
 
-	/* Set max count of swapped entries to 5% ram */
-	swapped_root.maxcount = (total_memory / 20) * (PAGE_SIZE / se_size);
+	/* Set max number of entries to size of physical ram */
+	swapped.maxcount = total_memory;
 	/* Set maximum amount of mapped pages to prefetch to 2/3 ram */
 	mapped_limit = total_memory / 3 * 2;
 
-	spin_lock_init(&swapped_root.lock);
-	swapped_root.busy = 0;
+	spin_lock_init(&swapped.lock);
 }
 
 static inline void delay_prefetch_timer(void)
@@ -82,7 +89,7 @@ static inline void reset_prefetch_timer(
  */
 void delay_prefetch(void)
 {
-	__set_bit(0, &swapped_root.busy);
+	__set_bit(0, &swapped.busy);
 }
 
 /*
@@ -94,21 +101,19 @@ void delay_prefetch(void)
 void add_to_swapped_list(unsigned long index)
 {
 	struct swapped_entry_t *entry;
-	struct address_space *mapping = &swapper_space;
-	unsigned long flags;
 	int error;
 
-	if (unlikely(!spin_trylock_irqsave(&swapped_root.lock, flags)))
+	if (unlikely(!spin_trylock(&swapped.lock)))
 		goto out;
 
-	if (swapped_root.count >= swapped_root.maxcount) {
-		entry = list_entry(swapped_root.list.next,
+	if (swapped.count >= swapped.maxcount) {
+		entry = list_entry(swapped.list.next,
 				struct swapped_entry_t, swapped_list);
-		radix_tree_delete(&mapping->swap_tree, entry->swp_entry.val);
+		radix_tree_delete(&swapped.swap_tree, entry->swp_entry.val);
 		list_del(&entry->swapped_list);
-		swapped_root.count--;
+		swapped.count--;
 	} else {
-		entry = kmem_cache_alloc(swapped_root.cache, GFP_ATOMIC);
+		entry = kmem_cache_alloc(swapped.cache, GFP_ATOMIC);
 		if (unlikely(!entry))
 			/* bad, can't allocate more mem */
 			goto out_locked;
@@ -118,23 +123,23 @@ void add_to_swapped_list(unsigned long i
 
 	error = radix_tree_preload(GFP_ATOMIC);
 	if (likely(!error)) {
-		error = radix_tree_insert(&mapping->swap_tree, index, entry);
+		error = radix_tree_insert(&swapped.swap_tree, index, entry);
 		if (likely(!error)) {
 			/*
 			 * If this is the first entry the timer needs to be
 			 * (re)started
 			 */
-			if (list_empty(&swapped_root.list))
+			if (list_empty(&swapped.list))
 				delay_prefetch_timer();
-			list_add(&entry->swapped_list, &swapped_root.list);
-			swapped_root.count++;
+			list_add(&entry->swapped_list, &swapped.list);
+			swapped.count++;
 		}
 		radix_tree_preload_end();
 	} else
-		kmem_cache_free(swapped_root.cache, entry);
+		kmem_cache_free(swapped.cache, entry);
 
 out_locked:
-	spin_unlock_irqrestore(&swapped_root.lock, flags);
+	spin_unlock(&swapped.lock);
 out:
 	return;
 }
@@ -145,19 +150,18 @@ out:
  */
 void remove_from_swapped_list(unsigned long index)
 {
-	struct address_space *mapping = &swapper_space;
 	struct swapped_entry_t *entry;
 	unsigned long flags;
 
-	if (unlikely(!spin_trylock_irqsave(&swapped_root.lock, flags)))
+	if (unlikely(!spin_trylock_irqsave(&swapped.lock, flags)))
 		return;
-	entry = radix_tree_delete(&mapping->swap_tree, index);
-	if (entry) {
+	entry = radix_tree_delete(&swapped.swap_tree, index);
+	if (likely(entry)) {
 		list_del_init(&entry->swapped_list);
-		swapped_root.count--;
-		kmem_cache_free(swapped_root.cache, entry);
+		swapped.count--;
+		kmem_cache_free(swapped.cache, entry);
 	}
-	spin_unlock_irqrestore(&swapped_root.lock, flags);
+	spin_unlock_irqrestore(&swapped.lock, flags);
 }
 
 /*
@@ -165,7 +169,7 @@ void remove_from_swapped_list(unsigned l
  * then directly allocate the ram. We don't want prefetch to use
  * __alloc_pages and go calling on reclaim.
  */
-static struct page * prefetch_get_page(void)
+static struct page *prefetch_get_page(void)
 {
 	struct zone *zone = NULL, *z;
 	struct page *page = NULL;
@@ -179,18 +183,10 @@ static struct page * prefetch_get_page(v
 
 		free = z->free_pages;
 
-		/* Check yet again we are above watermarks, by now likely */
-		if (unlikely(free < z->pages_high * 3))
-			goto out;
-
 		/* We don't prefetch into DMA */
 		if (zone_idx(z) == ZONE_DMA)
 			continue;
 
-		/* Reasonably stressed zone, bypass it */
-		if (z->prev_priority < DEF_PRIORITY / 2)
-			continue;
-
 		/* Select the zone with the most free ram */
 		if (free > most_free) {
 			most_free = free;
@@ -207,7 +203,7 @@ static struct page * prefetch_get_page(v
 
 		zonelist = NODE_DATA(numa_node_id())->node_zonelists +
 		(GFP_HIGHUSER & GFP_ZONEMASK);
-;
+
 		zone_statistics(zonelist, zone);
 	}
 out:
@@ -216,21 +212,21 @@ out:
 
 /*
  * This tries to read a swp_entry_t into swap cache for swap prefetching.
+ * Returns 1 if a read was started, 0 if the entry was skipped, or -1 if
+ * prefetching failed and further attempts should be delayed.
  */
 static int trickle_swap_cache_async(swp_entry_t entry)
 {
 	struct page *page = NULL;
-	struct address_space *mapping = &swapper_space;
-	unsigned long flags;
+	int ret = 0;
 
-	/* Entry may already exist */
-	local_irq_save(flags);
-	if (unlikely(!read_trylock(&mapping->tree_lock))) {
-		local_irq_restore(flags);
-		goto out_delay;
+	if (unlikely(!read_trylock(&swapper_space.tree_lock))) {
+		ret = -1;
+		goto out;
 	}
-	page = radix_tree_lookup(&mapping->page_tree, entry.val);
-	read_unlock_irqrestore(&mapping->tree_lock, flags);
+	/* Entry may already exist */
+	page = radix_tree_lookup(&swapper_space.page_tree, entry.val);
+	read_unlock(&swapper_space.tree_lock);
 	if (page) {
 		remove_from_swapped_list(entry.val);
 		goto out;
@@ -238,22 +234,26 @@ static int trickle_swap_cache_async(swp_
 
 	/* Get a new page to read from swap */
 	page = prefetch_get_page();
-	if (unlikely(!page))
-		goto out_delay;
+	if (unlikely(!page)) {
+		ret = -1;
+		goto out;
+	}
 
-	if (add_to_swap_cache(page, entry)) {
+	if (add_to_swap_cache(page, entry))
 		/* Failed to add to swap cache */
-		page_cache_release(page);
-		goto out;
+		goto out_release;
+
+	if (unlikely(swap_readpage(NULL, page))) {
+		ret = -1;
+		goto out_release;
 	}
 
-	lru_cache_add_active(page);
-	swap_readpage(NULL, page);
-	return 1;
-out_delay:
-	return -1;
+	ret = 1;
+
+out_release:
+	page_cache_release(page);
 out:
-	return 0;
+	return ret;
 }
 
 /*
@@ -264,97 +264,113 @@ static int prefetch_suitable(void)
 	struct page_state ps;
 	unsigned long pending_writes, limit;
 	struct zone *z;
+	int ret = 0;
 
 	/* Purposefully racy and might return false positive which is ok */
-	if (__test_and_clear_bit(0, &swapped_root.busy))
-		goto out_delay;
+	if (__test_and_clear_bit(0, &swapped.busy))
+		goto out;
 
+	temp_free = 0;
 	/*
 	 * Have some hysteresis between where page reclaiming and prefetching
 	 * will occur to prevent ping-ponging between them.
 	 */
 	for_each_zone(z) {
+		unsigned long free;
+
 		if (z->present_pages == 0)
 			continue;
-		if (z->pages_high * 3 > z->free_pages)
-			goto out_delay;
+		free = z->free_pages;
+		if (z->pages_high * 3 > free)
+			goto out;
+		temp_free += free;
 	}
 
+	/*
+	 * We check to see that pages are not being allocated elsewhere
+	 * at any significant rate implying any degree of memory pressure
+	 * (eg during file reads)
+	 */
+	if (last_free) {
+		if (temp_free + SWAP_CLUSTER_MAX * (swap_prefetch + 1) <
+			last_free) {
+				last_free = temp_free;
+				goto out;
+		}
+	} else
+		last_free = temp_free;
+
 	get_page_state(&ps);
 
 	/* We shouldn't prefetch when we are doing writeback */
 	if (ps.nr_writeback)
-		goto out_delay;
+		goto out;
 
 	/* Delay prefetching if we have significant amounts of dirty data */
 	pending_writes = ps.nr_dirty + ps.nr_unstable;
 	if (pending_writes > SWAP_CLUSTER_MAX)
-		goto out_delay;
+		goto out;
 
 	/* >2/3 of the ram is mapped, we need some free for pagecache */
 	limit = ps.nr_mapped + ps.nr_slab + pending_writes;
 	if (limit > mapped_limit)
-		goto out_delay;
+		goto out;
 
 	/*
 	 * Add swapcache to limit as well, but check this last since it needs
 	 * locking
 	 */
 	if (unlikely(!read_trylock(&swapper_space.tree_lock)))
-		goto out_delay;
+		goto out;
 	limit += total_swapcache_pages;
 	read_unlock(&swapper_space.tree_lock);
 	if (limit > mapped_limit)
-		goto out_delay;
+		goto out;
 
 	/* Survived all that? Hooray we can prefetch! */
-	return 1;
-
-out_delay:
-	return 0;
+	ret = 1;
+out:
+	return ret;
 }
 
 /*
  * trickle_swap is the main function that initiates the swap prefetching. It
  * first checks to see if the busy flag is set, and does not prefetch if it
  * is, as the flag implied we are low on memory or swapping in currently.
- * Otherwise it runs till SWAP_CLUSTER_MAX is prefetched. This function
- * returns 1 if it succeeds in a cycle of prefetching, 0 if it is interrupted
- * or -1 if there is nothing left to prefetch.
+ * Otherwise it runs till SWAP_CLUSTER_MAX * swap_prefetch is prefetched.
+ * This function returns 1 if it succeeds in a cycle of prefetching, 0 if it
+ * is interrupted or -1 if there is nothing left to prefetch.
  */
 static int trickle_swap(void)
 {
 	int ret = 0, pages = 0;
 	struct swapped_entry_t *entry;
 
-	while (pages < SWAP_CLUSTER_MAX) {
+	while (pages < SWAP_CLUSTER_MAX * swap_prefetch) {
 		int got_page;
 
 		if (!prefetch_suitable())
 			goto out;
 		/* Lock is held? We must be busy elsewhere */
-		if (unlikely(!spin_trylock(&swapped_root.lock)))
+		if (unlikely(!spin_trylock(&swapped.lock)))
 			goto out;
-		if (list_empty(&swapped_root.list)) {
+		if (list_empty(&swapped.list)) {
 			ret = -1;
 			goto out_unlock;
 		}
-		entry = list_entry(swapped_root.list.next,
+		entry = list_entry(swapped.list.next,
 			struct swapped_entry_t, swapped_list);
-		spin_unlock(&swapped_root.lock);
+		spin_unlock(&swapped.lock);
 
 		got_page = trickle_swap_cache_async(entry->swp_entry);
-		if (unlikely(got_page == -1)) {
-			ret = -1;
-			goto out_unlock;
-		}
+		if (unlikely(got_page == -1))
+			goto out;
 		pages += got_page;
 	}
-	ret = 1;
-	goto out;
+	return 1;
 
 out_unlock:
-	spin_unlock(&swapped_root.lock);
+	spin_unlock(&swapped.lock);
 out:
 	return ret;
 }
@@ -365,6 +381,13 @@ static int kprefetchd(void *data)
 
 	daemonize("kprefetchd");
 	set_user_nice(current, 19);
+	/*
+	 * Set ioprio to lowest if supported by i/o scheduler. The class has
+	 * to be packed with IOPRIO_PRIO_VALUE(); a bare class number is not
+	 * a valid ioprio value.
+	 */
+	sys_ioprio_set(IOPRIO_WHO_PROCESS, 0,
+			IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
 
 	for ( ; ; ) {
 		int prefetched;
@@ -375,10 +398,15 @@ static int kprefetchd(void *data)
 		finish_wait(&kprefetchd_wait, &wait);
 
 		/* If trickle_swap() returns -1 the timer is not reset */
-		if (!(prefetched = trickle_swap()))
-			delay_prefetch_timer();
-		else if (prefetched == 1)
+		prefetched = trickle_swap();
+		if (prefetched == 1) {
+			last_free = temp_free;
 			reset_prefetch_timer();
+		} else {
+			last_free = 0;
+			if (!prefetched)
+				delay_prefetch_timer();
+		}
 	}
 	return 0;
 }
Index: linux-2.6.13-ck7/mm/swap_state.c
===================================================================
--- linux-2.6.13-ck7.orig/mm/swap_state.c	2005-10-05 21:54:26.000000000 +1000
+++ linux-2.6.13-ck7/mm/swap_state.c	2005-10-05 21:54:28.000000000 +1000
@@ -36,8 +36,6 @@ static struct backing_dev_info swap_back
 struct address_space swapper_space = {
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
 	.tree_lock	= RW_LOCK_UNLOCKED,
-	.swap_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
-	.swapped_pages	= LIST_HEAD_INIT(swapper_space.swapped_pages),
 	.a_ops		= &swap_aops,
 	.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
 	.backing_dev_info = &swap_backing_dev_info,
