Index: linux-2.6.13-ck5/include/linux/swap.h
===================================================================
--- linux-2.6.13-ck5.orig/include/linux/swap.h	2005-09-17 12:14:37.000000000 +1000
+++ linux-2.6.13-ck5/include/linux/swap.h	2005-09-20 23:11:13.000000000 +1000
@@ -187,11 +187,16 @@ extern int shmem_unuse(swp_entry_t entry
 extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *);
 
 #ifdef CONFIG_SWAP_PREFETCH
+/* only used by prefetch externally */
 /*	mm/swap_prefetch.c */
 extern void prepare_prefetch(void);
 extern void add_to_swapped_list(unsigned long index);
 extern void remove_from_swapped_list(unsigned long index);
 extern void delay_prefetch(void);
+/* linux/mm/page_alloc.c */
+extern struct page *
+buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags);
+extern void zone_statistics(struct zonelist *zonelist, struct zone *z);
 
 #else	/* CONFIG_SWAP_PREFETCH */
 static inline void add_to_swapped_list(unsigned long index)
Index: linux-2.6.13-ck5/mm/page_alloc.c
===================================================================
--- linux-2.6.13-ck5.orig/mm/page_alloc.c	2005-09-17 12:14:36.000000000 +1000
+++ linux-2.6.13-ck5/mm/page_alloc.c	2005-09-20 23:10:54.000000000 +1000
@@ -607,7 +607,7 @@ void drain_local_pages(void)
 }
 #endif /* CONFIG_PM */
 
-static void zone_statistics(struct zonelist *zonelist, struct zone *z)
+void zone_statistics(struct zonelist *zonelist, struct zone *z)
 {
 #ifdef CONFIG_NUMA
 	unsigned long flags;
@@ -684,7 +684,7 @@ static inline void prep_zero_page(struct
  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
  * or two.
  */
-static struct page *
+struct page *
 buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
 {
 	unsigned long flags;
Index: linux-2.6.13-ck5/mm/swap_prefetch.c
===================================================================
--- linux-2.6.13-ck5.orig/mm/swap_prefetch.c	2005-09-17 12:14:38.000000000 +1000
+++ linux-2.6.13-ck5/mm/swap_prefetch.c	2005-09-20 23:33:34.000000000 +1000
@@ -43,20 +43,26 @@ static struct timer_list prefetch_timer;
 
 static DECLARE_WAIT_QUEUE_HEAD(kprefetchd_wait);
 
+static unsigned long mapped_limit;
+
 /*
  * Create kmem cache for swapped entries
  */
 void prepare_prefetch(void)
 {
-	swapped_root.cache = kmem_cache_create("swapped_entry",
-		sizeof(struct swapped_entry_t), 0, 0, NULL, NULL);
+	long total_memory = nr_free_pagecache_pages();
+	long se_size = sizeof(struct swapped_entry_t);
+
+	swapped_root.cache = kmem_cache_create("swapped_entry", se_size,
+		0, 0, NULL, NULL);
 	if (unlikely(!swapped_root.cache))
 		panic("prepare_prefetch(): cannot create swapped_entry SLAB cache");
 
-	/*
-	 * Set max count of swapped entries
-	 */
-	swapped_root.maxcount = nr_free_pagecache_pages();
+	/* Limit swapped entries to what fits in 5% of total_memory pages */
+	swapped_root.maxcount = (total_memory / 20) * (PAGE_SIZE / se_size);
+	/* Prefetch is delayed while mapped pages exceed 2/3 of total_memory */
+	mapped_limit = total_memory / 3 * 2;
+
 	spin_lock_init(&swapped_root.lock);
 	spin_lock_init(&swapped_root.busylock);
 }
@@ -91,9 +97,6 @@ void add_to_swapped_list(unsigned long i
 	unsigned long flags;
 	int error;
 
-	/* Adding to the list? We must be busy */
-	delay_prefetch();
-
 	/*
 	 * It is not critical to add every entry to the swapped list and
 	 * since we're adding to the swapped list when we're swapping
@@ -158,39 +161,68 @@ void remove_from_swapped_list(unsigned l
 }
 
 /*
+ * Lightweight grab of one order-0 page to prefetch into, taken directly
+ * with buffered_rmqueue() rather than __alloc_pages so that no reclaim is
+ * started. The caller should already have checked the watermarks. Tries
+ * each populated non-DMA zone in turn; returns NULL if none has a page.
+ */
+static struct page *prefetch_get_page(void)
+{
+	struct zonelist *zonelist = NODE_DATA(numa_node_id())->node_zonelists +
+		(GFP_HIGHUSER & GFP_ZONEMASK);
+	struct page *page = NULL;
+	struct zone *z;
+
+	for_each_zone(z) {
+		/* Match DMA by index, not by name pointer */
+		if (zone_idx(z) == ZONE_DMA || z->present_pages == 0)
+			continue;
+		page = buffered_rmqueue(z, 0, GFP_HIGHUSER);
+		if (!page)
+			continue;
+		zone_statistics(zonelist, z);
+		break;
+	}
+	return page;
+}
+
+/*
  * This tries to read a swp_entry_t into swap cache for swap prefetching.
  */
 static int trickle_swap_cache_async(swp_entry_t entry)
 {
-	struct page *found_page, *new_page = NULL;
+	struct page *page = NULL;
 	struct address_space *mapping = &swapper_space;
+	unsigned long flags;
 
-	/* May already exist, check it as cheaply as possible */
-	read_lock_irq(&mapping->tree_lock);
-	found_page = radix_tree_lookup(&mapping->page_tree, entry.val);
-	read_unlock_irq(&mapping->tree_lock);
-	if (found_page) {
+	/* Already in swap cache? Look it up without spinning on tree_lock */
+	local_irq_save(flags);
+	if (unlikely(!read_trylock(&mapping->tree_lock))) {
+		local_irq_restore(flags);
+		goto out_delay;
+	}
+	page = radix_tree_lookup(&mapping->page_tree, entry.val);
+	read_unlock_irqrestore(&mapping->tree_lock, flags);
+	if (page) {
 		remove_from_swapped_list(entry.val);
 		goto out;
 	}
 
 	/* Get a new page to read from swap */
-	new_page = alloc_page_vma(GFP_HIGHUSER, NULL, 0);
-	if (unlikely(!new_page)) {
-		/* Bad - out of memory */
-		delay_prefetch();
-		goto out;
-	}
+	page = prefetch_get_page();
+	if (unlikely(!page))
+		goto out_delay;
 
-	if (add_to_swap_cache(new_page, entry)) {
+	if (add_to_swap_cache(page, entry)) {
 		/* Failed to add to swap cache */
-		page_cache_release(new_page);
+		page_cache_release(page);
 		goto out;
 	}
 
-	lru_cache_add_active(new_page);
-	swap_readpage(NULL, new_page);
+	lru_cache_add_active(page);
+	swap_readpage(NULL, page);
 	return 1;
+out_delay:
+	return -1;	/* tree_lock contended or no page: caller stops */
 out:
 	return 0;
 }
@@ -221,16 +253,6 @@ static int prefetch_suitable(void)
 
 	if (test_clear_busy())
 		goto out_delay;
-	/*
-	 * Have some hysteresis between where page reclaiming and prefetching
-	 * will occur to prevent ping-ponging between them.
-	 */
-	for_each_zone(z) {
-		if (z->present_pages == 0)
-			continue;
-		if (z->pages_high * 3 > z->free_pages)
-			goto out_delay;
-	}
 
 	/* We shouldn't prefetch when we are doing writeback */
 	if (read_page_state(nr_writeback))
@@ -242,7 +264,22 @@ static int prefetch_suitable(void)
 	if (pending_writes > SWAP_CLUSTER_MAX)
 		goto out_delay;
 
-	/* Survived all that? Hooray! */
+	/* >2/3 of the ram is mapped, we need some free for pagecache */
+	if (read_page_state(nr_mapped) > mapped_limit)
+		goto out_delay;
+
+	/*
+	 * Have some hysteresis between where page reclaiming and prefetching
+	 * will occur to prevent ping-ponging between them.
+	 */
+	for_each_zone(z) {
+		if (z->present_pages == 0)
+			continue;
+		if (z->pages_high * 3 > z->free_pages)
+			goto out_delay;
+	}
+
+	/* Survived all that? Hooray we can prefetch! */
 	return 1;
 
 out_delay:
@@ -263,6 +300,8 @@ static int trickle_swap(void)
 	struct swapped_entry_t *entry;
 
 	while (pages < SWAP_CLUSTER_MAX) {
+		int got_page;
+
 		if (!prefetch_suitable())
 			goto out;
 		/* Lock is held? We must be busy elsewhere */
@@ -276,8 +315,12 @@ static int trickle_swap(void)
 			struct swapped_entry_t, swapped_list);
 		spin_unlock(&swapped_root.lock);
 
-		if (trickle_swap_cache_async(entry->swp_entry))
-			pages++;
+		got_page = trickle_swap_cache_async(entry->swp_entry);
+		if (unlikely(got_page == -1)) {
+			ret = -1;
+			goto out_unlock;
+		}
+		pages += got_page;
 	}
 	ret = 1;
 	goto out;
