Index: linux-2.6.13-ck5/include/linux/swap.h
===================================================================
--- linux-2.6.13-ck5.orig/include/linux/swap.h	2005-09-17 12:14:37.000000000 +1000
+++ linux-2.6.13-ck5/include/linux/swap.h	2005-09-20 23:11:13.000000000 +1000
@@ -187,11 +187,16 @@ extern int shmem_unuse(swp_entry_t entry
 extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *);
 
 #ifdef CONFIG_SWAP_PREFETCH
+/* only used by prefetch externally */
 /*	mm/swap_prefetch.c */
 extern void prepare_prefetch(void);
 extern void add_to_swapped_list(unsigned long index);
 extern void remove_from_swapped_list(unsigned long index);
 extern void delay_prefetch(void);
+/* linux/mm/page_alloc.c */
+extern struct page *
+buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags);
+extern void zone_statistics(struct zonelist *zonelist, struct zone *z);
 
 #else	/* CONFIG_SWAP_PREFETCH */
 static inline void add_to_swapped_list(unsigned long index)
Index: linux-2.6.13-ck5/mm/page_alloc.c
===================================================================
--- linux-2.6.13-ck5.orig/mm/page_alloc.c	2005-09-17 12:14:36.000000000 +1000
+++ linux-2.6.13-ck5/mm/page_alloc.c	2005-09-20 23:10:54.000000000 +1000
@@ -607,7 +607,7 @@ void drain_local_pages(void)
 }
 #endif /* CONFIG_PM */
 
-static void zone_statistics(struct zonelist *zonelist, struct zone *z)
+void zone_statistics(struct zonelist *zonelist, struct zone *z)
 {
 #ifdef CONFIG_NUMA
 	unsigned long flags;
@@ -684,7 +684,7 @@ static inline void prep_zero_page(struct
  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
  * or two.
  */
-static struct page *
+struct page *
 buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
 {
 	unsigned long flags;
Index: linux-2.6.13-ck5/mm/swap_prefetch.c
===================================================================
--- linux-2.6.13-ck5.orig/mm/swap_prefetch.c	2005-09-17 12:14:38.000000000 +1000
+++ linux-2.6.13-ck5/mm/swap_prefetch.c	2005-09-23 16:56:00.000000000 +1000
@@ -20,13 +20,12 @@
 #define PREFETCH_INTERVAL (HZ)
 
 struct swapped_root_t {
+	unsigned long		busy;	/* bit 0 set: prefetch is delayed */
 	spinlock_t		lock;
+	struct list_head	list;	/* of swapped_entry_t.swapped_list */
 	unsigned int		count;
 	unsigned int		maxcount;
 	kmem_cache_t		*cache;
-	struct list_head	list;
-	int 			busy;
-	spinlock_t		busylock;
 };
 
 struct swapped_entry_t {
@@ -35,30 +34,36 @@ struct swapped_entry_t {
 };
 
 static struct swapped_root_t swapped_root = {
-	.count = 0,
 	.list  = LIST_HEAD_INIT(swapped_root.list),
+	.count = 0,
 };
 
 static struct timer_list prefetch_timer;
 
 static DECLARE_WAIT_QUEUE_HEAD(kprefetchd_wait);
 
+static unsigned long mapped_limit;	/* set in prepare_prefetch(), tested in prefetch_suitable() */
+
 /*
  * Create kmem cache for swapped entries
  */
 void prepare_prefetch(void)
 {
-	swapped_root.cache = kmem_cache_create("swapped_entry",
-		sizeof(struct swapped_entry_t), 0, 0, NULL, NULL);
+	long total_memory = nr_free_pagecache_pages();
+	long se_size = sizeof(struct swapped_entry_t);
+
+	swapped_root.cache = kmem_cache_create("swapped_entry", se_size,
+		0, 0, NULL, NULL);
 	if (unlikely(!swapped_root.cache))
 		panic("prepare_prefetch(): cannot create swapped_entry SLAB cache");
 
-	/*
-	 * Set max count of swapped entries
-	 */
-	swapped_root.maxcount = nr_free_pagecache_pages();
+	/* Max swapped entries: as many as fit in 5% of total_memory pages */
+	swapped_root.maxcount = (total_memory / 20) * (PAGE_SIZE / se_size);
+	/* prefetch_suitable() delays once its usage sum exceeds 2/3 of this */
+	mapped_limit = total_memory / 3 * 2;
+
 	spin_lock_init(&swapped_root.lock);
-	spin_lock_init(&swapped_root.busylock);
+	swapped_root.busy = 0;
 }
 
 static inline void delay_prefetch_timer(void)
@@ -73,17 +78,19 @@ static inline void reset_prefetch_timer(
 
 /*
  * We check to see no part of the vm is busy. If it is this will interrupt
- * trickle_swap and wait another PREFETCH_DELAY
+ * trickle_swap and wait another PREFETCH_DELAY. Purposefully racy.
  */
 void delay_prefetch(void)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&swapped_root.busylock, flags);
-	swapped_root.busy = 1;
-	spin_unlock_irqrestore(&swapped_root.busylock, flags);
+	__set_bit(0, &swapped_root.busy);	/* cleared by prefetch_suitable() */
 }
 
+/*
+ * Accounting is sloppy on purpose. As adding and removing entries from the
+ * list happens during swapping in and out we don't want to be spinning on
+ * locks. It is cheaper to just miss adding an entry since having a reference
+ * to every entry is not critical.
+ */
 void add_to_swapped_list(unsigned long index)
 {
 	struct swapped_entry_t *entry;
@@ -91,15 +98,6 @@ void add_to_swapped_list(unsigned long i
 	unsigned long flags;
 	int error;
 
-	/* Adding to the list? We must be busy */
-	delay_prefetch();
-
-	/*
-	 * It is not critical to add every entry to the swapped list and
-	 * since we're adding to the swapped list when we're swapping
-	 * out it is not a good time to be spinning to acquire the lock so
-	 * just don't add this entry to the list.
-	 */
 	if (unlikely(!spin_trylock_irqsave(&swapped_root.lock, flags)))
 		goto out;
 
@@ -141,13 +139,18 @@ out:
 	return;
 }
 
+/*
+ * Cheaper to not spin on the lock and remove the entry lazily via
+ * add_to_swap_cache when we hit it in trickle_swap_cache_async
+ */
 void remove_from_swapped_list(unsigned long index)
 {
 	struct address_space *mapping = &swapper_space;
 	struct swapped_entry_t *entry;
 	unsigned long flags;
 
-	spin_lock_irqsave(&swapped_root.lock, flags);
+	if (unlikely(!spin_trylock_irqsave(&swapped_root.lock, flags)))
+		return;
 	entry = radix_tree_delete(&mapping->swap_tree, index);
 	if (entry) {
 		list_del_init(&entry->swapped_list);
@@ -158,69 +161,114 @@ void remove_from_swapped_list(unsigned l
 }
 
 /*
+ * Find the zone with the most free pages, recheck the watermarks and
+ * then directly allocate the ram. We don't want prefetch to use
+ * __alloc_pages and go calling on reclaim.
+ *
+ * Returns the allocated page, or NULL when any populated zone has under
+ * 3 * pages_high free, no zone is eligible, or buffered_rmqueue() fails.
+ */
+static struct page * prefetch_get_page(void)
+{
+	struct zone *zone = NULL, *z;
+	struct page *page = NULL;
+	long most_free = 0;
+
+	for_each_zone(z) {
+		long free = z->free_pages;
+
+		if (z->present_pages == 0)
+			continue;
+
+		/* Check yet again we are above watermarks, by now likely */
+		if (unlikely(free < z->pages_high * 3))
+			goto out;
+
+		/* We don't prefetch into DMA */
+		if (zone_idx(z) == ZONE_DMA)
+			continue;
+
+		/* Reasonably stressed zone, bypass it */
+		if (z->prev_priority < DEF_PRIORITY / 2)
+			continue;
+
+		/* Select the zone with the most free ram */
+		if (free > most_free) {
+			most_free = free;
+			zone = z;
+		}
+	}
+
+	if (zone == NULL)
+		goto out;
+
+	page = buffered_rmqueue(zone, 0, GFP_HIGHUSER);
+	if (likely(page)) {
+		struct zonelist *zonelist;
+
+		zonelist = NODE_DATA(numa_node_id())->node_zonelists +
+			(GFP_HIGHUSER & GFP_ZONEMASK);
+		zone_statistics(zonelist, zone);
+	}
+out:
+	return page;
+}
+
+/*
  * This tries to read a swp_entry_t into swap cache for swap prefetching.
  */
 static int trickle_swap_cache_async(swp_entry_t entry)
 {
-	struct page *found_page, *new_page = NULL;
+	struct page *page = NULL;
 	struct address_space *mapping = &swapper_space;
+	unsigned long flags;
 
-	/* May already exist, check it as cheaply as possible */
-	read_lock_irq(&mapping->tree_lock);
-	found_page = radix_tree_lookup(&mapping->page_tree, entry.val);
-	read_unlock_irq(&mapping->tree_lock);
-	if (found_page) {
+	/* Already in swap cache? Look without spinning on tree_lock */
+	local_irq_save(flags);
+	if (unlikely(!read_trylock(&mapping->tree_lock))) {
+		local_irq_restore(flags);
+		goto out_delay;	/* tree_lock is contended */
+	}
+	page = radix_tree_lookup(&mapping->page_tree, entry.val);
+	read_unlock_irqrestore(&mapping->tree_lock, flags);
+	if (page) {
 		remove_from_swapped_list(entry.val);
 		goto out;
 	}
 
 	/* Get a new page to read from swap */
-	new_page = alloc_page_vma(GFP_HIGHUSER, NULL, 0);
-	if (unlikely(!new_page)) {
-		/* Bad - out of memory */
-		delay_prefetch();
-		goto out;
-	}
+	page = prefetch_get_page();
+	if (unlikely(!page))
+		goto out_delay;	/* prefetch_get_page() gave us nothing */
 
-	if (add_to_swap_cache(new_page, entry)) {
+	if (add_to_swap_cache(page, entry)) {
 		/* Failed to add to swap cache */
-		page_cache_release(new_page);
+		page_cache_release(page);
 		goto out;
 	}
 
-	lru_cache_add_active(new_page);
-	swap_readpage(NULL, new_page);
+	lru_cache_add_active(page);
+	swap_readpage(NULL, page);
 	return 1;
+out_delay:
+	return -1;	/* makes trickle_swap() leave its loop with ret = -1 */
 out:
 	return 0;
 }
 
-static int test_clear_busy(void)
-{
-	int ret;
-
-	/* Lock is held? We must be busy */
-	if (unlikely(!spin_trylock(&swapped_root.busylock))) {
-		ret = 1;
-		goto out;
-	}
-	ret = swapped_root.busy;
-	swapped_root.busy = 0;
-	spin_unlock(&swapped_root.busylock);
-out:
-	return ret;
-}
-
 /*
  * We want to be absolutely certain it's ok to start prefetching.
  */
 static int prefetch_suitable(void)
 {
-	unsigned long pending_writes;
+	struct page_state ps;
+	unsigned long pending_writes, limit;
 	struct zone *z;
 
-	if (test_clear_busy())
+	/* Purposefully racy and might return false positive which is ok */
+	if (__test_and_clear_bit(0, &swapped_root.busy))
 		goto out_delay;
+
 	/*
 	 * Have some hysteresis between where page reclaiming and prefetching
 	 * will occur to prevent ping-ponging between them.
@@ -232,17 +280,34 @@ static int prefetch_suitable(void)
 			goto out_delay;
 	}
 
+	get_page_state(&ps);
+
 	/* We shouldn't prefetch when we are doing writeback */
-	if (read_page_state(nr_writeback))
+	if (ps.nr_writeback)
 		goto out_delay;
 
 	/* Delay prefetching if we have significant amounts of dirty data */
-	pending_writes = read_page_state(nr_dirty) +
-		read_page_state(nr_unstable);
+	pending_writes = ps.nr_dirty + ps.nr_unstable;
 	if (pending_writes > SWAP_CLUSTER_MAX)
 		goto out_delay;
 
-	/* Survived all that? Hooray! */
+	/* >2/3 of the ram is mapped, we need some free for pagecache */
+	limit = ps.nr_mapped + ps.nr_slab + pending_writes;
+	if (limit > mapped_limit)
+		goto out_delay;
+
+	/*
+	 * Add swapcache to limit as well, but check this last since it needs
+	 * locking
+	 */
+	if (unlikely(!read_trylock(&swapper_space.tree_lock)))
+		goto out_delay;
+	limit += total_swapcache_pages;
+	read_unlock(&swapper_space.tree_lock);
+	if (limit > mapped_limit)
+		goto out_delay;
+
+	/* Survived all that? Hooray we can prefetch! */
 	return 1;
 
 out_delay:
@@ -263,6 +328,8 @@ static int trickle_swap(void)
 	struct swapped_entry_t *entry;
 
 	while (pages < SWAP_CLUSTER_MAX) {
+		int got_page;
+
 		if (!prefetch_suitable())
 			goto out;
 		/* Lock is held? We must be busy elsewhere */
@@ -276,8 +343,12 @@ static int trickle_swap(void)
 			struct swapped_entry_t, swapped_list);
 		spin_unlock(&swapped_root.lock);
 
-		if (trickle_swap_cache_async(entry->swp_entry))
-			pages++;
+		got_page = trickle_swap_cache_async(entry->swp_entry);
+		if (unlikely(got_page == -1)) {
+			ret = -1;
+			goto out_unlock;
+		}
+		pages += got_page;
 	}
 	ret = 1;
 	goto out;
