Index: linux-2.6.13-ck5/include/linux/swap.h
===================================================================
--- linux-2.6.13-ck5.orig/include/linux/swap.h	2005-09-17 12:14:37.000000000 +1000
+++ linux-2.6.13-ck5/include/linux/swap.h	2005-09-20 23:11:13.000000000 +1000
@@ -187,11 +187,16 @@ extern int shmem_unuse(swp_entry_t entry
 extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *);
 
 #ifdef CONFIG_SWAP_PREFETCH
+/* only used by prefetch externally */
 /* mm/swap_prefetch.c */
 extern void prepare_prefetch(void);
 extern void add_to_swapped_list(unsigned long index);
 extern void remove_from_swapped_list(unsigned long index);
 extern void delay_prefetch(void);
+/* linux/mm/page_alloc.c */
+extern struct page *
+buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags);
+extern void zone_statistics(struct zonelist *zonelist, struct zone *z);
 
 #else /* CONFIG_SWAP_PREFETCH */
 static inline void add_to_swapped_list(unsigned long index)
Index: linux-2.6.13-ck5/mm/page_alloc.c
===================================================================
--- linux-2.6.13-ck5.orig/mm/page_alloc.c	2005-09-17 12:14:36.000000000 +1000
+++ linux-2.6.13-ck5/mm/page_alloc.c	2005-09-20 23:10:54.000000000 +1000
@@ -607,7 +607,7 @@ void drain_local_pages(void)
 }
 #endif /* CONFIG_PM */
 
-static void zone_statistics(struct zonelist *zonelist, struct zone *z)
+void zone_statistics(struct zonelist *zonelist, struct zone *z)
 {
 #ifdef CONFIG_NUMA
 	unsigned long flags;
@@ -684,7 +684,7 @@ static inline void prep_zero_page(struct
  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
  * or two.
  */
-static struct page *
+struct page *
 buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
 {
 	unsigned long flags;
Index: linux-2.6.13-ck5/mm/swap_prefetch.c
===================================================================
--- linux-2.6.13-ck5.orig/mm/swap_prefetch.c	2005-09-17 12:14:38.000000000 +1000
+++ linux-2.6.13-ck5/mm/swap_prefetch.c	2005-09-20 23:33:34.000000000 +1000
@@ -43,20 +43,26 @@ static struct timer_list prefetch_timer;
 
 static DECLARE_WAIT_QUEUE_HEAD(kprefetchd_wait);
 
+static unsigned long mapped_limit;
+
 /*
  * Create kmem cache for swapped entries
  */
 void prepare_prefetch(void)
 {
-	swapped_root.cache = kmem_cache_create("swapped_entry",
-		sizeof(struct swapped_entry_t), 0, 0, NULL, NULL);
+	long total_memory = nr_free_pagecache_pages();
+	long se_size = sizeof(struct swapped_entry_t);
+
+	swapped_root.cache = kmem_cache_create("swapped_entry", se_size,
+		0, 0, NULL, NULL);
 	if (unlikely(!swapped_root.cache))
 		panic("prepare_prefetch(): cannot create swapped_entry SLAB cache");
 
-	/*
-	 * Set max count of swapped entries
-	 */
-	swapped_root.maxcount = nr_free_pagecache_pages();
+	/* Set max count of swapped entries to 5% ram */
+	swapped_root.maxcount = (total_memory / 20) * (PAGE_SIZE / se_size);
+	/* Set maximum amount of mapped pages to prefetch to 2/3 ram */
+	mapped_limit = total_memory / 3 * 2;
+
 	spin_lock_init(&swapped_root.lock);
 	spin_lock_init(&swapped_root.busylock);
 }
@@ -91,9 +97,6 @@ void add_to_swapped_list(unsigned long i
 	unsigned long flags;
 	int error;
 
-	/* Adding to the list? We must be busy */
-	delay_prefetch();
-
 	/*
 	 * It is not critical to add every entry to the swapped list and
 	 * since we're adding to the swapped list when we're swapping
@@ -158,39 +161,76 @@ void remove_from_swapped_list(unsigned l
 }
 
 /*
+ * This is a very lightweight function to get a page to prefetch into. The
+ * watermarks should already have been checked prior to this and we don't
+ * want to start reclaiming so we shouldn't do this in __alloc_pages.
+ *
+ * Tries every zone except ZONE_DMA and zones with no present pages, and
+ * returns the first page buffered_rmqueue() yields, or NULL if none did.
+ */
+static struct page *prefetch_get_page(void)
+{
+	struct zone *z;
+	struct page *page = NULL;
+	struct zonelist *zonelist;
+
+	zonelist = NODE_DATA(numa_node_id())->node_zonelists +
+		(GFP_HIGHUSER & GFP_ZONEMASK);
+
+	for_each_zone(z) {
+		/* Match the DMA zone by index, not by comparing name pointers */
+		if (zone_idx(z) == ZONE_DMA || z->present_pages == 0)
+			continue;
+		page = buffered_rmqueue(z, 0, GFP_HIGHUSER);
+		if (page) {
+			zone_statistics(zonelist, z);
+			break;
+		}
+	}
+	return page;
+}
+
+/*
  * This tries to read a swp_entry_t into swap cache for swap prefetching.
+ * Returns 1 once a new page has been handed to swap_readpage(), 0 if the
+ * entry was already in swap cache or add_to_swap_cache() failed, and -1 if
+ * the tree lock could not be taken or no page could be obtained.
  */
 static int trickle_swap_cache_async(swp_entry_t entry)
 {
-	struct page *found_page, *new_page = NULL;
+	struct page *page = NULL;
 	struct address_space *mapping = &swapper_space;
+	unsigned long flags;
 
-	/* May already exist, check it as cheaply as possible */
-	read_lock_irq(&mapping->tree_lock);
-	found_page = radix_tree_lookup(&mapping->page_tree, entry.val);
-	read_unlock_irq(&mapping->tree_lock);
-	if (found_page) {
+	/* Entry may already exist, check it as cheaply as possible */
+	local_irq_save(flags);
+	if (unlikely(!read_trylock(&mapping->tree_lock))) {
+		local_irq_restore(flags);
+		goto out_delay;
+	}
+	page = radix_tree_lookup(&mapping->page_tree, entry.val);
+	read_unlock_irqrestore(&mapping->tree_lock, flags);
+	if (page) {
 		remove_from_swapped_list(entry.val);
 		goto out;
 	}
 
 	/* Get a new page to read from swap */
-	new_page = alloc_page_vma(GFP_HIGHUSER, NULL, 0);
-	if (unlikely(!new_page)) {
-		/* Bad - out of memory */
-		delay_prefetch();
-		goto out;
-	}
+	page = prefetch_get_page();
+	if (unlikely(!page))
+		goto out_delay;
 
-	if (add_to_swap_cache(new_page, entry)) {
+	if (add_to_swap_cache(page, entry)) {
 		/* Failed to add to swap cache */
-		page_cache_release(new_page);
+		page_cache_release(page);
 		goto out;
 	}
 
-	lru_cache_add_active(new_page);
-	swap_readpage(NULL, new_page);
+	lru_cache_add_active(page);
+	swap_readpage(NULL, page);
 	return 1;
+out_delay:
+	return -1;
 out:
 	return 0;
 }
@@ -221,16 +261,6 @@ static int prefetch_suitable(void)
 	if (test_clear_busy())
 		goto out_delay;
 
-	/*
-	 * Have some hysteresis between where page reclaiming and prefetching
-	 * will occur to prevent ping-ponging between them.
-	 */
-	for_each_zone(z) {
-		if (z->present_pages == 0)
-			continue;
-		if (z->pages_high * 3 > z->free_pages)
-			goto out_delay;
-	}
 
 	/* We shouldn't prefetch when we are doing writeback */
 	if (read_page_state(nr_writeback))
@@ -242,7 +272,22 @@ static int prefetch_suitable(void)
 	if (pending_writes > SWAP_CLUSTER_MAX)
 		goto out_delay;
 
-	/* Survived all that? Hooray! */
+	/* >2/3 of the ram is mapped, we need some free for pagecache */
+	if (read_page_state(nr_mapped) > mapped_limit)
+		goto out_delay;
+
+	/*
+	 * Have some hysteresis between where page reclaiming and prefetching
+	 * will occur to prevent ping-ponging between them.
+	 */
+	for_each_zone(z) {
+		if (z->present_pages == 0)
+			continue;
+		if (z->pages_high * 3 > z->free_pages)
+			goto out_delay;
+	}
+
+	/* Survived all that? Hooray we can prefetch! */
 	return 1;
 
 out_delay:
@@ -263,6 +308,8 @@ static int trickle_swap(void)
 	struct swapped_entry_t *entry;
 
 	while (pages < SWAP_CLUSTER_MAX) {
+		int got_page;
+
 		if (!prefetch_suitable())
 			goto out;
 		/* Lock is held? We must be busy elsewhere */
@@ -276,8 +323,16 @@ static int trickle_swap(void)
 				struct swapped_entry_t, swapped_list);
 		spin_unlock(&swapped_root.lock);
 
-		if (trickle_swap_cache_async(entry->swp_entry))
-			pages++;
+		got_page = trickle_swap_cache_async(entry->swp_entry);
+		if (unlikely(got_page == -1)) {
+			/*
+			 * NOTE(review): swapped_root.lock was dropped above;
+			 * confirm out_unlock does not unlock it again.
+			 */
+			ret = -1;
+			goto out_unlock;
+		}
+		pages += got_page;
 	}
 	ret = 1;
 	goto out;