Swap prefetch tweaks for 2.6.13-ck5.

include/linux/swap.h, mm/page_alloc.c:
  Make buffered_rmqueue() and zone_statistics() non-static and declare them
  under CONFIG_SWAP_PREFETCH so that mm/swap_prefetch.c can call them.

mm/swap_prefetch.c:
  - struct swapped_root_t: reorder the fields (busy/busylock first, list
    right after lock) and reorder the static initializer to match.
  - prepare_prefetch(): set maxcount to
    (total_memory / 20) * (PAGE_SIZE / se_size) and record
    mapped_limit = total_memory / 3 * 2, where total_memory is
    nr_free_pagecache_pages().
  - add_to_swapped_list(): no longer calls delay_prefetch().
  - prefetch_get_page(): new helper that takes an order-0 page directly from
    buffered_rmqueue() on populated non-DMA zones instead of going through
    alloc_page_vma().
  - trickle_swap_cache_async(): use local_irq_save() + read_trylock() on the
    swapper_space tree_lock; return -1 when the lock is contended or no page
    is available, 0 when the entry is skipped, 1 when a read was started.
  - prefetch_suitable(): run the writeback/dirty checks before the watermark
    hysteresis check, and refuse to prefetch when nr_mapped alone, or
    nr_mapped + total_swapcache_pages, exceeds mapped_limit.
  - trickle_swap(): stop with ret = -1 when trickle_swap_cache_async()
    returns -1.

Review fix:
  prefetch_get_page() had an unbraced "if (page)" followed by
  zone_statistics() and "break", so the break was unconditional and the
  for_each_zone() loop gave up after the first eligible zone even when
  buffered_rmqueue() returned NULL.  The if body is now braced so the loop
  only stops once a page has been obtained.  This adds one line, so the
  later mm/swap_prefetch.c hunk offsets move down by one.

Review notes:
  NOTE(review): prefetch_suitable() takes swapper_space.tree_lock with a
  plain read_trylock()/read_unlock(), while trickle_swap_cache_async()
  disables interrupts around the same lock - confirm whether IRQs need to
  be off here as well.
  NOTE(review): trickle_swap() jumps to out_unlock after
  spin_unlock(&swapped_root.lock); the label is outside the context of this
  patch - confirm it does not unlock a second time.
  NOTE(review): indentation and blank context lines were reconstructed from
  the hunk line counts; verify against the tree (or apply with "patch -l").

Index: linux-2.6.13-ck5/include/linux/swap.h
===================================================================
--- linux-2.6.13-ck5.orig/include/linux/swap.h	2005-09-17 12:14:37.000000000 +1000
+++ linux-2.6.13-ck5/include/linux/swap.h	2005-09-20 23:11:13.000000000 +1000
@@ -187,11 +187,16 @@ extern int shmem_unuse(swp_entry_t entry
 extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *);
 
 #ifdef CONFIG_SWAP_PREFETCH
+/* only used by prefetch externally */
 /* mm/swap_prefetch.c */
 extern void prepare_prefetch(void);
 extern void add_to_swapped_list(unsigned long index);
 extern void remove_from_swapped_list(unsigned long index);
 extern void delay_prefetch(void);
+/* linux/mm/page_alloc.c */
+extern struct page *
+buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags);
+extern void zone_statistics(struct zonelist *zonelist, struct zone *z);
 
 #else /* CONFIG_SWAP_PREFETCH */
 static inline void add_to_swapped_list(unsigned long index)
Index: linux-2.6.13-ck5/mm/page_alloc.c
===================================================================
--- linux-2.6.13-ck5.orig/mm/page_alloc.c	2005-09-17 12:14:36.000000000 +1000
+++ linux-2.6.13-ck5/mm/page_alloc.c	2005-09-20 23:10:54.000000000 +1000
@@ -607,7 +607,7 @@ void drain_local_pages(void)
 }
 #endif /* CONFIG_PM */
 
-static void zone_statistics(struct zonelist *zonelist, struct zone *z)
+void zone_statistics(struct zonelist *zonelist, struct zone *z)
 {
 #ifdef CONFIG_NUMA
 	unsigned long flags;
@@ -684,7 +684,7 @@ static inline void prep_zero_page(struct
  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
  * or two.
  */
-static struct page *
+struct page *
 buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
 {
 	unsigned long flags;
Index: linux-2.6.13-ck5/mm/swap_prefetch.c
===================================================================
--- linux-2.6.13-ck5.orig/mm/swap_prefetch.c	2005-09-17 12:14:38.000000000 +1000
+++ linux-2.6.13-ck5/mm/swap_prefetch.c	2005-09-22 22:09:29.000000000 +1000
@@ -20,13 +20,13 @@
 #define PREFETCH_INTERVAL (HZ)
 
 struct swapped_root_t {
+	int busy;
+	spinlock_t busylock;
 	spinlock_t lock;
+	struct list_head list;
 	unsigned int count;
 	unsigned int maxcount;
 	kmem_cache_t *cache;
-	struct list_head list;
-	int busy;
-	spinlock_t busylock;
 };
 
 struct swapped_entry_t {
@@ -35,28 +35,34 @@ struct swapped_entry_t {
 };
 
 static struct swapped_root_t swapped_root = {
-	.count = 0,
 	.list = LIST_HEAD_INIT(swapped_root.list),
+	.count = 0,
 };
 
 static struct timer_list prefetch_timer;
 
 static DECLARE_WAIT_QUEUE_HEAD(kprefetchd_wait);
 
+static unsigned long mapped_limit;
+
 /*
  * Create kmem cache for swapped entries
  */
 void prepare_prefetch(void)
 {
-	swapped_root.cache = kmem_cache_create("swapped_entry",
-		sizeof(struct swapped_entry_t), 0, 0, NULL, NULL);
+	long total_memory = nr_free_pagecache_pages();
+	long se_size = sizeof(struct swapped_entry_t);
+
+	swapped_root.cache = kmem_cache_create("swapped_entry", se_size,
+		0, 0, NULL, NULL);
 	if (unlikely(!swapped_root.cache))
 		panic("prepare_prefetch(): cannot create swapped_entry SLAB cache");
 
-	/*
-	 * Set max count of swapped entries
-	 */
-	swapped_root.maxcount = nr_free_pagecache_pages();
+	/* Set max count of swapped entries to 5% ram */
+	swapped_root.maxcount = (total_memory / 20) * (PAGE_SIZE / se_size);
+	/* Set maximum amount of mapped pages to prefetch to 2/3 ram */
+	mapped_limit = total_memory / 3 * 2;
+
 	spin_lock_init(&swapped_root.lock);
 	spin_lock_init(&swapped_root.busylock);
 }
@@ -91,9 +97,6 @@ void add_to_swapped_list(unsigned long i
 	unsigned long flags;
 	int error;
 
-	/* Adding to the list? We must be busy */
-	delay_prefetch();
-
 	/*
 	 * It is not critical to add every entry to the swapped list and
 	 * since we're adding to the swapped list when we're swapping
@@ -158,39 +161,69 @@ void remove_from_swapped_list(unsigned l
 }
 
 /*
+ * This is a very lightweight function to get a page to prefetch into. The
+ * watermarks should already have been checked prior to this and we don't
+ * want to start reclaiming so we shouldn't do this in __alloc_pages.
+ */
+static struct page * prefetch_get_page(void)
+{
+	struct zone *z;
+	struct page *page = NULL;
+	struct zonelist *zonelist;
+
+	zonelist = NODE_DATA(numa_node_id())->node_zonelists +
+		(GFP_HIGHUSER & GFP_ZONEMASK);
+
+	for_each_zone(z) {
+		if (zone_idx(z) == ZONE_DMA || z->present_pages == 0)
+			continue;
+		page = buffered_rmqueue(z, 0, GFP_HIGHUSER);
+		if (page) {
+			zone_statistics(zonelist, z);
+			break;
+		}
+	}
+	return page;
+}
+
+/*
  * This tries to read a swp_entry_t into swap cache for swap prefetching.
  */
 static int trickle_swap_cache_async(swp_entry_t entry)
 {
-	struct page *found_page, *new_page = NULL;
+	struct page *page = NULL;
 	struct address_space *mapping = &swapper_space;
+	unsigned long flags;
 
-	/* May already exist, check it as cheaply as possible */
-	read_lock_irq(&mapping->tree_lock);
-	found_page = radix_tree_lookup(&mapping->page_tree, entry.val);
-	read_unlock_irq(&mapping->tree_lock);
-	if (found_page) {
+	/* Entry may already exist, check it as cheaply as possible */
+	local_irq_save(flags);
+	if (unlikely(!read_trylock(&mapping->tree_lock))) {
+		local_irq_restore(flags);
+		goto out_delay;
+	}
+	page = radix_tree_lookup(&mapping->page_tree, entry.val);
+	read_unlock_irqrestore(&mapping->tree_lock, flags);
+	if (page) {
 		remove_from_swapped_list(entry.val);
 		goto out;
 	}
 
 	/* Get a new page to read from swap */
-	new_page = alloc_page_vma(GFP_HIGHUSER, NULL, 0);
-	if (unlikely(!new_page)) {
-		/* Bad - out of memory */
-		delay_prefetch();
-		goto out;
-	}
+	page = prefetch_get_page();
+	if (unlikely(!page))
+		goto out_delay;
 
-	if (add_to_swap_cache(new_page, entry)) {
+	if (add_to_swap_cache(page, entry)) {
 		/* Failed to add to swap cache */
-		page_cache_release(new_page);
+		page_cache_release(page);
 		goto out;
 	}
 
-	lru_cache_add_active(new_page);
-	swap_readpage(NULL, new_page);
+	lru_cache_add_active(page);
+	swap_readpage(NULL, page);
 	return 1;
+out_delay:
+	return -1;
 out:
 	return 0;
 }
@@ -216,11 +249,22 @@ out:
  */
 static int prefetch_suitable(void)
 {
-	unsigned long pending_writes;
+	unsigned long pending_writes, limit;
 	struct zone *z;
 
 	if (test_clear_busy())
 		goto out_delay;
+
+	/* We shouldn't prefetch when we are doing writeback */
+	if (read_page_state(nr_writeback))
+		goto out_delay;
+
+	/* Delay prefetching if we have significant amounts of dirty data */
+	pending_writes = read_page_state(nr_dirty) +
+		read_page_state(nr_unstable);
+	if (pending_writes > SWAP_CLUSTER_MAX)
+		goto out_delay;
+
 	/*
 	 * Have some hysteresis between where page reclaiming and prefetching
 	 * will occur to prevent ping-ponging between them.
@@ -232,17 +276,23 @@ static int prefetch_suitable(void)
 			goto out_delay;
 	}
 
-	/* We shouldn't prefetch when we are doing writeback */
-	if (read_page_state(nr_writeback))
+	/* >2/3 of the ram is mapped, we need some free for pagecache */
+	limit = read_page_state(nr_mapped);
+	if (limit > mapped_limit)
 		goto out_delay;
 
-	/* Delay prefetching if we have significant amounts of dirty data */
-	pending_writes = read_page_state(nr_dirty) +
-		read_page_state(nr_unstable);
-	if (pending_writes > SWAP_CLUSTER_MAX)
+	/*
+	 * Add swapcache to limit as well, but check this last since it needs
+	 * locking
+	 */
+	if (unlikely(!read_trylock(&swapper_space.tree_lock)))
+		goto out_delay;
+	limit += total_swapcache_pages;
+	read_unlock(&swapper_space.tree_lock);
+	if (limit > mapped_limit)
 		goto out_delay;
 
-	/* Survived all that? Hooray! */
+	/* Survived all that? Hooray we can prefetch! */
 	return 1;
 
 out_delay:
@@ -263,6 +313,8 @@ static int trickle_swap(void)
 	struct swapped_entry_t *entry;
 
 	while (pages < SWAP_CLUSTER_MAX) {
+		int got_page;
+
 		if (!prefetch_suitable())
 			goto out;
 		/* Lock is held? We must be busy elsewhere */
@@ -276,8 +328,12 @@ static int trickle_swap(void)
 			struct swapped_entry_t, swapped_list);
 		spin_unlock(&swapped_root.lock);
 
-		if (trickle_swap_cache_async(entry->swp_entry))
-			pages++;
+		got_page = trickle_swap_cache_async(entry->swp_entry);
+		if (unlikely(got_page == -1)) {
+			ret = -1;
+			goto out_unlock;
+		}
+		pages += got_page;
 	}
 	ret = 1;
 	goto out;