Index: linux-2.6.13-ck5/include/linux/swap.h
===================================================================
--- linux-2.6.13-ck5.orig/include/linux/swap.h	2005-09-17 12:14:37.000000000 +1000
+++ linux-2.6.13-ck5/include/linux/swap.h	2005-09-20 23:11:13.000000000 +1000
@@ -187,11 +187,16 @@ extern int shmem_unuse(swp_entry_t entry
 extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *);
 
 #ifdef CONFIG_SWAP_PREFETCH
 /* mm/swap_prefetch.c */
 extern void prepare_prefetch(void);
 extern void add_to_swapped_list(unsigned long index);
 extern void remove_from_swapped_list(unsigned long index);
 extern void delay_prefetch(void);
+/* linux/mm/page_alloc.c */
+/* only used by prefetch externally */
+extern struct page *
+buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags);
+extern void zone_statistics(struct zonelist *zonelist, struct zone *z);
 
 #else /* CONFIG_SWAP_PREFETCH */
 static inline void add_to_swapped_list(unsigned long index)
Index: linux-2.6.13-ck5/mm/page_alloc.c
===================================================================
--- linux-2.6.13-ck5.orig/mm/page_alloc.c	2005-09-17 12:14:36.000000000 +1000
+++ linux-2.6.13-ck5/mm/page_alloc.c	2005-09-20 23:10:54.000000000 +1000
@@ -607,7 +607,7 @@ void drain_local_pages(void)
 }
 #endif /* CONFIG_PM */
 
-static void zone_statistics(struct zonelist *zonelist, struct zone *z)
+void zone_statistics(struct zonelist *zonelist, struct zone *z)
 {
 #ifdef CONFIG_NUMA
 	unsigned long flags;
@@ -684,7 +684,7 @@ static inline void prep_zero_page(struct
  * we cheat by calling it from here, in the order > 0 path. Saves a branch
  * or two.
  */
-static struct page *
+struct page *
 buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
 {
 	unsigned long flags;
Index: linux-2.6.13-ck5/mm/swap_prefetch.c
===================================================================
--- linux-2.6.13-ck5.orig/mm/swap_prefetch.c	2005-09-17 12:14:38.000000000 +1000
+++ linux-2.6.13-ck5/mm/swap_prefetch.c	2005-09-23 16:56:00.000000000 +1000
@@ -20,13 +20,12 @@
 #define PREFETCH_INTERVAL (HZ)
 
 struct swapped_root_t {
+	unsigned long busy;
 	spinlock_t lock;
+	struct list_head list;
 	unsigned int count;
 	unsigned int maxcount;
 	kmem_cache_t *cache;
-	struct list_head list;
-	int busy;
-	spinlock_t busylock;
 };
 
 struct swapped_entry_t {
@@ -35,30 +34,36 @@ struct swapped_entry_t {
 };
 
 static struct swapped_root_t swapped_root = {
-	.count = 0,
 	.list = LIST_HEAD_INIT(swapped_root.list),
+	.count = 0,
 };
 
 static struct timer_list prefetch_timer;
 
 static DECLARE_WAIT_QUEUE_HEAD(kprefetchd_wait);
 
+static unsigned long mapped_limit;
+
 /*
  * Create kmem cache for swapped entries
  */
 void prepare_prefetch(void)
 {
-	swapped_root.cache = kmem_cache_create("swapped_entry",
-		sizeof(struct swapped_entry_t), 0, 0, NULL, NULL);
+	long total_memory = nr_free_pagecache_pages();
+	long se_size = sizeof(struct swapped_entry_t);
+
+	swapped_root.cache = kmem_cache_create("swapped_entry", se_size,
+		0, 0, NULL, NULL);
 	if (unlikely(!swapped_root.cache))
 		panic("prepare_prefetch(): cannot create swapped_entry SLAB cache");
 
-	/*
-	 * Set max count of swapped entries
-	 */
-	swapped_root.maxcount = nr_free_pagecache_pages();
+	/* Set max count of swapped entries to 5% ram */
+	swapped_root.maxcount = (total_memory / 20) * (PAGE_SIZE / se_size);
+	/* Set maximum amount of mapped pages to prefetch to 2/3 ram */
+	mapped_limit = total_memory / 3 * 2;
+
 	spin_lock_init(&swapped_root.lock);
-	spin_lock_init(&swapped_root.busylock);
+	swapped_root.busy = 0;
 }
 
 static inline void delay_prefetch_timer(void)
@@ -73,17 +78,19 @@ static inline void reset_prefetch_timer(
 
 /*
  * We check to see no part of the vm is busy. If it is this will interrupt
- * trickle_swap and wait another PREFETCH_DELAY
+ * trickle_swap and wait another PREFETCH_DELAY. Purposefully racy.
  */
 void delay_prefetch(void)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&swapped_root.busylock, flags);
-	swapped_root.busy = 1;
-	spin_unlock_irqrestore(&swapped_root.busylock, flags);
+	__set_bit(0, &swapped_root.busy);
 }
 
+/*
+ * Accounting is sloppy on purpose. As adding and removing entries from the
+ * list happens during swapping in and out we don't want to be spinning on
+ * locks. It is cheaper to just miss adding an entry since having a reference
+ * to every entry is not critical.
+ */
 void add_to_swapped_list(unsigned long index)
 {
 	struct swapped_entry_t *entry;
@@ -91,15 +98,6 @@ void add_to_swapped_list(unsigned long i
 	unsigned long flags;
 	int error;
 
-	/* Adding to the list? We must be busy */
-	delay_prefetch();
-
-	/*
-	 * It is not critical to add every entry to the swapped list and
-	 * since we're adding to the swapped list when we're swapping
-	 * out it is not a good time to be spinning to acquire the lock so
-	 * just don't add this entry to the list.
-	 */
 	if (unlikely(!spin_trylock_irqsave(&swapped_root.lock, flags)))
 		goto out;
 
@@ -141,13 +139,18 @@ out:
 	return;
 }
 
+/*
+ * Cheaper to not spin on the lock and remove the entry lazily via
+ * add_to_swap_cache when we hit it in trickle_swap_cache_async
+ */
 void remove_from_swapped_list(unsigned long index)
 {
 	struct address_space *mapping = &swapper_space;
 	struct swapped_entry_t *entry;
 	unsigned long flags;
 
-	spin_lock_irqsave(&swapped_root.lock, flags);
+	if (unlikely(!spin_trylock_irqsave(&swapped_root.lock, flags)))
+		return;
 	entry = radix_tree_delete(&mapping->swap_tree, index);
 	if (entry) {
 		list_del_init(&entry->swapped_list);
@@ -158,69 +161,116 @@ void remove_from_swapped_list(unsigned l
 }
 
 /*
+ * Find the zone with the most free pages, recheck the watermarks and
+ * then directly allocate the ram. We don't want prefetch to use
+ * __alloc_pages and go calling on reclaim. Returns NULL when no page
+ * could be taken.
+ */
+static struct page *prefetch_get_page(void)
+{
+	struct zone *zone = NULL, *z;
+	struct page *page = NULL;
+	long most_free = 0;
+
+	for_each_zone(z) {
+		long free;
+
+		if (z->present_pages == 0)
+			continue;
+
+		free = z->free_pages;
+
+		/* Check yet again we are above watermarks, by now likely */
+		if (unlikely(free < z->pages_high * 3))
+			goto out;
+
+		/* We don't prefetch into DMA */
+		if (zone_idx(z) == ZONE_DMA)
+			continue;
+
+		/* Reasonably stressed zone, bypass it */
+		if (z->prev_priority < DEF_PRIORITY / 2)
+			continue;
+
+		/* Select the zone with the most free ram */
+		if (free > most_free) {
+			most_free = free;
+			zone = z;
+		}
+	}
+
+	if (zone == NULL)
+		goto out;
+
+	page = buffered_rmqueue(zone, 0, GFP_HIGHUSER);
+	if (likely(page)) {
+		struct zonelist *zonelist;
+
+		zonelist = NODE_DATA(numa_node_id())->node_zonelists +
+			(GFP_HIGHUSER & GFP_ZONEMASK);
+		zone_statistics(zonelist, zone);
+	}
+out:
+	return page;
+}
+
+/*
  * This tries to read a swp_entry_t into swap cache for swap prefetching.
+ * Returns 1 when a read was started, 0 when the entry was skipped, and
+ * -1 when prefetching should be put off.
  */
 static int trickle_swap_cache_async(swp_entry_t entry)
 {
-	struct page *found_page, *new_page = NULL;
+	struct page *page = NULL;
 	struct address_space *mapping = &swapper_space;
+	unsigned long flags;
 
-	/* May already exist, check it as cheaply as possible */
-	read_lock_irq(&mapping->tree_lock);
-	found_page = radix_tree_lookup(&mapping->page_tree, entry.val);
-	read_unlock_irq(&mapping->tree_lock);
-	if (found_page) {
+	/* Entry may already exist */
+	local_irq_save(flags);
+	if (unlikely(!read_trylock(&mapping->tree_lock))) {
+		local_irq_restore(flags);
+		goto out_delay;
+	}
+	page = radix_tree_lookup(&mapping->page_tree, entry.val);
+	read_unlock_irqrestore(&mapping->tree_lock, flags);
+	if (page) {
 		remove_from_swapped_list(entry.val);
 		goto out;
 	}
 
 	/* Get a new page to read from swap */
-	new_page = alloc_page_vma(GFP_HIGHUSER, NULL, 0);
-	if (unlikely(!new_page)) {
-		/* Bad - out of memory */
-		delay_prefetch();
-		goto out;
-	}
+	page = prefetch_get_page();
+	if (unlikely(!page))
+		goto out_delay;
 
-	if (add_to_swap_cache(new_page, entry)) {
+	if (add_to_swap_cache(page, entry)) {
 		/* Failed to add to swap cache */
-		page_cache_release(new_page);
+		page_cache_release(page);
 		goto out;
 	}
 
-	lru_cache_add_active(new_page);
-	swap_readpage(NULL, new_page);
+	lru_cache_add_active(page);
+	swap_readpage(NULL, page);
 	return 1;
+out_delay:
+	return -1;
 out:
 	return 0;
 }
 
-static int test_clear_busy(void)
-{
-	int ret;
-
-	/* Lock is held? We must be busy */
-	if (unlikely(!spin_trylock(&swapped_root.busylock))) {
-		ret = 1;
-		goto out;
-	}
-	ret = swapped_root.busy;
-	swapped_root.busy = 0;
-	spin_unlock(&swapped_root.busylock);
-out:
-	return ret;
-}
-
 /*
  * We want to be absolutely certain it's ok to start prefetching.
  */
 static int prefetch_suitable(void)
 {
-	unsigned long pending_writes;
+	struct page_state ps;
+	unsigned long pending_writes, limit, flags;
 	struct zone *z;
 
-	if (test_clear_busy())
+	/* Purposefully racy and might return false positive which is ok */
+	if (__test_and_clear_bit(0, &swapped_root.busy))
 		goto out_delay;
+
 	/*
 	 * Have some hysteresis between where page reclaiming and prefetching
 	 * will occur to prevent ping-ponging between them.
@@ -232,17 +282,38 @@ static int prefetch_suitable(void)
 			goto out_delay;
 	}
 
+	get_page_state(&ps);
+
 	/* We shouldn't prefetch when we are doing writeback */
-	if (read_page_state(nr_writeback))
+	if (ps.nr_writeback)
 		goto out_delay;
 
 	/* Delay prefetching if we have significant amounts of dirty data */
-	pending_writes = read_page_state(nr_dirty) +
-		read_page_state(nr_unstable);
+	pending_writes = ps.nr_dirty + ps.nr_unstable;
 	if (pending_writes > SWAP_CLUSTER_MAX)
 		goto out_delay;
 
-	/* Survived all that? Hooray! */
+	/* >2/3 of the ram is mapped, we need some free for pagecache */
+	limit = ps.nr_mapped + ps.nr_slab + pending_writes;
+	if (limit > mapped_limit)
+		goto out_delay;
+
+	/*
+	 * Add swapcache to limit as well, but check this last since it needs
+	 * locking. Interrupts are disabled around tree_lock here the same way
+	 * trickle_swap_cache_async does it.
+	 */
+	local_irq_save(flags);
+	if (unlikely(!read_trylock(&swapper_space.tree_lock))) {
+		local_irq_restore(flags);
+		goto out_delay;
+	}
+	limit += total_swapcache_pages;
+	read_unlock_irqrestore(&swapper_space.tree_lock, flags);
+	if (limit > mapped_limit)
+		goto out_delay;
+
+	/* Survived all that? Hooray we can prefetch! */
 	return 1;
 
 out_delay:
@@ -263,6 +334,8 @@ static int trickle_swap(void)
 	struct swapped_entry_t *entry;
 
 	while (pages < SWAP_CLUSTER_MAX) {
+		int got_page;
+
 		if (!prefetch_suitable())
 			goto out;
 		/* Lock is held? We must be busy elsewhere */
@@ -276,8 +349,13 @@ static int trickle_swap(void)
 			struct swapped_entry_t, swapped_list);
 		spin_unlock(&swapped_root.lock);
 
-		if (trickle_swap_cache_async(entry->swp_entry))
-			pages++;
+		got_page = trickle_swap_cache_async(entry->swp_entry);
+		if (unlikely(got_page == -1)) {
+			ret = -1;
+			/* swapped_root.lock was already dropped above */
+			goto out;
+		}
+		pages += got_page;
 	}
 	ret = 1;
 	goto out;