Index: linux-2.6.14-rc2-ck1/mm/swap_prefetch.c =================================================================== --- linux-2.6.14-rc2-ck1.orig/mm/swap_prefetch.c 2005-09-23 00:22:02.000000000 +1000 +++ linux-2.6.14-rc2-ck1/mm/swap_prefetch.c 2005-09-23 16:17:35.000000000 +1000 @@ -20,8 +20,7 @@ #define PREFETCH_INTERVAL (HZ) struct swapped_root_t { - int busy; - spinlock_t busylock; + unsigned long busy; spinlock_t lock; struct list_head list; unsigned int count; @@ -64,7 +63,7 @@ void prepare_prefetch(void) mapped_limit = total_memory / 3 * 2; spin_lock_init(&swapped_root.lock); - spin_lock_init(&swapped_root.busylock); + swapped_root.busy = 0; } static inline void delay_prefetch_timer(void) @@ -79,17 +78,19 @@ static inline void reset_prefetch_timer( /* * We check to see no part of the vm is busy. If it is this will interrupt - * trickle_swap and wait another PREFETCH_DELAY + * trickle_swap and wait another PREFETCH_DELAY. Purposefully racy: the busy bit is set without taking a lock. */ void delay_prefetch(void) { - unsigned long flags; - - spin_lock_irqsave(&swapped_root.busylock, flags); - swapped_root.busy = 1; - spin_unlock_irqrestore(&swapped_root.busylock, flags); + __set_bit(0, &swapped_root.busy); } +/* + * Accounting is sloppy on purpose. Entries are added to and removed from the + * list during swap-in and swap-out, so we don't want to be spinning on + * locks. It is cheaper to just miss adding an entry, since having a reference + * to every entry is not critical. + */ void add_to_swapped_list(unsigned long index) { struct swapped_entry_t *entry; @@ -97,12 +98,6 @@ void add_to_swapped_list(unsigned long i unsigned long flags; int error; - /* - * It is not critical to add every entry to the swapped list and - * since we're adding to the swapped list when we're swapping - * out it is not a good time to be spinning to acquire the lock so - * just don't add this entry to the list. 
- */ if (unlikely(!spin_trylock_irqsave(&swapped_root.lock, flags))) goto out; @@ -144,13 +139,18 @@ out: return; } +/* + * If the lock is contended, skip the removal instead of spinning; the entry is + * then removed lazily via add_to_swap_cache when trickle_swap_cache_async hits it. + */ void remove_from_swapped_list(unsigned long index) { struct address_space *mapping = &swapper_space; struct swapped_entry_t *entry; unsigned long flags; - spin_lock_irqsave(&swapped_root.lock, flags); + if (unlikely(!spin_trylock_irqsave(&swapped_root.lock, flags))) + return; entry = radix_tree_delete(&mapping->swap_tree, index); if (entry) { list_del_init(&entry->swapped_list); @@ -161,27 +161,56 @@ void remove_from_swapped_list(unsigned l } /* - * This is a very lightweight function to get a page to prefetch into. The - * watermarks should already have been checked prior to this and we don't - * want to start reclaiming so we shouldn't do this in __alloc_pages. + * Find the zone with the most free pages, recheck the watermarks and + * then directly allocate the ram. We don't want prefetch to use + * __alloc_pages and go calling on reclaim. 
*/ static struct page * prefetch_get_page(void) { - struct zone *z; + struct zone *zone = NULL, *z; struct page *page = NULL; - struct zonelist *zonelist; - - zonelist = NODE_DATA(numa_node_id())->node_zonelists + - (GFP_HIGHUSER & GFP_ZONEMASK); + long most_free = 0; for_each_zone(z) { - if (zone_idx(z) == ZONE_DMA || z->present_pages == 0) + long free; + + if (z->present_pages == 0) + continue; + + free = z->free_pages; + + /* Recheck the watermarks: any populated zone below 3 * pages_high aborts the prefetch */ + if (unlikely(free < z->pages_high * 3)) + goto out; + + /* We don't prefetch into DMA */ + if (zone_idx(z) == ZONE_DMA) + continue; + + /* Reasonably stressed zone, bypass it */ + if (z->prev_priority < DEF_PRIORITY / 2) continue; - page = buffered_rmqueue(z, 0, GFP_HIGHUSER); - if (page) - zone_statistics(zonelist, z); - break; + + /* Select the zone with the most free ram */ + if (free > most_free) { + most_free = free; + zone = z; + } } + + if (zone == NULL) + goto out; + + page = buffered_rmqueue(zone, 0, GFP_HIGHUSER); + if (likely(page)) { + struct zonelist *zonelist; + + zonelist = NODE_DATA(numa_node_id())->node_zonelists + + (GFP_HIGHUSER & GFP_ZONEMASK); + + zone_statistics(zonelist, zone); + } +out: return page; } @@ -194,7 +223,7 @@ static int trickle_swap_cache_async(swp_ struct address_space *mapping = &swapper_space; unsigned long flags; - /* Entry may already exist, check it as cheaply as possible */ + /* Entry may already exist */ local_irq_save(flags); if (unlikely(!read_trylock(&mapping->tree_lock))) { local_irq_restore(flags); @@ -227,41 +256,17 @@ out: return 0; } -static int test_clear_busy(void) -{ - int ret; - - /* Lock is held? We must be busy */ - if (unlikely(!spin_trylock(&swapped_root.busylock))) { - ret = 1; - goto out; - } - ret = swapped_root.busy; - swapped_root.busy = 0; - spin_unlock(&swapped_root.busylock); -out: - return ret; -} - /* * We want to be absolutely certain it's ok to start prefetching. 
*/ static int prefetch_suitable(void) { + struct page_state ps; unsigned long pending_writes, limit; struct zone *z; - if (test_clear_busy()) - goto out_delay; - - /* We shouldn't prefetch when we are doing writeback */ - if (read_page_state(nr_writeback)) - goto out_delay; - - /* Delay prefetching if we have significant amounts of dirty data */ - pending_writes = read_page_state(nr_dirty) + - read_page_state(nr_unstable); - if (pending_writes > SWAP_CLUSTER_MAX) + /* Purposefully racy (no lock is taken) and might return a false positive, which is ok */ + if (__test_and_clear_bit(0, &swapped_root.busy)) goto out_delay; /* @@ -275,8 +280,19 @@ static int prefetch_suitable(void) goto out_delay; } + get_page_state(&ps); + + /* We shouldn't prefetch when we are doing writeback */ + if (ps.nr_writeback) + goto out_delay; + + /* Delay prefetching if we have significant amounts of dirty data */ + pending_writes = ps.nr_dirty + ps.nr_unstable; + if (pending_writes > SWAP_CLUSTER_MAX) + goto out_delay; + - /* >2/3 of the ram is mapped, we need some free for pagecache */ - limit = read_page_state(nr_mapped); + /* >2/3 of the ram is mapped, slab or pending writes, we need some free for pagecache */ + limit = ps.nr_mapped + ps.nr_slab + pending_writes; if (limit > mapped_limit) goto out_delay;