Documentation/sysctl/vm.txt | 46 + drivers/block/loop.c | 6 fs/mpage.c | 4 fs/nfsd/vfs.c | 6 include/linux/fs.h | 23 include/linux/mm.h | 32 include/linux/page-flags.h | 38 include/linux/radix-tree.h | 80 + include/linux/sysctl.h | 3 include/linux/writeback.h | 6 kernel/sysctl.c | 34 lib/radix-tree.c | 208 ++++ mm/filemap.c | 115 +- mm/memory.c | 1 mm/page-writeback.c | 2 mm/page_alloc.c | 3 mm/readahead.c | 1856 +++++++++++++++++++++++++++++++++++++++++++- mm/swap.c | 11 mm/vmscan.c | 18 19 files changed, 2394 insertions(+), 98 deletions(-) Index: linux-2.6.15-rc5-ck1/include/linux/page-flags.h =================================================================== --- linux-2.6.15-rc5-ck1.orig/include/linux/page-flags.h +++ linux-2.6.15-rc5-ck1/include/linux/page-flags.h @@ -75,6 +75,8 @@ #define PG_reclaim 17 /* To be reclaimed asap */ #define PG_nosave_free 18 /* Free, should not be written */ #define PG_uncached 19 /* Page has been mapped as uncached */ +#define PG_activate 20 /* delayed activate */ +#define PG_readahead 21 /* check readahead when reading this page */ /* * Global page accounting. One instance per CPU. Only unsigned longs are @@ -104,6 +106,8 @@ struct page_state { unsigned long pgfree; /* page freeings */ unsigned long pgactivate; /* pages moved inactive->active */ unsigned long pgdeactivate; /* pages moved active->inactive */ + unsigned long pgkeephot; /* pages sent back to active */ + unsigned long pgkeepcold; /* pages sent back to inactive */ unsigned long pgfault; /* faults (major+minor) */ unsigned long pgmajfault; /* faults (major only) */ @@ -303,6 +307,16 @@ extern void __mod_page_state(unsigned lo #define SetPageUncached(page) set_bit(PG_uncached, &(page)->flags) #define ClearPageUncached(page) clear_bit(PG_uncached, &(page)->flags) +#define PageActivate(page) test_bit(PG_activate, &(page)->flags) +#define SetPageActivate(page) set_bit(PG_activate, &(page)->flags) +#define ClearPageActivate(page) clear_bit(PG_activate, &(page)->flags) +#define TestClearPageActivate(page) test_and_clear_bit(PG_activate, &(page)->flags) +#define TestSetPageActivate(page) test_and_set_bit(PG_activate, &(page)->flags) + +#define PageReadahead(page) test_bit(PG_readahead, &(page)->flags) +#define SetPageReadahead(page) set_bit(PG_readahead, &(page)->flags) +#define TestClearPageReadahead(page) test_and_clear_bit(PG_readahead, &(page)->flags) + struct page; /* forward declaration */ int test_clear_page_dirty(struct page *page); @@ -319,4 +333,28 @@ static inline void set_page_writeback(st test_set_page_writeback(page); } +#if PG_activate < PG_referenced +#error unexpected page flags order +#endif + +#define PAGE_REFCNT_0 0 +#define PAGE_REFCNT_1 (1 << PG_referenced) +#define PAGE_REFCNT_2 (1 << PG_activate) +#define PAGE_REFCNT_3 ((1 << PG_activate) | (1 << PG_referenced)) +#define PAGE_REFCNT_MASK PAGE_REFCNT_3 + +/* + * STATUS REFERENCE COUNT + * __ 0 + * _R PAGE_REFCNT_1 + * A_ PAGE_REFCNT_2 + * AR PAGE_REFCNT_3 + * + * A/R: Active / Referenced + */ +static inline unsigned long page_refcnt(struct page *page) +{ + return page->flags & PAGE_REFCNT_MASK; +} + #endif /* PAGE_FLAGS_H */ Index: linux-2.6.15-rc5-ck1/mm/page_alloc.c =================================================================== --- linux-2.6.15-rc5-ck1.orig/mm/page_alloc.c +++ linux-2.6.15-rc5-ck1/mm/page_alloc.c @@ -498,6 +498,7 @@ static int prep_new_page(struct page *pa page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1 | + 1 << PG_activate | 1 << PG_readahead | 1 << PG_checked | 1 << PG_mappedtodisk); set_page_private(page, 0); set_page_refs(page, order); @@ -2274,6 +2275,8 @@ static char *vmstat_text[] = { "pgfree", "pgactivate", "pgdeactivate", + "pgkeephot", + "pgkeepcold", "pgfault", "pgmajfault", Index: linux-2.6.15-rc5-ck1/mm/swap.c =================================================================== --- linux-2.6.15-rc5-ck1.orig/mm/swap.c +++ linux-2.6.15-rc5-ck1/mm/swap.c @@ -29,7 +29,6 @@ #include #include #include -#include /* How many pages do we try to swap or page in/out together? */ int page_cluster; @@ -114,16 +113,18 @@ void fastcall activate_page(struct page * Mark a page as having seen activity. * * inactive,unreferenced -> inactive,referenced - * inactive,referenced -> active,unreferenced - * active,unreferenced -> active,referenced + * inactive,referenced -> activate,unreferenced + * activate,unreferenced -> activate,referenced */ void fastcall mark_page_accessed(struct page *page) { - if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) { - activate_page(page); + if (!PageActivate(page) && PageReferenced(page) && PageLRU(page)) { + SetPageActivate(page); ClearPageReferenced(page); } else if (!PageReferenced(page)) { SetPageReferenced(page); + if (PageLRU(page) && !PageActivate(page)) + inc_readahead_aging(); } } Index: linux-2.6.15-rc5-ck1/mm/vmscan.c =================================================================== --- linux-2.6.15-rc5-ck1.orig/mm/vmscan.c +++ linux-2.6.15-rc5-ck1/mm/vmscan.c @@ -384,6 +384,8 @@ static pageout_t pageout(struct page *pa return PAGE_CLEAN; } +extern int readahead_live_chunk; + /* * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed */ @@ -392,10 +394,14 @@ static int shrink_list(struct list_head LIST_HEAD(ret_pages); struct pagevec freed_pvec; int pgactivate = 0; + int pgkeep = 0; int reclaimed = 0; cond_resched(); + if (readahead_live_chunk) + pgkeep += rescue_ra_pages(page_list, &ret_pages); + pagevec_init(&freed_pvec, 1); while (!list_empty(page_list)) { struct address_space *mapping; @@ -421,6 +427,15 @@ static int shrink_list(struct list_head if (PageWriteback(page)) goto keep_locked; + if (PageActivate(page)) { + ClearPageActivate(page); + ClearPageReferenced(page); + goto activate_locked; + } + + if (!PageReferenced(page)) + inc_readahead_aging(); + referenced = page_referenced(page, 1); /* In active use or really unfreeable? Activate it. */ if (referenced && page_mapping_inuse(page)) @@ -568,11 +583,13 @@ keep_locked: keep: list_add(&page->lru, &ret_pages); BUG_ON(PageLRU(page)); + pgkeep++; } list_splice(&ret_pages, page_list); if (pagevec_count(&freed_pvec)) __pagevec_release_nonlru(&freed_pvec); mod_page_state(pgactivate, pgactivate); + mod_page_state(pgkeepcold, pgkeep - pgactivate); sc->nr_reclaimed += reclaimed; return reclaimed; } @@ -836,6 +853,7 @@ refill_inactive_zone(struct zone *zone, mod_page_state_zone(zone, pgrefill, pgscanned); mod_page_state(pgdeactivate, pgdeactivate); + mod_page_state(pgkeephot, pgmoved); } /* Index: linux-2.6.15-rc5-ck1/lib/radix-tree.c =================================================================== --- linux-2.6.15-rc5-ck1.orig/lib/radix-tree.c +++ linux-2.6.15-rc5-ck1/lib/radix-tree.c @@ -32,16 +32,7 @@ #include -#ifdef __KERNEL__ -#define RADIX_TREE_MAP_SHIFT 6 -#else -#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */ -#endif #define RADIX_TREE_TAGS 2 - -#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) -#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) - #define RADIX_TREE_TAG_LONGS \ ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) @@ -281,32 +272,86 @@ int radix_tree_insert(struct radix_tree_ } EXPORT_SYMBOL(radix_tree_insert); -static inline void **__lookup_slot(struct radix_tree_root *root, - unsigned long index) +/** + * radix_tree_lookup_node - low level lookup routine + * @root: radix tree root + * @index: index key + * @level: stop at that many levels from bottom + * + * Lookup the item at the position @index in the radix tree @root. + * The return value is: + * @level == 0: page at @index; + * @level == 1: the corresponding bottom level tree node; + * @level < height: (height - @level)th level tree node; + * @level >= height: root node. + */ +void *radix_tree_lookup_node(struct radix_tree_root *root, + unsigned long index, unsigned int level) { unsigned int height, shift; - struct radix_tree_node **slot; + struct radix_tree_node *slot; height = root->height; if (index > radix_tree_maxindex(height)) return NULL; shift = (height-1) * RADIX_TREE_MAP_SHIFT; - slot = &root->rnode; + slot = root->rnode; - while (height > 0) { - if (*slot == NULL) + while (height > level) { + if (slot == NULL) return NULL; - slot = (struct radix_tree_node **) - ((*slot)->slots + - ((index >> shift) & RADIX_TREE_MAP_MASK)); + slot = slot->slots[(index >> shift) & RADIX_TREE_MAP_MASK]; shift -= RADIX_TREE_MAP_SHIFT; height--; } - return (void **)slot; + return slot; } +EXPORT_SYMBOL(radix_tree_lookup_node); + +/** + * radix_tree_cache_lookup_node - cached lookup node + * @root: radix tree root + * @cache: look-aside cache + * @index: index key + * + * Lookup the item at the position @index in the radix tree @root, + * and return the node @level levels from the bottom in the search path. + * @cache stores the last accessed upper level tree node by this + * function, and is always checked first before searching in the tree. + * It can improve speed for access patterns with strong locality. + * NOTE: + * - The cache becomes invalid on leaving the lock; + * - Do not intermix calls with different @level. + */ +void *radix_tree_cache_lookup_node(struct radix_tree_root *root, + struct radix_tree_cache *cache, + unsigned long index, unsigned int level) +{ + struct radix_tree_node *node; + unsigned long i; + unsigned long mask; + + if (level >= root->height) + return root->rnode; + + i = ((index >> (level * RADIX_TREE_MAP_SHIFT)) & RADIX_TREE_MAP_MASK); + mask = ~((RADIX_TREE_MAP_SIZE << (level * RADIX_TREE_MAP_SHIFT)) - 1); + + if ((index & mask) == cache->first_index) + return cache->tree_node->slots[i]; + + node = radix_tree_lookup_node(root, index, level + 1); + if (!node) + return 0; + + cache->tree_node = node; + cache->first_index = (index & mask); + return node->slots[i]; +} +EXPORT_SYMBOL(radix_tree_cache_lookup_node); /** * radix_tree_lookup_slot - lookup a slot in a radix tree @@ -318,25 +363,134 @@ static inline void **__lookup_slot(struc */ void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index) { - return __lookup_slot(root, index); + struct radix_tree_node *node; + + node = radix_tree_lookup_node(root, index, 1); + return node->slots + (index & RADIX_TREE_MAP_MASK); } EXPORT_SYMBOL(radix_tree_lookup_slot); /** - * radix_tree_lookup - perform lookup operation on a radix tree + * radix_tree_cache_count - items in the cached node + * @cache: radix tree look-aside cache + * + * Query the number of items contained in the cached node. + */ +int radix_tree_cache_count(struct radix_tree_cache *cache) +{ + if (!(cache->first_index & RADIX_TREE_MAP_MASK)) + return cache->tree_node->count; + else + return 0; +} +EXPORT_SYMBOL(radix_tree_cache_count); + +/** + * radix_tree_lookup_head - lookup the head index * @root: radix tree root * @index: index key + * @max_scan: max items to scan * - * Lookup the item at the position @index in the radix tree @root. + * Lookup head index of the segment which contains @index. A segment is + * a set of continuous pages in a file. + * CASE RETURN VALUE + * no page at @index (not head) = @index + 1 + * found in the range @index - @max_scan < (head index) <= @index + * not found in range (unfinished head) <= @index - @max_scan */ -void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) +unsigned long radix_tree_lookup_head(struct radix_tree_root *root, + unsigned long index, unsigned int max_scan) { - void **slot; + struct radix_tree_cache cache; + struct radix_tree_node *node; + int i; + unsigned long origin; - slot = __lookup_slot(root, index); - return slot != NULL ? *slot : NULL; + origin = index; + if (unlikely(max_scan > index)) + max_scan = index; + radix_tree_cache_init(&cache); + +next_node: + if (origin - index > max_scan) + goto out; + + node = radix_tree_cache_lookup_node(root, &cache, index, 1); + if (!node) + goto out; + + if (node->count == RADIX_TREE_MAP_SIZE) { + if (index < RADIX_TREE_MAP_SIZE) { + index = -1; + goto out; + } + index = (index - RADIX_TREE_MAP_SIZE) | RADIX_TREE_MAP_MASK; + goto next_node; + } + + for (i = index & RADIX_TREE_MAP_MASK; i >= 0; i--, index--) { + if (!node->slots[i]) + goto out; + } + + goto next_node; + +out: + return index + 1; +} +EXPORT_SYMBOL(radix_tree_lookup_head); + +/** + * radix_tree_lookup_tail - lookup the tail index + * @root: radix tree root + * @index: index key + * @max_scan: max items to scan + * + * Lookup tail(pass the end) index of the segment which contains @index. + * A segment is a set of continuous pages in a file. + * CASE RETURN VALUE + * found in the range @index <= (tail index) < @index + @max_scan + * not found in range @index + @max_scan <= (non tail) + */ +unsigned long radix_tree_lookup_tail(struct radix_tree_root *root, + unsigned long index, unsigned int max_scan) +{ + struct radix_tree_cache cache; + struct radix_tree_node *node; + int i; + unsigned long origin; + + origin = index; + if (unlikely(index + max_scan < index)) + max_scan = LONG_MAX - index; + radix_tree_cache_init(&cache); + +next_node: + if (index - origin >= max_scan) + goto out; + + node = radix_tree_cache_lookup_node(root, &cache, index, 1); + if (!node) + goto out; + + if (node->count == RADIX_TREE_MAP_SIZE) { + index = (index | RADIX_TREE_MAP_MASK) + 1; + if (unlikely(!index)) + goto out; + goto next_node; + } + + for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++, index++) { + if (!node->slots[i]) + goto out; + } + + goto next_node; + +out: + return index; } -EXPORT_SYMBOL(radix_tree_lookup); +EXPORT_SYMBOL(radix_tree_lookup_tail); /** * radix_tree_tag_set - set a tag on a radix tree node Index: linux-2.6.15-rc5-ck1/include/linux/radix-tree.h =================================================================== --- linux-2.6.15-rc5-ck1.orig/include/linux/radix-tree.h +++ linux-2.6.15-rc5-ck1/include/linux/radix-tree.h @@ -22,12 +22,24 @@ #include #include +#define RADIX_TREE_MAP_SHIFT 6 +#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) +#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) + struct radix_tree_root { unsigned int height; gfp_t gfp_mask; struct radix_tree_node *rnode; }; +/* + * Support access patterns with strong locality. + */ +struct radix_tree_cache { + unsigned long first_index; + struct radix_tree_node *tree_node; +}; + #define RADIX_TREE_INIT(mask) { \ .height = 0, \ .gfp_mask = (mask), \ @@ -45,9 +57,18 @@ do { \ } while (0) int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); -void *radix_tree_lookup(struct radix_tree_root *, unsigned long); -void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); +void *radix_tree_lookup_node(struct radix_tree_root *, unsigned long, + unsigned int); +void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long); void *radix_tree_delete(struct radix_tree_root *, unsigned long); +int radix_tree_cache_count(struct radix_tree_cache *cache); +void *radix_tree_cache_lookup_node(struct radix_tree_root *root, + struct radix_tree_cache *cache, + unsigned long index, unsigned int level); +unsigned long radix_tree_lookup_head(struct radix_tree_root *root, + unsigned long index, unsigned int max_scan); +unsigned long radix_tree_lookup_tail(struct radix_tree_root *root, + unsigned long index, unsigned int max_scan); unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items); @@ -69,4 +90,59 @@ static inline void radix_tree_preload_en preempt_enable(); } +/** + * radix_tree_lookup - perform lookup operation on a radix tree + * @root: radix tree root + * @index: index key + * + * Lookup the item at the position @index in the radix tree @root. + */ +static inline void *radix_tree_lookup(struct radix_tree_root *root, + unsigned long index) +{ + return radix_tree_lookup_node(root, index, 0); +} + +/** + * radix_tree_cache_init - init the cache + * @cache: look-aside cache + * + * Init the @cache. + */ +static inline void radix_tree_cache_init(struct radix_tree_cache *cache) +{ + cache->first_index = RADIX_TREE_MAP_MASK; + cache->tree_node = NULL; +} + +/** + * radix_tree_cache_lookup - cached lookup page + * @root: radix tree root + * @cache: look-aside cache + * @index: index key + * + * Lookup the item at the position @index in the radix tree @root. + */ +static inline void *radix_tree_cache_lookup(struct radix_tree_root *root, + struct radix_tree_cache *cache, + unsigned long index) +{ + return radix_tree_cache_lookup_node(root, cache, index, 0); +} + +static inline int radix_tree_cache_size(struct radix_tree_cache *cache) +{ + return RADIX_TREE_MAP_SIZE; +} + +static inline int radix_tree_cache_full(struct radix_tree_cache *cache) +{ + return radix_tree_cache_count(cache) == radix_tree_cache_size(cache); +} + +static inline int radix_tree_cache_first_index(struct radix_tree_cache *cache) +{ + return cache->first_index; +} + #endif /* _LINUX_RADIX_TREE_H */ Index: linux-2.6.15-rc5-ck1/include/linux/mm.h =================================================================== --- linux-2.6.15-rc5-ck1.orig/include/linux/mm.h +++ linux-2.6.15-rc5-ck1/include/linux/mm.h @@ -906,7 +906,7 @@ extern int filemap_populate(struct vm_ar int write_one_page(struct page *page, int wait); /* readahead.c */ -#define VM_MAX_READAHEAD 128 /* kbytes */ +#define VM_MAX_READAHEAD 1024 /* kbytes */ #define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */ #define VM_MAX_CACHE_HIT 256 /* max pages in a row in cache before * turning readahead off */ @@ -923,6 +923,36 @@ unsigned long page_cache_readahead(struc void handle_ra_miss(struct address_space *mapping, struct file_ra_state *ra, pgoff_t offset); unsigned long max_sane_readahead(unsigned long nr); +unsigned long +page_cache_readahead_adaptive(struct address_space *mapping, + struct file_ra_state *ra, struct file *filp, + struct page *prev_page, struct page *page, + pgoff_t first_index, + pgoff_t index, pgoff_t last_index); +unsigned long +page_cache_readaround(struct address_space *mapping, struct file_ra_state *ra, + struct file *filp, pgoff_t index); +void fastcall ra_access(struct file_ra_state *ra, struct page *page); +int rescue_ra_pages(struct list_head *page_list, struct list_head *save_list); + +#ifdef CONFIG_DEBUG_FS +extern u32 readahead_debug_level; +#define READAHEAD_DEBUG_LEVEL(n) (readahead_debug_level >= n) +#else +#define READAHEAD_DEBUG_LEVEL(n) (0) +#endif + +extern int readahead_ratio; +static inline int prefer_adaptive_readahead(void) +{ + return readahead_ratio > 9; +} + +DECLARE_PER_CPU(unsigned long, readahead_aging); +static inline void inc_readahead_aging(void) +{ + __get_cpu_var(readahead_aging)++; +} /* Do stack extension */ extern int expand_stack(struct vm_area_struct *vma, unsigned long address); Index: linux-2.6.15-rc5-ck1/mm/readahead.c =================================================================== --- linux-2.6.15-rc5-ck1.orig/mm/readahead.c +++ linux-2.6.15-rc5-ck1/mm/readahead.c @@ -14,6 +14,331 @@ #include #include #include +#include +#include + +/* The default max/min read-ahead pages. */ +#define KB(size) (((size)*1024 + PAGE_CACHE_SIZE-1) / PAGE_CACHE_SIZE) +#define MAX_RA_PAGES KB(VM_MAX_READAHEAD) +#define MIN_RA_PAGES KB(VM_MIN_READAHEAD) +#define MIN_NFSD_PAGES KB(NFSSVC_MAXBLKSIZE/1024) + +/* In laptop mode, poll delayed look-ahead on every ## pages read. */ +#define LAPTOP_POLL_INTERVAL 16 + +/* Set look-ahead size to 1/# of the thrashing-threshold. */ +#define LOOKAHEAD_RATIO 8 + +/* Set read-ahead size to ##% of the thrashing-threshold. */ +int readahead_ratio = 50; +EXPORT_SYMBOL(readahead_ratio); + +/* Readahead as long as cache hit ratio keeps above 1/##. */ +int readahead_hit_rate = 2; +EXPORT_SYMBOL(readahead_hit_rate); + +/* Scan backward ## pages to find a live reader. */ +int readahead_live_chunk = 0; +EXPORT_SYMBOL(readahead_live_chunk); + +/* Analog to zone->aging_total. + * But mainly increased on fresh page references, so is much more smoother. + */ +DEFINE_PER_CPU(unsigned long, readahead_aging); +EXPORT_PER_CPU_SYMBOL(readahead_aging); + +/* Detailed classification of read-ahead behaviors. */ +#define RA_CLASS_SHIFT 4 +#define RA_CLASS_MASK ((1 << RA_CLASS_SHIFT) - 1) +enum ra_class { + RA_CLASS_ALL, + RA_CLASS_NEWFILE, + RA_CLASS_STATE, + RA_CLASS_CONTEXT, + RA_CLASS_CONTEXT_ACCELERATED, + RA_CLASS_AROUND, + RA_CLASS_BACKWARD, + RA_CLASS_RANDOM_THRASHING, + RA_CLASS_RANDOM_SEEK, + RA_CLASS_END, +}; + +/* Read-ahead events to be accounted. */ +enum ra_event { + RA_EVENT_CACHE_MISS, /* read cache misses */ + RA_EVENT_READRANDOM, /* random reads */ + RA_EVENT_IO_CONGESTION, /* io congestion */ + RA_EVENT_IO_CACHE_HIT, /* canceled io due to cache hit */ + RA_EVENT_IO_BLOCK, /* read on locked page */ + + RA_EVENT_READAHEAD, /* read-ahead issued */ + RA_EVENT_READAHEAD_HIT, /* read-ahead page hit */ + RA_EVENT_LOOKAHEAD, /* look-ahead issued */ + RA_EVENT_LOOKAHEAD_HIT, /* look-ahead mark hit */ + RA_EVENT_LOOKAHEAD_NOACTION, /* look-ahead mark ignored */ + RA_EVENT_READAHEAD_MMAP, /* read-ahead for memory mapped file */ + RA_EVENT_READAHEAD_EOF, /* read-ahead reaches EOF */ + RA_EVENT_READAHEAD_SHRINK, /* ra_size decreased, reflects var. */ + RA_EVENT_READAHEAD_THRASHING, /* read-ahead thrashing happened */ + RA_EVENT_READAHEAD_MUTILATE, /* read-ahead request mutilated */ + RA_EVENT_READAHEAD_RESCUE, /* read-ahead rescued */ + + RA_EVENT_END +}; + +#define next_page(pg) (list_entry((pg)->lru.prev, struct page, lru)) +#define prev_page(pg) (list_entry((pg)->lru.next, struct page, lru)) + +/* + * Debug facilities. + */ +#ifdef CONFIG_DEBUG_FS +#define DEBUG_READAHEAD +#endif + +#ifdef DEBUG_READAHEAD +#define dprintk(args...) \ + do { if (READAHEAD_DEBUG_LEVEL(1)) printk(KERN_DEBUG args); } while(0) +#define ddprintk(args...) \ + do { if (READAHEAD_DEBUG_LEVEL(2)) printk(KERN_DEBUG args); } while(0) +#else +#define dprintk(args...) do { } while(0) +#define ddprintk(args...) do { } while(0) +#endif + +#ifdef DEBUG_READAHEAD +#include +#include +#include +#include + +u32 readahead_debug_level = 0; +u32 debug_disable_stateful_method = 0; + +static struct dentry *readahead_events_dentry; +extern struct file_operations ra_debug_fops; + +static int __init readahead_init(void) +{ + struct dentry *root; + + root = debugfs_create_dir("readahead", NULL); + + debugfs_create_u32("debug_level", 0644, root, &readahead_debug_level); + debugfs_create_bool("disable_stateful_method", 0644, root, + &debug_disable_stateful_method); + + readahead_events_dentry = debugfs_create_file("events", + 0644, root, NULL, &ra_debug_fops); + + return 0; +} + +module_init(readahead_init) + +static inline int disable_stateful_method(void) +{ + return debug_disable_stateful_method; +} + +#else /* !DEBUG_READAHEAD */ + +static inline int disable_stateful_method(void) +{ + return 0; +} + +#endif + +/* + * Read-ahead events accounting. + */ +#ifdef DEBUG_READAHEAD + +static char *ra_class_name[] = { + "total", + "newfile", + "state", + "context", + "contexta", + "around", + "backward", + "onthrash", + "onraseek", + "none", +}; + +static char *ra_event_name[] = { + "cache_miss", + "read_random", + "io_congestion", + "io_cache_hit", + "io_block", + "readahead", + "readahead_hit", + "lookahead", + "lookahead_hit", + "lookahead_ignore", + "readahead_mmap", + "readahead_eof", + "readahead_shrink", + "readahead_thrash", + "readahead_mutilt", + "readahead_rescue", +}; + +static unsigned long ra_event_count[RA_CLASS_END+1][RA_EVENT_END+1][2]; + +static inline void ra_account(struct file_ra_state *ra, + enum ra_event e, int pages) +{ + enum ra_class c; + + if (e == RA_EVENT_READAHEAD_HIT && pages < 0) { + c = (ra->flags >> RA_CLASS_SHIFT) & RA_CLASS_MASK; + pages = -pages; + } else if (ra) + c = ra->flags & RA_CLASS_MASK; + else + c = RA_CLASS_END; + + if (!c) + c = RA_CLASS_END; + + ra_event_count[c][e][0] += 1; + ra_event_count[c][e][1] += pages; + + if (e == RA_EVENT_READAHEAD) + ra_event_count[c][RA_EVENT_END][1] += pages * pages; +} + +static int ra_account_show(struct seq_file *s, void *_) +{ + int i; + int c; + int e; + static char event_fmt[] = "%-16s"; + static char class_fmt[] = "%10s"; + static char item_fmt[] = "%10lu"; + static char percent_format[] = "%9lu%%"; + static char *table_name[] = { + "[table requests]", + "[table pages]", + "[table summary]"}; + + for (i = 0; i <= 1; i++) { + for (e = 0; e <= RA_EVENT_END; e++) { + ra_event_count[0][e][i] = 0; + for (c = 1; c <= RA_CLASS_END; c++) + ra_event_count[0][e][i] += + ra_event_count[c][e][i]; + } + + seq_printf(s, event_fmt, table_name[i]); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, class_fmt, ra_class_name[c]); + seq_puts(s, "\n"); + + for (e = 0; e < RA_EVENT_END; e++) { + if (e == RA_EVENT_READAHEAD_HIT && i == 0) + continue; + + seq_printf(s, event_fmt, ra_event_name[e]); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, item_fmt, + ra_event_count[c][e][i]); + seq_puts(s, "\n"); + } + seq_puts(s, "\n"); + } + + seq_printf(s, event_fmt, table_name[2]); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, class_fmt, ra_class_name[c]); + seq_puts(s, "\n"); + + seq_printf(s, event_fmt, "random_rate"); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, percent_format, + (ra_event_count[c][RA_EVENT_READRANDOM][0] * 100) / + (ra_event_count[c][RA_EVENT_READRANDOM][0] + + ra_event_count[c][RA_EVENT_READAHEAD][0] + 1)); + seq_puts(s, "\n"); + + seq_printf(s, event_fmt, "ra_hit_rate"); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, percent_format, + (ra_event_count[c][RA_EVENT_READAHEAD_HIT][1] * 100) / + (ra_event_count[c][RA_EVENT_READAHEAD][1] | 1)); + seq_puts(s, "\n"); + + seq_printf(s, event_fmt, "la_hit_rate"); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, percent_format, + (ra_event_count[c][RA_EVENT_LOOKAHEAD_HIT][0] * 100) / + (ra_event_count[c][RA_EVENT_LOOKAHEAD][0] | 1)); + seq_puts(s, "\n"); + + seq_printf(s, event_fmt, "var_ra_size"); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, item_fmt, + (ra_event_count[c][RA_EVENT_END][1] - + ra_event_count[c][RA_EVENT_READAHEAD][1] * + (ra_event_count[c][RA_EVENT_READAHEAD][1] / + (ra_event_count[c][RA_EVENT_READAHEAD][0] | 1))) / + (ra_event_count[c][RA_EVENT_READAHEAD][0] | 1)); + seq_puts(s, "\n"); + + seq_printf(s, event_fmt, "avg_ra_size"); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, item_fmt, + (ra_event_count[c][RA_EVENT_READAHEAD][1] + + ra_event_count[c][RA_EVENT_READAHEAD][0] / 2) / + (ra_event_count[c][RA_EVENT_READAHEAD][0] | 1)); + seq_puts(s, "\n"); + + seq_printf(s, event_fmt, "avg_la_size"); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, item_fmt, + (ra_event_count[c][RA_EVENT_LOOKAHEAD][1] + + ra_event_count[c][RA_EVENT_LOOKAHEAD][0] / 2) / + (ra_event_count[c][RA_EVENT_LOOKAHEAD][0] | 1)); + seq_puts(s, "\n"); + + return 0; +} + +static int ra_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, ra_account_show, NULL); +} + +static ssize_t ra_debug_write(struct file *file, const char __user *buf, + size_t size, loff_t *offset) +{ + if (file->f_dentry == readahead_events_dentry) + memset(ra_event_count, 0, sizeof(ra_event_count)); + return 1; +} + +static struct file_operations ra_debug_fops = { + .owner = THIS_MODULE, + .open = ra_debug_open, + .write = ra_debug_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#else /* !DEBUG_READAHEAD */ + +static inline void ra_account(struct file_ra_state *ra, + enum ra_event e, int pages) +{ +} + +#endif /* DEBUG_READAHEAD */ + void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) { @@ -21,7 +346,7 @@ void default_unplug_io_fn(struct backing EXPORT_SYMBOL(default_unplug_io_fn); struct backing_dev_info default_backing_dev_info = { - .ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE, + .ra_pages = MAX_RA_PAGES, .state = 0, .capabilities = BDI_CAP_MAP_COPY, .unplug_io_fn = default_unplug_io_fn, @@ -49,7 +374,7 @@ static inline unsigned long get_max_read static inline unsigned long get_min_readahead(struct file_ra_state *ra) { - return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; + return MIN_RA_PAGES; } static inline void ra_off(struct file_ra_state *ra) @@ -172,8 +497,10 @@ static int read_pages(struct address_spa if (!add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { mapping->a_ops->readpage(filp, page); - if (!pagevec_add(&lru_pvec, page)) + if (!pagevec_add(&lru_pvec, page)) { + cond_resched(); __pagevec_lru_add(&lru_pvec); + } } else { page_cache_release(page); } @@ -254,10 +581,11 @@ out: */ static int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read) + pgoff_t offset, unsigned long nr_to_read, + unsigned long lookahead_size) { struct inode *inode = mapping->host; - struct page *page; + struct page *page = NULL; unsigned long end_index; /* The last page we want to read */ LIST_HEAD(page_pool); int page_idx; @@ -267,7 +595,7 @@ __do_page_cache_readahead(struct address if (isize == 0) goto out; - end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); + end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); /* * Preallocate as many pages as we will need. @@ -275,21 +603,31 @@ __do_page_cache_readahead(struct address read_lock_irq(&mapping->tree_lock); for (page_idx = 0; page_idx < nr_to_read; page_idx++) { pgoff_t page_offset = offset + page_idx; - + if (page_offset > end_index) break; page = radix_tree_lookup(&mapping->page_tree, page_offset); - if (page) + if (page) { +#ifdef READAHEAD_STREAMING + if (prefer_adaptive_readahead() && + page_idx == nr_to_read - lookahead_size) + SetPageReadahead(page); +#endif continue; + } read_unlock_irq(&mapping->tree_lock); + cond_resched(); page = page_cache_alloc_cold(mapping); read_lock_irq(&mapping->tree_lock); if (!page) break; page->index = page_offset; list_add(&page->lru, &page_pool); + if (prefer_adaptive_readahead() && + page_idx == nr_to_read - lookahead_size) + SetPageReadahead(page); ret++; } read_unlock_irq(&mapping->tree_lock); @@ -326,7 +664,7 @@ int force_page_cache_readahead(struct ad if (this_chunk > nr_to_read) this_chunk = nr_to_read; err = __do_page_cache_readahead(mapping, filp, - offset, this_chunk); + offset, this_chunk, 0); if (err < 0) { ret = err; break; @@ -373,7 +711,7 @@ int do_page_cache_readahead(struct addre if (bdi_read_congested(mapping->backing_dev_info)) return -1; - return __do_page_cache_readahead(mapping, filp, offset, nr_to_read); + return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0); } /* @@ -393,7 +731,10 @@ blockable_page_cache_readahead(struct ad if (!block && bdi_read_congested(mapping->backing_dev_info)) return 0; - actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read); + actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0); + + dprintk("blockable-readahead(ino=%lu, ra=%lu+%lu) = %d\n", + mapping->host->i_ino, offset, nr_to_read, actual); return check_ra_success(ra, nr_to_read, actual); } @@ -569,3 +910,1496 @@ unsigned long max_sane_readahead(unsigne __get_zone_counts(&active, &inactive, &free, NODE_DATA(numa_node_id())); return min(nr, (inactive + free) / 2); } + +/* + * Adaptive read-ahead. + * + * Good read patterns are compact both in space and time. The read-ahead logic + * tries to grant larger read-ahead size to better readers under the constraint + * of system memory and load pressures. + * + * It employs two methods to estimate the max thrashing safe read-ahead size: + * 1. state based - the default one + * 2. context based - the fail safe one + * The integration of the dual methods has the merit of being agile and robust. + * It makes the overall design clean: special cases are handled in general by + * the stateless method, leaving the stateful one simple and fast. + * + * To improve throughput and decrease read delay, the logic 'looks ahead'. + * In most read-ahead chunks, one page will be selected and tagged with + * PG_readahead. Later when the page with PG_readahead is read, the logic + * will be notified to submit the next read-ahead chunk in advance. + * + * a read-ahead chunk + * +-----------------------------------------+ + * | # PG_readahead | + * +-----------------------------------------+ + * ^ When this page is read, notify me for the next read-ahead. + * + * + * Here are some variable names used frequently: + * + * |<------- la_size ------>| + * +-----------------------------------------+ + * | # | + * +-----------------------------------------+ + * ra_index -->|<---------------- ra_size -------------->| + * + */ + +/* + * The nature of read-ahead allows most tests to fail or even be wrong. + * Here we just do not bother to call get_page(), it's meaningless anyway. + */ +static inline struct page *__find_page(struct address_space *mapping, + pgoff_t offset) +{ + return radix_tree_lookup(&mapping->page_tree, offset); +} + +static inline struct page *find_page(struct address_space *mapping, + pgoff_t offset) +{ + struct page *page; + + read_lock_irq(&mapping->tree_lock); + page = __find_page(mapping, offset); + read_unlock_irq(&mapping->tree_lock); +#ifdef DEBUG_READAHEAD_RADIXTREE + if (page) + BUG_ON(page->index != offset); +#endif + return page; +} + +/* + * Move pages in danger (of thrashing) to the head of inactive_list. + * Not expected to happen frequently. + */ +static int rescue_pages(struct page *page, pgoff_t nr_pages) +{ + unsigned long pgrescue; + pgoff_t index; + struct address_space *mapping; + struct zone *zone; + + BUG_ON(!nr_pages || !page); + pgrescue = 0; + index = page_index(page); + mapping = page_mapping(page); + + dprintk("rescue_pages(ino=%lu, index=%lu nr=%lu)\n", + mapping->host->i_ino, index, nr_pages); + + for(;;) { + zone = page_zone(page); + spin_lock_irq(&zone->lru_lock); + + if (!PageLRU(page)) + goto out_unlock; + + while (page_mapping(page) == mapping && + page_index(page) == index) { + struct page *the_page = page; + page = next_page(page); + if (!PageActive(the_page) && + !PageActivate(the_page) && + !PageLocked(the_page) && + page_count(the_page) == 1) { + list_move(&the_page->lru, &zone->inactive_list); + pgrescue++; + } + index++; + if (!--nr_pages) + goto out_unlock; + } + + spin_unlock_irq(&zone->lru_lock); + + cond_resched(); + page = find_page(mapping, index); + if (!page) + goto out; + } +out_unlock: + spin_unlock_irq(&zone->lru_lock); +out: + ra_account(0, RA_EVENT_READAHEAD_RESCUE, pgrescue); + + return nr_pages ? index : 0; +} + +/* + * Set a new look-ahead mark at @new_index. + * Return 0 if the new mark is successfully set. + */ +static inline int renew_lookahead(struct address_space *mapping, + struct file_ra_state *ra, + pgoff_t index, pgoff_t new_index) +{ + struct page *page; + + if (index == ra->lookahead_index && + new_index >= ra->readahead_index) + return 1; + + page = find_page(mapping, new_index); + if (!page) + return 1; + + SetPageReadahead(page); + if (ra->lookahead_index == index) + ra->lookahead_index = new_index; + return 0; +} + +/* + * State based calculation of read-ahead request. + * + * This figure shows the meaning of file_ra_state members: + * + * chunk A chunk B + * +---------------------------+-------------------------------------------+ + * | # | # | + * +---------------------------+-------------------------------------------+ + * ^ ^ ^ ^ + * la_index ra_index lookahead_index readahead_index + */ + +/* + * The global effective length of the inactive_list(s). + */ +static unsigned long node_free_and_cold_pages(void) +{ + unsigned int i; + unsigned long sum = 0; + struct zone *zones = NODE_DATA(numa_node_id())->node_zones; + + for (i = 0; i < MAX_NR_ZONES; i++) + sum += zones[i].nr_inactive + + zones[i].free_pages - zones[i].pages_low; + + return sum; +} + +/* + * A more smooth and timely analog to zone->aging_total. + */ +static unsigned long node_readahead_aging(void) +{ + unsigned long cpu; + unsigned long sum = 0; + cpumask_t mask = node_to_cpumask(numa_node_id()); + + cond_resched(); + for_each_cpu_mask(cpu, mask) + sum += per_cpu(readahead_aging, cpu); + + return sum; +} + +/* + * Set class of read-ahead + */ +static inline void set_ra_class(struct file_ra_state *ra, + enum ra_class ra_class) +{ + unsigned long FLAGS_MASK; + unsigned long flags; + unsigned long old_ra_class; + + FLAGS_MASK = ~(RA_CLASS_MASK | (RA_CLASS_MASK << RA_CLASS_SHIFT)); + flags = ra->flags & FLAGS_MASK; + + old_ra_class = (ra->flags & RA_CLASS_MASK) << RA_CLASS_SHIFT; + + ra->flags = flags | old_ra_class | ra_class; +} + +/* + * The 64bit cache_hit stores three accumulated value and one counter value. + * MSB LSB + * 3333333333333333 : 2222222222222222 : 1111111111111111 : 0000000000000000 + */ +static inline int ra_cache_hit(struct file_ra_state *ra, int nr) +{ + return (ra->cache_hit >> (nr * 16)) & 0xFFFF; +} + +/* + * Conceptual code: + * ra_cache_hit(ra, 1) += ra_cache_hit(ra, 0); + * ra_cache_hit(ra, 0) = 0; + */ +static inline void ra_addup_cache_hit(struct file_ra_state *ra) +{ + int n; + + n = ra_cache_hit(ra, 0); + ra->cache_hit -= n; + n <<= 16; + ra->cache_hit += n; +} + +/* + * The read-ahead is deemed success if cache-hit-rate >= 1/readahead_hit_rate. + */ +static inline int ra_cache_hit_ok(struct file_ra_state *ra) +{ + return ra_cache_hit(ra, 0) * readahead_hit_rate >= + (ra->lookahead_index - ra->la_index); +} + +/* + * Check if @index falls in the @ra request. + */ +static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) +{ + if (index < ra->la_index || index >= ra->readahead_index) + return 0; + + if (index >= ra->ra_index) + return 1; + else + return -1; +} + +/* + * Prepare file_ra_state for a new read-ahead sequence. + */ +static inline void ra_state_init(struct file_ra_state *ra, + pgoff_t la_index, pgoff_t ra_index) +{ + ra_addup_cache_hit(ra); + ra->cache_hit <<= 16; + ra->lookahead_index = la_index; + ra->readahead_index = ra_index; +} + +/* + * Take down a new read-ahead request in file_ra_state. + */ +static inline void ra_state_update(struct file_ra_state *ra, + unsigned long ra_size, unsigned long la_size) +{ +#ifdef DEBUG_READAHEAD + unsigned long old_ra = ra->readahead_index - ra->ra_index; + if (ra_size < old_ra && ra_cache_hit(ra, 0)) + ra_account(ra, RA_EVENT_READAHEAD_SHRINK, old_ra - ra_size); +#endif + + /* Disable look-ahead for loopback file. */ + if (unlikely(ra->flags & RA_FLAG_NO_LOOKAHEAD)) + la_size = 0; + + ra_addup_cache_hit(ra); + ra->ra_index = ra->readahead_index; + ra->la_index = ra->lookahead_index; + ra->readahead_index += ra_size; + ra->lookahead_index = ra->readahead_index - la_size; + ra->age = node_readahead_aging(); +} + +/* + * Adjust the read-ahead request in file_ra_state. + */ +static inline void ra_state_adjust(struct file_ra_state *ra, + unsigned long ra_size, unsigned long la_size) +{ + ra->readahead_index = ra->ra_index + ra_size; + ra->lookahead_index = ra->readahead_index - la_size; +} + +/* + * Submit IO for the read-ahead request in file_ra_state. + */ +static int ra_dispatch(struct file_ra_state *ra, + struct address_space *mapping, struct file *filp) +{ + pgoff_t eof_index; + unsigned long ra_size; + unsigned long la_size; + int actual; + enum ra_class ra_class; + + cond_resched(); + ra_class = (ra->flags & RA_CLASS_MASK); + BUG_ON(ra_class == 0 || ra_class > RA_CLASS_END); + + eof_index = ((i_size_read(mapping->host) - 1) >> PAGE_CACHE_SHIFT) + 1; + ra_size = ra->readahead_index - ra->ra_index; + la_size = ra->readahead_index - ra->lookahead_index; + + /* Snap to EOF. */ + if (unlikely(ra->ra_index >= eof_index)) + return 0; + if (ra->readahead_index + ra_size / 2 > eof_index) { + if (ra_class == RA_CLASS_CONTEXT_ACCELERATED && + eof_index > ra->lookahead_index + 1) + la_size = eof_index - ra->lookahead_index; + else + la_size = 0; + ra_size = eof_index - ra->ra_index; + ra_state_adjust(ra, ra_size, la_size); + } + + actual = __do_page_cache_readahead(mapping, filp, + ra->ra_index, ra_size, la_size); + cond_resched(); + +#ifdef READAHEAD_STREAMING + if (actual < ra_size) { + struct page *page = find_page(mapping, ra->ra_index + actual); + if (page) + rescue_pages(page, ra_size); + } +#endif + + if (ra->flags & RA_FLAG_MMAP) + ra_account(ra, RA_EVENT_READAHEAD_MMAP, actual); + if (ra->readahead_index == eof_index) + ra_account(ra, RA_EVENT_READAHEAD_EOF, actual); + if (la_size) + ra_account(ra, RA_EVENT_LOOKAHEAD, la_size); + if (ra_size > actual) + ra_account(ra, RA_EVENT_IO_CACHE_HIT, ra_size - actual); + ra_account(ra, RA_EVENT_READAHEAD, actual); + + dprintk("readahead-%s(ino=%lu, index=%lu, ra=%lu+%lu-%lu) = %d\n", + ra_class_name[ra_class], + mapping->host->i_ino, ra->la_index, + ra->ra_index, ra_size, la_size, actual); + + return actual; +} + +/* + * Determine the request parameters from primitive values. + * + * It applies the following rules: + * - Substract ra_size by the old look-ahead to get real safe read-ahead; + * - Set new la_size according to the (still large) ra_size; + * - Apply upper limits; + * - Make sure stream_shift is not too small. + * (So that the next global_shift will not be too small.) + * + * Input: + * ra_size stores the estimated thrashing-threshold. + * la_size stores the look-ahead size of previous request. + */ +static inline int adjust_rala(unsigned long ra_max, + unsigned long *ra_size, unsigned long *la_size) +{ + unsigned long stream_shift = *la_size; + + if (*ra_size > *la_size) + *ra_size -= *la_size; + else + return 0; + + *la_size = *ra_size / LOOKAHEAD_RATIO; + + if (*ra_size > ra_max) + *ra_size = ra_max; + if (*la_size > *ra_size) + *la_size = *ra_size; + + stream_shift += (*ra_size - *la_size); + if (stream_shift < *ra_size / 4) + *la_size -= (*ra_size / 4 - stream_shift); + + return 1; +} + +/* + * The function estimates two values: + * 1. thrashing-threshold for the current stream + * It is returned to make the next read-ahead request. + * 2. the remained space for the current chunk + * It will be checked to ensure that the current chunk is safe. + * + * The computation will be pretty accurate under heavy load, and will vibrate + * more with light load(small global_shift), so the grow speed of ra_size + * must be limited, and a moderate large stream_shift must be insured. + * + * This figure illustrates the formula used in the function: + * While the stream reads stream_shift pages inside the chunks, + * the chunks are shifted global_shift pages inside inactive_list. + * + * chunk A chunk B + * |<=============== global_shift ================| + * +-------------+ +-------------------+ | + * | # | | # | inactive_list | + * +-------------+ +-------------------+ head | + * |---->| |---------->| + * | | + * +-- stream_shift --+ + */ +static inline unsigned long compute_thrashing_threshold( + struct file_ra_state *ra, + unsigned long *remain) +{ + unsigned long global_size; + unsigned long global_shift; + unsigned long stream_shift; + unsigned long ra_size; + + global_size = node_free_and_cold_pages(); + global_shift = node_readahead_aging() - ra->age; + global_shift |= 1UL; + stream_shift = ra_cache_hit(ra, 0); + + ra_size = stream_shift * + global_size * readahead_ratio / (100 * global_shift); + + if (global_size > global_shift) + *remain = stream_shift * + (global_size - global_shift) / global_shift; + else + *remain = 0; + + ddprintk("compute_thrashing_threshold: " + "ra=%lu=%lu*%lu/%lu, remain %lu for %lu\n", + ra_size, stream_shift, global_size, global_shift, + *remain, ra->readahead_index - ra->lookahead_index); + + return ra_size; +} + +/* + * Main function for file_ra_state based read-ahead. + */ +static inline unsigned long +state_based_readahead(struct address_space *mapping, struct file *filp, + struct file_ra_state *ra, struct page *page, + unsigned long ra_size, unsigned long ra_max) +{ + unsigned long ra_old; + unsigned long la_size; + unsigned long remain_space; + unsigned long growth_limit; + + la_size = ra->readahead_index - ra->lookahead_index; + ra_old = ra->readahead_index - ra->ra_index; + growth_limit = ra_size + (2 + readahead_ratio / 64) * ra_old + + (ra_max - ra_old) / 16; + ra_size = compute_thrashing_threshold(ra, &remain_space); + + if (!readahead_live_chunk && + remain_space <= la_size && la_size > 1) { + rescue_pages(page, la_size); + return 0; + } + + if (!adjust_rala(min(ra_max, growth_limit), &ra_size, &la_size)) + return 0; + + set_ra_class(ra, RA_CLASS_STATE); + ra_state_update(ra, ra_size, la_size); + + return ra_dispatch(ra, mapping, filp); +} + +/* + * Page cache context based estimation of read-ahead/look-ahead size/index. + * + * The logic first looks around to find the start point of next read-ahead, + * and then, if necessary, looks backward in the inactive_list to get an + * estimation of the thrashing-threshold. + * + * The estimation theory can be illustrated with figure: + * + * chunk A chunk B chunk C head + * + * l01 l11 l12 l21 l22 + *| |-->|-->| |------>|-->| |------>| + *| +-------+ +-----------+ +-------------+ | + *| | # | | # | | # | | + *| +-------+ +-----------+ +-------------+ | + *| |<==============|<===========================|<============================| + * L0 L1 L2 + * + * Let f(l) = L be a map from + * l: the number of pages read by the stream + * to + * L: the number of pages pushed into inactive_list in the mean time + * then + * f(l01) <= L0 + * f(l11 + l12) = L1 + * f(l21 + l22) = L2 + * ... + * f(l01 + l11 + ...) <= Sum(L0 + L1 + ...) + * <= Length(inactive_list) = f(thrashing-threshold) + * + * So the count of countinuous history pages left in the inactive_list is always + * a lower estimation of the true thrashing-threshold. + */ + +/* + * STATUS REFERENCE COUNT TYPE + * A__ 0 not in inactive list + * ___ 0 fresh + * __R PAGE_REFCNT_1 stale + * _a_ PAGE_REFCNT_2 disturbed once + * _aR PAGE_REFCNT_3 disturbed twice + * + * A/a/R: Active / aCTIVATE / Referenced + */ +static inline unsigned long cold_page_refcnt(struct page *page) +{ + if (!page || PageActive(page)) + return 0; + + return page_refcnt(page); +} + +static inline char page_refcnt_symbol(struct page *page) +{ + if (!page) + return 'X'; + if (PageActive(page)) + return 'A'; + switch (page_refcnt(page)) { + case 0: + return '_'; + case PAGE_REFCNT_1: + return '-'; + case PAGE_REFCNT_2: + return '='; + case PAGE_REFCNT_3: + return '#'; + } + return '?'; +} + +/* + * Count/estimate cache hits in range [first_index, last_index]. + * The estimation is simple and optimistic. + */ +static int count_cache_hit(struct address_space *mapping, + pgoff_t first_index, pgoff_t last_index) +{ + struct page *page; + int size = last_index - first_index + 1; + int count = 0; + int i; + + cond_resched(); + read_lock_irq(&mapping->tree_lock); + + /* + * The first page may well is chunk head and has been accessed, + * so it is index 0 that makes the estimation optimistic. This + * behavior guarantees a readahead when (size < ra_max) and + * (readahead_hit_rate >= 16). + */ + for (i = 0; i < 16;) { + page = __find_page(mapping, first_index + + size * ((i++ * 29) & 15) / 16); + if (cold_page_refcnt(page) >= PAGE_REFCNT_1 && ++count >= 2) + break; + } + + read_unlock_irq(&mapping->tree_lock); + + return size * count / i; +} + +/* + * Look back and check history pages to estimate thrashing-threshold. + */ +static int query_page_cache(struct address_space *mapping, + struct file_ra_state *ra, + unsigned long *remain, pgoff_t offset, + unsigned long ra_min, unsigned long ra_max) +{ + int count; + pgoff_t index; + unsigned long nr_lookback; + struct radix_tree_cache cache; + + /* + * Scan backward and check the near @ra_max pages. + * The count here determines ra_size. + */ + cond_resched(); + read_lock_irq(&mapping->tree_lock); + index = radix_tree_lookup_head(&mapping->page_tree, offset, ra_max); + read_unlock_irq(&mapping->tree_lock); +#ifdef DEBUG_READAHEAD_RADIXTREE + if (index <= offset) { + WARN_ON(!find_page(mapping, index)); + if (index + ra_max > offset) + WARN_ON(find_page(mapping, index - 1)); + } else { + BUG_ON(index > offset + 1); + WARN_ON(find_page(mapping, offset)); + } +#endif + + *remain = offset - index + 1; + + if (unlikely(*remain <= ra_min)) + return ra_min; + + if (!index) + return *remain; + + if (offset + 1 == ra->readahead_index && ra_cache_hit_ok(ra)) + count = *remain; + else if (count_cache_hit(mapping, index, offset) * + readahead_hit_rate >= *remain) + count = *remain; + else + return ra_min; + + if (count < ra_max) + goto out; + + if (unlikely(ra->flags & RA_FLAG_NO_LOOKAHEAD)) + goto out; + + /* + * Check the far pages coarsely. + * The big count here helps increase la_size. + */ + nr_lookback = ra_max * (LOOKAHEAD_RATIO + 1) * + 100 / (readahead_ratio + 1); + if (nr_lookback > offset) + nr_lookback = offset; + + cond_resched(); + radix_tree_cache_init(&cache); + read_lock_irq(&mapping->tree_lock); + for (count += ra_max; count < nr_lookback; count += ra_max) { + struct radix_tree_node *node; + node = radix_tree_cache_lookup_node(&mapping->page_tree, + &cache, offset - count, 1); + if (!node) + break; +#ifdef DEBUG_READAHEAD_RADIXTREE + if (node != radix_tree_lookup_node(&mapping->page_tree, + offset - count, 1)) { + read_unlock_irq(&mapping->tree_lock); + printk(KERN_ERR "check radix_tree_cache_lookup_node!\n"); + return 1; + } +#endif + } + read_unlock_irq(&mapping->tree_lock); + + /* + * For sequential read that extends from index 0, the counted value + * may well be far under the true threshold, so return it unmodified + * for further process in adjust_rala_accelerated(). + */ + if (count >= offset) + return offset + 1; + +out: + count = count * readahead_ratio / 100; + return count; +} + +/* + * Scan backward in the file for the first non-present page. + */ +static inline pgoff_t first_absent_page_bw(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) +{ + struct radix_tree_cache cache; + struct page *page; + pgoff_t origin; + + origin = index; + if (max_scan > index) + max_scan = index; + + cond_resched(); + radix_tree_cache_init(&cache); + read_lock_irq(&mapping->tree_lock); + for (; origin - index <= max_scan;) { + page = radix_tree_cache_lookup(&mapping->page_tree, + &cache, --index); + if (page) { + index++; + break; + } + } + read_unlock_irq(&mapping->tree_lock); + + return index; +} + +/* + * Scan forward in the file for the first non-present page. + */ +static inline pgoff_t first_absent_page(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) +{ + pgoff_t ra_index; + + cond_resched(); + read_lock_irq(&mapping->tree_lock); + ra_index = radix_tree_lookup_tail(&mapping->page_tree, + index + 1, max_scan); + read_unlock_irq(&mapping->tree_lock); + +#ifdef DEBUG_READAHEAD_RADIXTREE + BUG_ON(ra_index <= index); + if (index + max_scan > index) { + if (ra_index <= index + max_scan) + WARN_ON(find_page(mapping, ra_index)); + WARN_ON(!find_page(mapping, ra_index - 1)); + } +#endif + + if (ra_index <= index + max_scan) + return ra_index; + else + return 0; +} + +/* + * Determine the request parameters for context based read-ahead that extends + * from start of file. + * + * The major weakness of stateless method is perhaps the slow grow up speed of + * ra_size. The logic tries to make up for this in the important case of + * sequential reads that extend from start of file. In this case, the ra_size + * is not choosed to make the whole next chunk safe(as in normal ones). Only + * half of which is safe. The added 'unsafe' half is the look-ahead part. It + * is expected to be safeguarded by rescue_pages() when the previous chunks are + * lost. + */ +static inline int adjust_rala_accelerated(unsigned long ra_max, + unsigned long *ra_size, unsigned long *la_size) +{ + pgoff_t index = *ra_size; + + *ra_size -= min(*ra_size, *la_size); + *ra_size = *ra_size * readahead_ratio / 100; + *la_size = index * readahead_ratio / 100; + *ra_size += *la_size; + + if (*ra_size > ra_max) + *ra_size = ra_max; + if (*la_size > *ra_size) + *la_size = *ra_size; + + return 1; +} + +/* + * Main function for page context based read-ahead. + */ +static inline int +try_context_based_readahead(struct address_space *mapping, + struct file_ra_state *ra, + struct page *prev_page, struct page *page, + pgoff_t index, unsigned long ra_size, + unsigned long ra_min, unsigned long ra_max) +{ + pgoff_t ra_index; + unsigned long la_size; + unsigned long remain_pages; + + /* Where to start read-ahead? + * NFSv3 daemons may process adjecent requests in parallel, + * leading to many locally disordered, globally sequential reads. + * So do not require nearby history pages to be present or accessed. + */ + if (page) { + ra_index = first_absent_page(mapping, index, ra_max * 5 / 4); + if (unlikely(!ra_index)) + return -1; + } else if (!prev_page) { + ra_index = first_absent_page_bw(mapping, index, + readahead_hit_rate + ra_min); + if (index - ra_index > readahead_hit_rate + ra_min) + return 0; + ra_min += 2 * (index - ra_index); + index = ra_index; + } else { + ra_index = index; + if (ra_has_index(ra, index)) + ra_account(ra, RA_EVENT_READAHEAD_MUTILATE, + ra->readahead_index - index); + } + + ra_size = query_page_cache(mapping, ra, &remain_pages, + index - 1, ra_min, ra_max); + + la_size = ra_index - index; + if (!readahead_live_chunk && + remain_pages <= la_size && la_size > 1) { + rescue_pages(page, la_size); + return -1; + } + + if (ra_size == index) { + if (!adjust_rala_accelerated(ra_max, &ra_size, &la_size)) + return -1; + set_ra_class(ra, RA_CLASS_CONTEXT_ACCELERATED); + } else { + if (!adjust_rala(ra_max, &ra_size, &la_size)) + return -1; + set_ra_class(ra, RA_CLASS_CONTEXT); + } + + ra_state_init(ra, index, ra_index); + ra_state_update(ra, ra_size, la_size); + + return 1; +} + +/* + * Read-ahead on start of file. + * + * It is most important for small files. + * 1. Set a moderate large read-ahead size; + * 2. Issue the next read-ahead request as soon as possible. + * + * But be careful, there are some applications that dip into only the very head + * of a file. The most important thing is to prevent them from triggering the + * next (much larger) read-ahead request, which leads to lots of cache misses. + * Two pages should be enough for them, correct me if I'm wrong. + */ +static inline unsigned long +newfile_readahead(struct address_space *mapping, + struct file *filp, struct file_ra_state *ra, + unsigned long req_size, unsigned long ra_min) +{ + unsigned long ra_size; + unsigned long la_size; + + if (req_size > ra_min) + req_size = ra_min; + + if (unlikely(ra->flags & RA_FLAG_NFSD)) { + ra_size = MIN_NFSD_PAGES; + la_size = 0; + } else { + ra_size = 4 * req_size; + la_size = 2 * req_size; + } + + set_ra_class(ra, RA_CLASS_NEWFILE); + ra_state_init(ra, 0, 0); + ra_state_update(ra, ra_size, la_size); + + return ra_dispatch(ra, mapping, filp); +} + +/* + * Backward prefetching. + * No look ahead and thrashing threshold estimation for stepping backward + * pattern: should be unnecessary. + */ +static inline int +try_read_backward(struct file_ra_state *ra, + pgoff_t begin_index, pgoff_t end_index, + unsigned long ra_size, + unsigned long ra_min, unsigned long ra_max) +{ + if (ra_size > ra_max || end_index > ra->prev_page) + return 0; + + if (ra_has_index(ra, ra->prev_page)) { + if (end_index > ra->la_index) + return 0; + ra_size += 2 * ra_cache_hit(ra, 0); + end_index = ra->la_index; + } else { + ra_size += readahead_hit_rate + ra_min; + end_index = ra->prev_page; + } + + if (ra_size > ra_max) + ra_size = ra_max; + + if (end_index > begin_index + ra_size) + return 0; + + begin_index = end_index - ra_size; + + set_ra_class(ra, RA_CLASS_BACKWARD); + ra_state_init(ra, begin_index, begin_index); + ra_state_update(ra, ra_size, 0); + + return 1; +} + +/* + * If there is a previous sequential read, it is likely to be another + * sequential read at the new position. + * Databases are known to have this seek-and-read-one-record pattern. + */ +static inline int +try_random_readahead(struct file_ra_state *ra, pgoff_t index, + unsigned long ra_size, unsigned long ra_max) +{ + unsigned long hit0 = ra_cache_hit(ra, 0); + unsigned long hit1 = ra_cache_hit(ra, 1) + hit0; + unsigned long hit2 = ra_cache_hit(ra, 2); + unsigned long hit3 = ra_cache_hit(ra, 3); + + if (!ra_has_index(ra, ra->prev_page)) + return 0; + + if (index == ra->prev_page + 1) { /* read after thrashing */ + ra_size = hit0; + set_ra_class(ra, RA_CLASS_RANDOM_THRASHING); + ra_account(ra, RA_EVENT_READAHEAD_THRASHING, + ra->readahead_index - index); + } else if (ra_size < hit1 && /* read after seeking */ + hit1 > hit2 / 2 && + hit2 > hit3 / 2 && + hit3 > hit1 / 2) { + ra_size = max(hit1, hit2); + set_ra_class(ra, RA_CLASS_RANDOM_SEEK); + } else + return 0; + + if (ra_size > ra_max) + ra_size = ra_max; + + ra_state_init(ra, index, index); + ra_state_update(ra, ra_size, 0); + + return 1; +} + +/* + * Read-around for mmaped file. + */ +unsigned long +page_cache_readaround(struct address_space *mapping, struct file_ra_state *ra, + struct file *filp, pgoff_t index) +{ + unsigned long ra_index; + unsigned long ra_size = 0; + unsigned long hit_rate = 8 + readahead_hit_rate; + + if (ra->cache_hit * readahead_hit_rate > ra->age) + ra_size = ra->ra_pages; + else if (ra->cache_hit * hit_rate >= ra->age) + ra_size = ra->ra_pages / 4; + else if ((unsigned)(ra->prev_page - index) <= hit_rate) + ra_size = 4 * (ra->prev_page - index); + else if ((unsigned)(index - ra->prev_page) <= hit_rate) + ra_size = 4 * (index - ra->prev_page); + else { + cond_resched(); + read_lock_irq(&mapping->tree_lock); + if (radix_tree_lookup_node(&mapping->page_tree, index, 1)) + ra_size = RADIX_TREE_MAP_SIZE; + read_unlock_irq(&mapping->tree_lock); + } + + if (!ra_size) + return 0; + + ra_size = max_sane_readahead(ra_size); + + if (index > ra_size / 2) { + ra_index = index - ra_size / 2; + if (!(ra_size & RADIX_TREE_MAP_MASK)) + ra_index = (ra_index + RADIX_TREE_MAP_SIZE / 2) & + ~RADIX_TREE_MAP_MASK; + } else + ra_index = 0; + + set_ra_class(ra, RA_CLASS_AROUND); + ra->cache_hit = 0; + ra->ra_index = ra_index; + ra->la_index = ra_index; + ra->readahead_index = ra_index + ra_size; + ra->lookahead_index = ra_index + ra_size; + + ra->age = ra_dispatch(ra, mapping, filp); + return ra->age; +} + +/* + * ra_size is mainly determined by: + * 1. sequential-start: min(MIN_RA_PAGES + (pages>>14), KB(128)) + * 2. sequential-max: min(ra->ra_pages, 0xFFFF) + * 3. sequential: (thrashing-threshold) * readahead_ratio / 100 + * + * Table of concrete numbers for 4KB page size: + * (inactive + free) (in MB): 4 8 16 32 64 128 256 512 1024 + * initial ra_size (in KB): 16 16 16 16 20 24 32 48 64 + */ +static inline void get_readahead_bounds(struct file_ra_state *ra, + unsigned long *ra_min, + unsigned long *ra_max) +{ + unsigned long pages; + + pages = node_free_and_cold_pages(); + *ra_max = min(min(pages/2, 0xFFFFUL), ra->ra_pages); + *ra_min = min(min(MIN_RA_PAGES + (pages>>14), KB(128)), *ra_max/2); +} + +/* + * This is the entry point of the adaptive read-ahead logic. + * + * It is only called on two conditions: + * 1. page == NULL + * A cache miss happened, it can be either a random read or a sequential one. + * 2. page != NULL + * There is a look-ahead mark(PG_readahead) from a previous sequential read. + * It's time to do some checking and submit the next read-ahead IO. + * + * That has the merits of: + * - makes all stateful/stateless methods happy; + * - eliminates the cache hit problem naturally; + * - lives in harmony with application managed read-aheads via fadvise/madvise. + */ +unsigned long +page_cache_readahead_adaptive(struct address_space *mapping, + struct file_ra_state *ra, struct file *filp, + struct page *prev_page, struct page *page, + pgoff_t begin_index, + pgoff_t index, pgoff_t end_index) +{ + unsigned long size; + unsigned long ra_min; + unsigned long ra_max; + int ret; + + if (page) { + if(!TestClearPageReadahead(page)) + return 0; + if (bdi_read_congested(mapping->backing_dev_info)) { + ra_account(ra, RA_EVENT_IO_CONGESTION, + end_index - index); + return 0; + } + if (laptop_mode && laptop_spinned_down()) { + if (!renew_lookahead(mapping, ra, index, + index + LAPTOP_POLL_INTERVAL)) + return 0; + } + } + + if (page) + ra_account(ra, RA_EVENT_LOOKAHEAD_HIT, + ra->readahead_index - ra->lookahead_index); + else if (index) + ra_account(ra, RA_EVENT_CACHE_MISS, end_index - begin_index); + + size = end_index - index; + get_readahead_bounds(ra, &ra_min, &ra_max); + + /* readahead disabled? */ + if (unlikely(!ra_min || !readahead_ratio)) { + size = max_sane_readahead(size); + goto readit; + } + + /* + * Start of file. + */ + if (index == 0) + return newfile_readahead(mapping, filp, ra, end_index, ra_min); + + /* + * State based sequential read-ahead. + */ + if (!disable_stateful_method() && + index == ra->lookahead_index && + (page || index == ra->readahead_index) && + (ra_cache_hit_ok(ra) || + end_index - begin_index >= ra_max)) + return state_based_readahead(mapping, filp, ra, page, + size, ra_max); + + /* + * Backward read-ahead. + */ + if (try_read_backward(ra, begin_index, end_index, size, ra_min, ra_max)) + return ra_dispatch(ra, mapping, filp); + + /* + * Context based sequential read-ahead. + */ + ret = try_context_based_readahead(mapping, ra, prev_page, page, + index, size, ra_min, ra_max); + if (ret > 0) + return ra_dispatch(ra, mapping, filp); + if (ret < 0) + return 0; + + /* No action on look ahead time? */ + if (page) { + ra_account(ra, RA_EVENT_LOOKAHEAD_NOACTION, + ra->readahead_index - index); + return 0; + } + + /* + * Random read that follows a sequential one. + */ + if (try_random_readahead(ra, index, size, ra_max)) + return ra_dispatch(ra, mapping, filp); + + /* + * Random read. + */ + if (size > ra_max) + size = ra_max; + +readit: + size = __do_page_cache_readahead(mapping, filp, index, size, 0); + + ra_account(ra, RA_EVENT_READRANDOM, size); + dprintk("readrandom(ino=%lu, pages=%lu, index=%lu-%lu-%lu) = %lu\n", + mapping->host->i_ino, mapping->nrpages, + begin_index, index, end_index, size); + + return size; +} + +/* + * Call me! + */ +void fastcall ra_access(struct file_ra_state *ra, struct page *page) +{ + if (page->flags & ((1 << PG_active) | + (1 << PG_activate) | + (1 << PG_referenced))) + return; + + if (!PageUptodate(page)) + ra_account(ra, RA_EVENT_IO_BLOCK, 1); + + if (!ra_has_index(ra, page->index)) + return; + + ra->cache_hit++; + + if (page->index >= ra->ra_index) + ra_account(ra, RA_EVENT_READAHEAD_HIT, 1); + else + ra_account(ra, RA_EVENT_READAHEAD_HIT, -1); +} + +/* + * Detect and protect live read-ahead pages. + * + * This function provides safty guarantee for file servers with big + * readahead_ratio set. The goal is to save all and only the sequential + * pages that are to be accessed in the near future. + * + * This function is called when pages in @page_list are to be freed, + * it protects live read-ahead pages by moving them into @save_list. + * + * The general idea is to classify pages of a file into groups of sequential + * accessed pages. Dead sequential pages are left over, live sequential pages + * are saved. + * + * Live read-ahead pages are defined as sequential pages that have reading in + * progress. They are detected by reference count pattern of: + * + * live head live pages + * ra pages group --> ------------___________________ + * [ pages to save ] (*) + * + * (*) for now, an extra page from the live head may also be saved. + * + * In pratical, the group of pages are fragmented into chunks. To tell whether + * pages inside a chunk are alive, we must check: + * 1) Are there any live heads inside the chunk? + * 2) Are there any live heads in the group before the chunk? + * 3) Sepcial case: live head just sits on the boundary of current chunk? + * + * The detailed rules employed must ensure: + * - no page is pinned in inactive_list. + * - no excessive pages are saved. + * + * A picture of common cases: + * back search chunk case + * -----___________|[____________________] Normal + * ----------------|----[________________] Normal + * |----[________________] Normal + * ----------------|---------------------- Normal + * |---------------------- Normal + * ________________|______________________ ra miss + * |______________________ ra miss + * ________________|_______--------[_____] two readers + * ----____________|[______--------______] two readers + * |_______--------[_____] two readers + * |----[____------______] two readers + * ----------------|----[____------______] two readers + * _______---------|---------------[_____] two readers + * ----___---------|[--------------______] two readers + * ________________|---------------[_____] two readers + * ----____________|[--------------______] two readers + * ====------------|[---_________________] two readers + * |====[----------______] two readers + * |###======[-----------] three readers + */ +static int save_chunk(struct page *head, struct page *live_head, + struct page *tail, struct list_head *save_list) +{ + struct page *page; + struct address_space *mapping; + struct radix_tree_cache cache; + int i; + pgoff_t index; + pgoff_t head_index; + unsigned long refcnt; + +#ifdef DEBUG_READAHEAD + static char static_buf[PAGE_SIZE]; + static char *zone_names[] = {"DMA", "DMA32", "Normal", "HighMem"}; + char *pat = static_buf; + int pidx = 0; +#define log_symbol(symbol) \ + do { \ + if (READAHEAD_DEBUG_LEVEL(3) && pidx < PAGE_SIZE - 1) \ + pat[pidx++] = symbol; \ + } while (0) + + if (READAHEAD_DEBUG_LEVEL(3)) { + pat = (char *)get_zeroed_page(GFP_KERNEL); + if (!pat) + pat = static_buf; + } +#else +#define log_symbol(symbol) do {} while (0) +#endif +#define log_page(page) log_symbol(page_refcnt_symbol(page)) + + head_index = head->index; + mapping = head->mapping; + radix_tree_cache_init(&cache); + + BUG_ON(!mapping); /* QUESTION: in what case mapping will be NULL ? */ + read_lock_irq(&mapping->tree_lock); + + /* + * Common case test. + * Does the far end indicates a leading live head? + */ + index = radix_tree_lookup_head(&mapping->page_tree, + head_index, readahead_live_chunk); + if (index >= head_index) + goto skip_scan_locked; + + page = __find_page(mapping, index); + BUG_ON(!page); + log_symbol('|'); + log_page(page); + refcnt = cold_page_refcnt(page); + if (head_index - index < readahead_live_chunk && + refcnt > page_refcnt(head)) { + live_head = head; + goto skip_scan_locked; + } + + /* + * The slow path. + * Scan page by page to see if the whole chunk should be saved. + */ + if (next_page(head) != tail) + head_index = next_page(head)->index; + else + head_index++; + for (i = 0, index++; index <= head_index; index++) { + page = radix_tree_cache_lookup(&mapping->page_tree, &cache, + index); + if (index == head->index) + log_symbol('|'); + log_page(page); + + if (!page) { + WARN_ON(index < head->index); + break; + } + + if (refcnt == page_refcnt(page)) + i++; + else if (refcnt < page_refcnt(page)) + i = 0; + else if (i < 1) + i = INT_MIN; + else { + live_head = head; + break; + } + + refcnt = page_refcnt(page); + } + +skip_scan_locked: +#ifdef DEBUG_READAHEAD + if (index < head->index) + log_symbol('*'); + index = prev_page(tail)->index; + + log_symbol('|'); + for (page = head; page != tail; page = next_page(page)) { + BUG_ON(PageAnon(page)); + BUG_ON(PageSwapCache(page)); + /* BUG_ON(page_mapped(page)); */ + + if (page == live_head) + log_symbol('['); + log_page(page); + } + if (live_head) + log_symbol(']'); +#endif + + /* + * Special case work around. + * + * Save one extra page if it is a live head of the following chunk. + * Just to be safe. It protects the rare situation when the reader + * is just crossing the chunk boundary, and the following chunk is not + * far away from tail of inactive_list. + * + * The special case is awkwardly delt with for now. They will be all set + * when the timing information of recently evicted pages are available. + * Dead pages can also be purged earlier with the timing info. + */ + if (live_head != head) { + struct page *last_page = prev_page(tail); + page = radix_tree_cache_lookup(&mapping->page_tree, &cache, + last_page->index + 1); + log_symbol('|'); + log_page(page); + if (page && !live_head) { + refcnt = page_refcnt(last_page); + if (page_refcnt(page) >= refcnt) + page = radix_tree_cache_lookup( + &mapping->page_tree, &cache, + last_page->index + 2); + log_page(page); + if (page && page_refcnt(page) < refcnt) { + live_head = last_page; + log_symbol('I'); + } + } else if (!page && live_head) { + live_head = next_page(live_head); + log_symbol('D'); + } + } + log_symbol('\0'); + + read_unlock_irq(&mapping->tree_lock); + + /* + * Now save the alive pages. + */ + i = 0; + if (live_head) { + for (; live_head != tail;) { /* never dereference tail! */ + page = next_page(live_head); + if (!PageActivate(live_head)) { + list_move(&live_head->lru, save_list); + i++; + if (!page_refcnt(live_head)) + inc_readahead_aging(); + } + live_head = page; + } + + if (i) + ra_account(0, RA_EVENT_READAHEAD_RESCUE, i); + } + +#ifdef DEBUG_READAHEAD + if (READAHEAD_DEBUG_LEVEL(3)) { + ddprintk("save_chunk(ino=%lu, idx=%lu-%lu, %s@%s:%s)" + " = %d\n", + mapping->host->i_ino, + head->index, index, + mapping_mapped(mapping) ? "mmap" : "file", + zone_names[page_zonenum(head)], pat, i); + if (pat != static_buf) + free_page((unsigned long)pat); + } +#endif + + return i; +} + +int rescue_ra_pages(struct list_head *page_list, struct list_head *save_list) +{ + struct address_space *mapping; + struct page *chunk_head; + struct page *live_head; + struct page *page; + unsigned long refcnt; + unsigned long min_refcnt; + int n; + int ret = 0; + + page = list_to_page(page_list); + +next_chunk: + chunk_head = page; + live_head = NULL; + mapping = page->mapping; + n = 0; + min_refcnt = LONG_MAX; + +next_head_page: + refcnt = page_refcnt(page); + if (min_refcnt > refcnt) + min_refcnt = refcnt; + page = next_page(page); + + if (mapping != page->mapping || &page->lru == page_list) + goto save_chunk; + + /* At least 2 pages followed by a fall in refcnt makes a live head: + * --_ + * ^ live_head + */ + if (refcnt == page_refcnt(page)) + n++; + else if (refcnt < page_refcnt(page)) + n = 0; + else if (n < 1) + n = INT_MIN; + else + goto got_live_head; + + goto next_head_page; + +got_live_head: + n = 0; + live_head = prev_page(page); + +next_page: + if (refcnt < page_refcnt(page)) /* limit the number of rises */ + n++; + refcnt = page_refcnt(page); + if (min_refcnt > refcnt) + min_refcnt = refcnt; + page = next_page(page); + + if (mapping != page->mapping || &page->lru == page_list) + goto save_chunk; + + goto next_page; + +save_chunk: + if (mapping && !PageAnon(chunk_head) && + !PageSwapCache(chunk_head) && + /* !page_mapped(chunk_head) && */ + min_refcnt < PAGE_REFCNT_2 && + n <= 3) + ret += save_chunk(chunk_head, live_head, page, save_list); + + cond_resched(); + if (&page->lru != page_list) + goto next_chunk; + + return ret; +} Index: linux-2.6.15-rc5-ck1/mm/filemap.c =================================================================== --- linux-2.6.15-rc5-ck1.orig/mm/filemap.c +++ linux-2.6.15-rc5-ck1/mm/filemap.c @@ -729,10 +729,12 @@ void do_generic_mapping_read(struct addr unsigned long prev_index; loff_t isize; struct page *cached_page; + struct page *prev_page; int error; struct file_ra_state ra = *_ra; cached_page = NULL; + prev_page = NULL; index = *ppos >> PAGE_CACHE_SHIFT; next_index = index; prev_index = ra.prev_page; @@ -743,6 +745,10 @@ void do_generic_mapping_read(struct addr if (!isize) goto out; + if (READAHEAD_DEBUG_LEVEL(5)) + printk(KERN_DEBUG "read-file(ino=%lu, req=%lu+%lu)\n", + inode->i_ino, index, last_index - index); + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; for (;;) { struct page *page; @@ -761,16 +767,42 @@ void do_generic_mapping_read(struct addr nr = nr - offset; cond_resched(); - if (index == next_index) + + if (!prefer_adaptive_readahead() && index == next_index) next_index = page_cache_readahead(mapping, &ra, filp, index, last_index - index); find_page: page = find_get_page(mapping, index); + if (prefer_adaptive_readahead()) { + if (unlikely(page == NULL)) { + page_cache_readahead_adaptive(mapping, &ra, + filp, prev_page, NULL, + *ppos >> PAGE_CACHE_SHIFT, + index, last_index); + page = find_get_page(mapping, index); + } else if (PageReadahead(page)) { + page_cache_readahead_adaptive(mapping, &ra, + filp, prev_page, page, + *ppos >> PAGE_CACHE_SHIFT, + index, last_index); + } + } if (unlikely(page == NULL)) { - handle_ra_miss(mapping, &ra, index); + if (!prefer_adaptive_readahead()) + handle_ra_miss(mapping, &ra, index); goto no_cached_page; } + if (prev_page) + page_cache_release(prev_page); + prev_page = page; + + ra_access(&ra, page); + if (READAHEAD_DEBUG_LEVEL(7)) + printk(KERN_DEBUG "read-file(ino=%lu, idx=%lu, io=%s)\n", + inode->i_ino, index, + PageUptodate(page) ? "hit" : "miss"); + if (!PageUptodate(page)) goto page_not_up_to_date; page_ok: @@ -805,7 +837,6 @@ page_ok: index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; - page_cache_release(page); if (ret == nr && desc->count) continue; goto out; @@ -817,7 +848,6 @@ page_not_up_to_date: /* Did it get unhashed before we got the lock? */ if (!page->mapping) { unlock_page(page); - page_cache_release(page); continue; } @@ -842,7 +872,6 @@ readpage: * invalidate_inode_pages got it */ unlock_page(page); - page_cache_release(page); goto find_page; } unlock_page(page); @@ -863,7 +892,6 @@ readpage: isize = i_size_read(inode); end_index = (isize - 1) >> PAGE_CACHE_SHIFT; if (unlikely(!isize || index > end_index)) { - page_cache_release(page); goto out; } @@ -872,7 +900,6 @@ readpage: if (index == end_index) { nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; if (nr <= offset) { - page_cache_release(page); goto out; } } @@ -882,7 +909,6 @@ readpage: readpage_error: /* UHHUH! A synchronous read error occurred. Report it */ desc->error = error; - page_cache_release(page); goto out; no_cached_page: @@ -907,15 +933,22 @@ no_cached_page: } page = cached_page; cached_page = NULL; + if (prev_page) + page_cache_release(prev_page); + prev_page = page; goto readpage; } out: *_ra = ra; + if (prefer_adaptive_readahead()) + _ra->prev_page = prev_index; *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; if (cached_page) page_cache_release(cached_page); + if (prev_page) + page_cache_release(prev_page); if (filp) file_accessed(filp); } @@ -1194,8 +1227,9 @@ struct page *filemap_nopage(struct vm_ar struct inode *inode = mapping->host; struct page *page; unsigned long size, pgoff; - int did_readaround = 0, majmin = VM_FAULT_MINOR; + int majmin = VM_FAULT_MINOR; + ra->flags |= RA_FLAG_MMAP; pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; retry_all: @@ -1213,54 +1247,48 @@ retry_all: * * For sequential accesses, we use the generic readahead logic. */ - if (VM_SequentialReadHint(area)) + if (!prefer_adaptive_readahead() && VM_SequentialReadHint(area)) page_cache_readahead(mapping, ra, file, pgoff, 1); + /* * Do we have something in the page cache already? */ retry_find: page = find_get_page(mapping, pgoff); + if (prefer_adaptive_readahead() && VM_SequentialReadHint(area)) { + if (!page) { + page_cache_readahead_adaptive(mapping, ra, + file, NULL, NULL, + pgoff, pgoff, pgoff + 1); + page = find_get_page(mapping, pgoff); + } else if (PageReadahead(page)) { + page_cache_readahead_adaptive(mapping, ra, + file, NULL, page, + pgoff, pgoff, pgoff + 1); + } + } if (!page) { - unsigned long ra_pages; - if (VM_SequentialReadHint(area)) { - handle_ra_miss(mapping, ra, pgoff); + if (!prefer_adaptive_readahead()) + handle_ra_miss(mapping, ra, pgoff); goto no_cached_page; } - ra->mmap_miss++; - /* - * Do we miss much more than hit in this file? If so, - * stop bothering with read-ahead. It will only hurt. - */ - if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS) - goto no_cached_page; + page_cache_readaround(mapping, ra, file, pgoff); - /* - * To keep the pgmajfault counter straight, we need to - * check did_readaround, as this is an inner loop. - */ - if (!did_readaround) { - majmin = VM_FAULT_MAJOR; - inc_page_state(pgmajfault); - } - did_readaround = 1; - ra_pages = max_sane_readahead(file->f_ra.ra_pages); - if (ra_pages) { - pgoff_t start = 0; - - if (pgoff > ra_pages / 2) - start = pgoff - ra_pages / 2; - do_page_cache_readahead(mapping, file, start, ra_pages); - } page = find_get_page(mapping, pgoff); if (!page) goto no_cached_page; } - if (!did_readaround) - ra->mmap_hit++; + ra_access(ra, page); + if (READAHEAD_DEBUG_LEVEL(6)) + printk(KERN_DEBUG "read-mmap(ino=%lu, idx=%lu, hint=%s, io=%s)\n", + inode->i_ino, pgoff, + VM_RandomReadHint(area) ? "random" : + (VM_SequentialReadHint(area) ? "sequential" : "none"), + PageUptodate(page) ? "hit" : "miss"); /* * Ok, found a page in the page cache, now we need to check @@ -1276,6 +1304,8 @@ success: mark_page_accessed(page); if (type) *type = majmin; + if (prefer_adaptive_readahead() || !VM_SequentialReadHint(area)) + ra->prev_page = page->index; return page; outside_data_content: @@ -1312,10 +1342,9 @@ no_cached_page: return NULL; page_not_uptodate: - if (!did_readaround) { - majmin = VM_FAULT_MAJOR; - inc_page_state(pgmajfault); - } + majmin = VM_FAULT_MAJOR; + inc_page_state(pgmajfault); + lock_page(page); /* Did it get unhashed while we waited for it? */ Index: linux-2.6.15-rc5-ck1/Documentation/sysctl/vm.txt =================================================================== --- linux-2.6.15-rc5-ck1.orig/Documentation/sysctl/vm.txt +++ linux-2.6.15-rc5-ck1/Documentation/sysctl/vm.txt @@ -27,6 +27,9 @@ Currently, these files are in /proc/sys/ - laptop_mode - block_dump - swap_prefetch +- readahead_ratio +- readahead_hit_rate +- readahead_live_chunk ============================================================== @@ -114,3 +117,46 @@ except when laptop_mode is enabled and t Setting it to 0 disables prefetching entirely. The default value is dependant on ramsize. + +============================================================== + +readahead_ratio + +This limits read-ahead size to percent of the thrashing-threshold. +The thrashing-threshold is dynamicly estimated according to the +_history_ read speed and system load, and used to limit the +_future_ read-ahead request size. + +Set it to a low value if you have not enough memory to counteract +the I/O load fluctuations. But if there's plenty of memory, set it +to a larger value might help increase read speed. + +The default value is 50. + +============================================================== + +readahead_hit_rate + +This is the max allowed value of (read-ahead-pages : accessed-pages). +If the previous read-ahead request has bad hit rate, kernel will be +very conservative to issue the next read-ahead. + +A large value helps speedup some sparse access patterns, at the cost +of more memory consumption. It is recommended to keep the value below +(max-readahead-pages / 8). + +The default value is 2. + +============================================================== + +readahead_live_chunk + +In a file server, there are typically one or more sequential +readers working on a file. The kernel can detect most live +chunks(a sequence of pages to be accessed by an active reader), +and save them for their imminent readers. + +This parameter controls the max allowed chunk size, i.e. the max +number of pages pinned for an active reader. + +The default value is 0(off). Increase it if you have enough memory. Index: linux-2.6.15-rc5-ck1/include/linux/sysctl.h =================================================================== --- linux-2.6.15-rc5-ck1.orig/include/linux/sysctl.h +++ linux-2.6.15-rc5-ck1/include/linux/sysctl.h @@ -185,6 +185,9 @@ enum VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */ VM_SWAP_PREFETCH=29, /* int: amount to swap prefetch */ VM_HARDMAPLIMIT=30, /* Make mapped a hard limit */ + VM_READAHEAD_RATIO=31, /* percent of read-ahead size to thrashing-threshold */ + VM_READAHEAD_HIT_RATE=32, /* one accessed page legitimizes so many read-ahead pages */ + VM_READAHEAD_LIVE_CHUNK=33, /* pin no more than that many pages for a live reader */ }; Index: linux-2.6.15-rc5-ck1/kernel/sysctl.c =================================================================== --- linux-2.6.15-rc5-ck1.orig/kernel/sysctl.c +++ linux-2.6.15-rc5-ck1/kernel/sysctl.c @@ -67,6 +67,9 @@ extern int min_free_kbytes; extern int printk_ratelimit_jiffies; extern int printk_ratelimit_burst; extern int pid_max_min, pid_max_max; +extern int readahead_ratio; +extern int readahead_hit_rate; +extern int readahead_live_chunk; #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) int unknown_nmi_panic; @@ -227,6 +230,7 @@ static ctl_table root_table[] = { /* Constants for minimum and maximum testing. We use these as one-element integer vectors. */ static int zero; +static int one = 1; static int one_hundred = 100; static ctl_table kern_table[] = { @@ -895,6 +899,36 @@ static ctl_table vm_table[] = { }, #endif #endif + { + .ctl_name = VM_READAHEAD_RATIO, + .procname = "readahead_ratio", + .data = &readahead_ratio, + .maxlen = sizeof(readahead_ratio), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, + { + .ctl_name = VM_READAHEAD_HIT_RATE, + .procname = "readahead_hit_rate", + .data = &readahead_hit_rate, + .maxlen = sizeof(readahead_hit_rate), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + .extra1 = &one, + }, + { + .ctl_name = VM_READAHEAD_LIVE_CHUNK, + .procname = "readahead_live_chunk", + .data = &readahead_live_chunk, + .maxlen = sizeof(readahead_live_chunk), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, { .ctl_name = 0 } }; Index: linux-2.6.15-rc5-ck1/include/linux/fs.h =================================================================== --- linux-2.6.15-rc5-ck1.orig/include/linux/fs.h +++ linux-2.6.15-rc5-ck1/include/linux/fs.h @@ -571,19 +571,30 @@ struct fown_struct { * Track a single file's readahead state */ struct file_ra_state { + /* conventional read-ahead */ unsigned long start; /* Current window */ unsigned long size; - unsigned long flags; /* ra flags RA_FLAG_xxx*/ - unsigned long cache_hit; /* cache hit count*/ - unsigned long prev_page; /* Cache last read() position */ unsigned long ahead_start; /* Ahead window */ unsigned long ahead_size; - unsigned long ra_pages; /* Maximum readahead window */ - unsigned long mmap_hit; /* Cache hit stat for mmap accesses */ - unsigned long mmap_miss; /* Cache miss stat for mmap accesses */ + + /* adaptive read-ahead */ + unsigned long age; + pgoff_t la_index; + pgoff_t ra_index; + pgoff_t lookahead_index; + pgoff_t readahead_index; + + /* shared */ + uint64_t cache_hit; /* cache hit count*/ + unsigned long flags; /* ra flags RA_FLAG_xxx*/ + unsigned long prev_page; /* Cache last read() position */ + unsigned long ra_pages; /* Maximum readahead window */ }; #define RA_FLAG_MISS 0x01 /* a cache miss occured against this file */ #define RA_FLAG_INCACHE 0x02 /* file is already in cache */ +#define RA_FLAG_MMAP (1UL<<31) /* mmaped page access */ +#define RA_FLAG_NO_LOOKAHEAD (1UL<<30) /* disable look-ahead */ +#define RA_FLAG_NFSD (1UL<<29) /* request from nfsd */ struct file { /* Index: linux-2.6.15-rc5-ck1/mm/memory.c =================================================================== --- linux-2.6.15-rc5-ck1.orig/mm/memory.c +++ linux-2.6.15-rc5-ck1/mm/memory.c @@ -1982,6 +1982,7 @@ static int do_anonymous_page(struct mm_s page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte_none(*page_table)) goto release; + inc_readahead_aging(); inc_mm_counter(mm, anon_rss); lru_cache_add_active(page); SetPageReferenced(page); Index: linux-2.6.15-rc5-ck1/include/linux/writeback.h =================================================================== --- linux-2.6.15-rc5-ck1.orig/include/linux/writeback.h +++ linux-2.6.15-rc5-ck1/include/linux/writeback.h @@ -90,6 +90,12 @@ void laptop_io_completion(void); void laptop_sync_completion(void); void throttle_vm_writeout(void); +extern struct timer_list laptop_mode_wb_timer; +static inline int laptop_spinned_down(void) +{ + return !timer_pending(&laptop_mode_wb_timer); +} + /* These are exported to sysctl. */ extern int dirty_background_ratio; extern int vm_dirty_ratio; Index: linux-2.6.15-rc5-ck1/mm/page-writeback.c =================================================================== --- linux-2.6.15-rc5-ck1.orig/mm/page-writeback.c +++ linux-2.6.15-rc5-ck1/mm/page-writeback.c @@ -369,7 +369,7 @@ static void wb_timer_fn(unsigned long un static void laptop_timer_fn(unsigned long unused); static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0); -static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); +DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); /* * Periodic writeback of "old" data. Index: linux-2.6.15-rc5-ck1/drivers/block/loop.c =================================================================== --- linux-2.6.15-rc5-ck1.orig/drivers/block/loop.c +++ linux-2.6.15-rc5-ck1/drivers/block/loop.c @@ -769,6 +769,12 @@ static int loop_set_fd(struct loop_devic mapping = file->f_mapping; inode = mapping->host; + /* + * The upper layer should already do proper look-ahead, + * one more look-ahead here only ruins the cache hit rate. + */ + file->f_ra.flags |= RA_FLAG_NO_LOOKAHEAD; + if (!(file->f_mode & FMODE_WRITE)) lo_flags |= LO_FLAGS_READ_ONLY; Index: linux-2.6.15-rc5-ck1/fs/nfsd/vfs.c =================================================================== --- linux-2.6.15-rc5-ck1.orig/fs/nfsd/vfs.c +++ linux-2.6.15-rc5-ck1/fs/nfsd/vfs.c @@ -838,10 +838,14 @@ nfsd_vfs_read(struct svc_rqst *rqstp, st #endif /* Get readahead parameters */ - ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); + if (prefer_adaptive_readahead()) + ra = NULL; + else + ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); if (ra && ra->p_set) file->f_ra = ra->p_ra; + file->f_ra.flags |= RA_FLAG_NFSD; if (file->f_op->sendfile) { svc_pushback_unused_pages(rqstp); Index: linux-2.6.15-rc5-ck1/fs/mpage.c =================================================================== --- linux-2.6.15-rc5-ck1.orig/fs/mpage.c +++ linux-2.6.15-rc5-ck1/fs/mpage.c @@ -343,8 +343,10 @@ mpage_readpages(struct address_space *ma bio = do_mpage_readpage(bio, page, nr_pages - page_idx, &last_block_in_bio, get_block); - if (!pagevec_add(&lru_pvec, page)) + if (!pagevec_add(&lru_pvec, page)) { __pagevec_lru_add(&lru_pvec); + cond_resched(); + } } else { page_cache_release(page); }