When large files are read into the page cache through the generic file read
functions, we can detect when a file is so large that it is unlikely to be
fully cached in RAM.

Add a tunable, /proc/sys/vm/tail_largefiles, that puts pages read from such
files at the tail of the inactive list to minimise their harm to currently
mapped pages and existing pagecache. The tunable is enabled by default.

Signed-off-by: Con Kolivas <kernel@kolivas.org>

---
 Documentation/filesystems/proc.txt |    8 +++++
 Documentation/sysctl/vm.txt        |    2 -
 kernel/sysctl.c                    |    9 ++++++
 mm/filemap.c                       |   53 +++++++++++++++++++++++++++++++++++--
 mm/swap.c                          |    3 --
 5 files changed, 70 insertions(+), 5 deletions(-)

Index: linux-2.6.22-rc4-ck1/mm/filemap.c
===================================================================
--- linux-2.6.22-rc4-ck1.orig/mm/filemap.c	2007-06-10 21:58:50.000000000 +1000
+++ linux-2.6.22-rc4-ck1/mm/filemap.c	2007-06-10 21:59:57.000000000 +1000
@@ -466,6 +466,16 @@ int add_to_page_cache_lru(struct page *p
 	return ret;
 }
 
+int add_to_page_cache_lru_tail(struct page *page,
+	struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
+{
+	int err = add_to_page_cache(page, mapping, offset, gfp_mask);
+
+	if (!err)	/* only successfully cached pages are queued */
+		lru_cache_add_tail(page);
+	return err;
+}
+
 #ifdef CONFIG_NUMA
 struct page *__page_cache_alloc(gfp_t gfp)
 {
@@ -839,6 +849,34 @@ static void shrink_readahead_size_eio(st
 	ra->ra_pages /= 4;
 }
 
+/*
+ * Backs the /proc/sys/vm/tail_largefiles sysctl: when non-zero (the default),
+ * pages read from large files are added to the tail of the inactive lru list.
+ */
+int vm_tail_largefiles __read_mostly = 1;
+
+/* Sum of the NR_FILE_MAPPED and NR_ANON_PAGES global page counters. */
+static inline unsigned long nr_mapped(void)
+{
+	return global_page_state(NR_FILE_MAPPED) +
+		global_page_state(NR_ANON_PAGES);
+}
+
+/*
+ * Returns 1 if nr_pages is more than half the unmapped ram, else 0. The page
+ * state counters are read only when nr_pages exceeds vm_total_pages / 6.
+ * Thresholds are divided, not nr_pages multiplied, so large sizes cannot wrap.
+ */
+static int large_isize(unsigned long nr_pages)
+{
+	unsigned long unmapped_ram;
+
+	if (nr_pages <= vm_total_pages / 6)
+		return 0;
+	unmapped_ram = vm_total_pages - nr_mapped();
+	return nr_pages > unmapped_ram / 2;
+}
+
 /**
  * do_generic_mapping_read - generic file read routine
  * @mapping:	address_space to be read
@@ -1051,8 +1089,19 @@ no_cached_page:
 				goto out;
 			}
 		}
-		error = add_to_page_cache_lru(cached_page, mapping,
-						index, GFP_KERNEL);
+
+		/*
+		 * If we know the file is large we add the pages read to the
+		 * end of the lru as we're unlikely to be able to cache the
+		 * whole file in ram so make those pages the first to be
+		 * dropped if not referenced soon.
+		 */
+		if (vm_tail_largefiles && large_isize(end_index))
+			error = add_to_page_cache_lru_tail(cached_page,
+						mapping, index, GFP_KERNEL);
+		else
+			error = add_to_page_cache_lru(cached_page, mapping,
+							index, GFP_KERNEL);
 		if (error) {
 			if (error == -EEXIST)
 				goto find_page;
Index: linux-2.6.22-rc4-ck1/mm/swap.c
===================================================================
--- linux-2.6.22-rc4-ck1.orig/mm/swap.c	2007-06-10 21:59:55.000000000 +1000
+++ linux-2.6.22-rc4-ck1/mm/swap.c	2007-06-10 21:59:57.000000000 +1000
@@ -434,8 +434,7 @@ void __pagevec_lru_add_active(struct pag
 
 /*
  * Function used uniquely to put pages back to the lru at the end of the
- * inactive list to preserve the lru order. Currently only used by swap
- * prefetch.
+ * inactive list to preserve the lru order.
  */
 void fastcall lru_cache_add_tail(struct page *page)
 {
Index: linux-2.6.22-rc4-ck1/kernel/sysctl.c
===================================================================
--- linux-2.6.22-rc4-ck1.orig/kernel/sysctl.c	2007-06-10 21:59:55.000000000 +1000
+++ linux-2.6.22-rc4-ck1/kernel/sysctl.c	2007-06-10 21:59:57.000000000 +1000
@@ -71,6 +71,7 @@ extern int suid_dumpable;
 extern char core_pattern[];
 extern int pid_max;
 extern int min_free_kbytes;
+extern int vm_tail_largefiles;
 extern int printk_ratelimit_jiffies;
 extern int printk_ratelimit_burst;
 extern int pid_max_min, pid_max_max;
@@ -675,6 +676,14 @@ static ctl_table kern_table[] = {
 
 static ctl_table vm_table[] = {
 	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "tail_largefiles",
+		.data		= &vm_tail_largefiles,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
 		.ctl_name	= VM_OVERCOMMIT_MEMORY,
 		.procname	= "overcommit_memory",
 		.data		= &sysctl_overcommit_memory,
Index: linux-2.6.22-rc4-ck1/Documentation/filesystems/proc.txt
===================================================================
--- linux-2.6.22-rc4-ck1.orig/Documentation/filesystems/proc.txt	2007-06-10 21:58:51.000000000 +1000
+++ linux-2.6.22-rc4-ck1/Documentation/filesystems/proc.txt	2007-06-10 21:59:57.000000000 +1000
@@ -1333,6 +1333,14 @@ To free pagecache, dentries and inodes:
 As this is a non-destructive operation and dirty objects are not freeable, the
 user should run `sync' first.
 
+tail_largefiles
+---------------
+
+When enabled, pages read from large files are placed at the tail end of the
+inactive lru list, so cache from reading large files is dropped very quickly,
+preventing loss of mapped ram and useful pagecache when large files are read.
+This does, however, make caching less effective when working with large files.
+
 
 2.5 /proc/sys/dev - Device specific parameters
 ----------------------------------------------
Index: linux-2.6.22-rc4-ck1/Documentation/sysctl/vm.txt
===================================================================
--- linux-2.6.22-rc4-ck1.orig/Documentation/sysctl/vm.txt	2007-06-10 21:59:55.000000000 +1000
+++ linux-2.6.22-rc4-ck1/Documentation/sysctl/vm.txt	2007-06-10 21:59:57.000000000 +1000
@@ -39,7 +39,7 @@ Currently, these files are in /proc/sys/
 
 dirty_ratio, dirty_background_ratio, dirty_expire_centisecs,
 dirty_writeback_centisecs, vfs_cache_pressure, laptop_mode,
-block_dump, swap_token_timeout, drop-caches:
+block_dump, swap_token_timeout, drop-caches, tail_largefiles:
 
 See Documentation/filesystems/proc.txt
 
