Index: linux-2.6.8.1-ck/Documentation/filesystems/reiser4.txt
===================================================================
--- linux-2.6.8.1-ck.orig/Documentation/filesystems/reiser4.txt	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/Documentation/filesystems/reiser4.txt	2004-08-22 19:35:33.600656044 +1000
@@ -0,0 +1,97 @@
+Reiser4 filesystem
+==================
+Reiser4 is a file system based on dancing tree algorithms, and is
+described at http://www.namesys.com
+
+
+References
+==========
+web page		http://namesys.com/v4/v4.html
+source code and
+userland tools		http://thebsh.namesys.com/snapshots/LATEST
+
+
+Compile options
+===============
+Use larger keys on reiser4 tree
+	Make keys larger and use additional bits to order bodies of
+	files within a directory in the order of their names, which is
+	what you want normally. If you turn this off, file bodies will
+	be ordered by creation time, which is not optimal for most
+	users.
+
+	Turn this off to mount filesystems created with non-default key plugin:
+	mkfs.reiser4 -o key=key_short /dev/hdb6
+	You can find which key plugin your filesystem is created with:
+	debugfs.reiser4 /dev/hdb6 | grep "key policy"
+	If it says:
+	key policy:	SHORT
+	then you need to turn this option OFF
+	If it says
+	key policy:	LARGE
+	then you have to turn this ON
+	Note, that currently you can mount either only filesystems
+	with large keys or only filesystems with small keys.
+
+Enable reiser4 debug options
+	Makes a number of additional options available. They are meant
+	to be used for debugging/profiling purposes. "make menuconfig"
+	provides thorough documentation for each of them.
+
+
+Mount options
+=============
+tmgr.atom_max_size=N
+	Atoms containing more than N blocks will be forced to commit.
+	N is decimal.
+	Default is nr_free_pagecache_pages() / 2 at mount time.
+
+tmgr.atom_max_age=N
+	Atoms older than N seconds will be forced to commit. N is decimal.
+	Default is 600.
+
+tmgr.atom_max_flushers=N
+	Limit of concurrent flushers for one atom. 0 means no limit.
+	Default is 0.
+
+tree.cbk_cache.nr_slots=N
+	Number of slots in the cbk cache.
+
+flush.relocate_threshold=N
+	If flush finds more than N adjacent dirty leaf-level blocks it
+	will force them to be relocated.
+	Default is 64.
+
+flush.relocate_distance=N
+	If flush can find a block allocation no farther than N from
+	the preceder, it will relocate to that position.
+	Default is 64.
+
+flush.scan_maxnodes=N
+	The maximum number of nodes to scan left on a level during
+	flush.
+	Default is 10000.
+
+optimal_io_size=N
+	Preferred IO size. This value is used to set st_blksize of
+	struct stat.
+	Default is 65536.
+
+bsdgroups
+	Turn on BSD-style gid assignment.
+
+32bittimes
+	By default files in reiser4 have 64 bit timestamps. Files
+	created when filesystem is mounted with 32bittimes mount
+	option will get 32 bit timestamps.
+
+mtflush
+	Turn off concurrent flushing.
+
+nopseudo
+	Disable pseudo files support. See
+	http://namesys.com/v4/pseudo.html for more about pseudo files.
+
+dont_load_bitmap
+	Don't load all bitmap blocks at mount time. This is useful for
+	machines with tiny RAM and large disks.
Index: linux-2.6.8.1-ck/fs/fs-writeback.c
===================================================================
--- linux-2.6.8.1-ck.orig/fs/fs-writeback.c	2004-08-22 19:35:02.536608053 +1000
+++ linux-2.6.8.1-ck/fs/fs-writeback.c	2004-08-22 19:35:33.601655885 +1000
@@ -289,7 +289,7 @@ __writeback_single_inode(struct inode *i
  * throttled threads: we don't want them all piling up on __wait_on_inode.
  */
 static void
-sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
+generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 {
 	const unsigned long start = jiffies;	/* livelock avoidance */
 
@@ -369,6 +369,15 @@ sync_sb_inodes(struct super_block *sb, s
 	return;		/* Leave any unwritten inodes on s_io */
 }
 
+static void
+sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
+{
+	if (sb->s_op->sync_inodes == NULL)
+		generic_sync_sb_inodes(sb, wbc);
+	else
+		sb->s_op->sync_inodes(sb, wbc);
+}
+
 /*
  * Start writeback of dirty pagecache data against all unlocked inodes.
  *
Index: linux-2.6.8.1-ck/fs/inode.c
===================================================================
--- linux-2.6.8.1-ck.orig/fs/inode.c	2004-08-22 19:35:30.985072954 +1000
+++ linux-2.6.8.1-ck/fs/inode.c	2004-08-22 19:35:33.602655725 +1000
@@ -81,6 +81,7 @@ static struct hlist_head *inode_hashtabl
  * the i_state of an inode while it is in use..
  */
 spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
+EXPORT_SYMBOL(inode_lock);
 
 /*
  * iprune_sem provides exclusion between the kswapd or try_to_free_pages
@@ -96,6 +97,7 @@ DECLARE_MUTEX(iprune_sem);
  * Statistics gathering..
  */
 struct inodes_stat_t inodes_stat;
+EXPORT_SYMBOL(inodes_stat);
 
 static kmem_cache_t * inode_cachep;
 
@@ -180,7 +182,7 @@ void destroy_inode(struct inode *inode) 
 	else
 		kmem_cache_free(inode_cachep, (inode));
 }
-
+EXPORT_SYMBOL(destroy_inode);
 
 /*
  * These are initializations that only need to be done
@@ -232,6 +234,7 @@ void __iget(struct inode * inode)
 		list_move(&inode->i_list, &inode_in_use);
 	inodes_stat.nr_unused--;
 }
+EXPORT_SYMBOL(__iget);
 
 /**
  * clear_inode - clear an inode
@@ -1023,7 +1026,7 @@ void generic_delete_inode(struct inode *
 
 EXPORT_SYMBOL(generic_delete_inode);
 
-static void generic_forget_inode(struct inode *inode)
+void generic_forget_inode(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
 
@@ -1049,6 +1052,7 @@ static void generic_forget_inode(struct 
 	clear_inode(inode);
 	destroy_inode(inode);
 }
+EXPORT_SYMBOL(generic_forget_inode);
 
 /*
  * Normal UNIX filesystem behaviour: delete the
@@ -1321,6 +1325,7 @@ void wake_up_inode(struct inode *inode)
 	if (waitqueue_active(wq))
 		wake_up_all(wq);
 }
+EXPORT_SYMBOL(wake_up_inode);
 
 static __initdata unsigned long ihash_entries;
 static int __init set_ihash_entries(char *str)
Index: linux-2.6.8.1-ck/fs/Kconfig
===================================================================
--- linux-2.6.8.1-ck.orig/fs/Kconfig	2004-08-22 19:35:18.616044630 +1000
+++ linux-2.6.8.1-ck/fs/Kconfig	2004-08-22 19:35:33.603655566 +1000
@@ -160,6 +160,8 @@ config FS_MBCACHE
 	default y if EXT2_FS=y || EXT3_FS=y
 	default m if EXT2_FS=m || EXT3_FS=m
 
+source "fs/Kconfig.reiser4"
+
 config REISERFS_FS
 	tristate "Reiserfs support"
 	help
Index: linux-2.6.8.1-ck/fs/Kconfig.reiser4
===================================================================
--- linux-2.6.8.1-ck.orig/fs/Kconfig.reiser4	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/Kconfig.reiser4	2004-08-22 19:35:33.604655407 +1000
@@ -0,0 +1,162 @@
+config REISER4_FS
+	tristate "Reiser4 (EXPERIMENTAL very fast general purpose filesystem)"
+	depends on EXPERIMENTAL
+	default y
+	---help---
+	  Reiser4 is more than twice as fast for both reads and writes as
+	  ReiserFS V3, and is the fastest Linux filesystem, by a lot,
+	  for typical IO intensive workloads.  [It is slow at fsync
+	  intensive workloads as it is not yet optimized for fsync
+	  (sponsors are welcome for that work), and it is instead
+	  optimized for atomicity, see below.]  Benchmarks that define
+	  "a lot" are at http://www.namesys.com/benchmarks.html.
+
+	  It is the storage layer of what will become a general purpose naming
+	  system --- like what Microsoft wants WinFS to be except designed with a
+	  clean new semantic layer rather than being SQL based like WinFS.
+	  For details read http://www.namesys.com/whitepaper.html
+
+	  It performs all filesystem operations as atomic transactions, which
+	  means that it either performs a write, or it does not, and in the
+	  event of a crash it does not partially perform it or corrupt it.
+	  Many applications that currently use fsync don't need to if they use
+	  reiser4, and that means a lot for performance.  An API for performing
+	  multiple file system operations as one high performance atomic write
+	  is almost finished.
+
+	  It stores files in dancing trees, which are like balanced trees but
+	  faster.  It packs small files together so that they share blocks
+	  without wasting space.  This means you can use it to store really
+	  small files.  It also means that it saves you disk space.  It avoids
+	  hassling you with anachronisms like having a maximum number of
+	  inodes, and wasting space if you use less than that number.
+
+	  It can handle really large directories, because its search
+	  algorithms are logarithmic with size not linear.  With Reiser4 you
+	  should use subdirectories because they help YOU, not because they
+	  help your filesystem's performance, or because your filesystem won't
+	  be able to shrink a directory once you have let it grow.  For squid
+	  and similar applications, everything in one directory should perform
+	  better.
+
+	  It has a plugin-based infrastructure, which means that you can easily
+	  invent new kinds of files, and so can other people, so it will evolve
+	  rapidly.
+
+	  We will be adding a variety of security features to it that DARPA has
+	  funded us to write.
+
+	  "reiser4" is a distinct filesystem mount type from "reiserfs" (V3),
+	  which means that "reiserfs" filesystems will be unaffected by any
+	  reiser4 bugs.
+
+	  ReiserFS V3 is the stablest Linux filesystem, and V4 is the fastest.
+
+	  In regards to claims by ext2 that they are the de facto
+	  standard Linux filesystem, the most polite thing to say is that
+          many persons disagree, and it is interesting that those persons
+	  seem to include the distros that are growing in market share.
+	  See http://www.namesys.com/benchmarks.html for why many disagree.
+
+          If you'd like to upgrade from reiserfs to reiser4, use tar to a
+	  temporary disk, maybe using NFS/ssh/SFS to get to that disk, or ask
+	  your favorite distro to sponsor writing a conversion program.
+
+	  Sponsored by the Defense Advanced Research Projects Agency (DARPA)
+	  of the United States Government.  DARPA does not endorse this
+	  project, it merely sponsors it.
+	  See http://www.darpa.mil/ato/programs/chats.htm
+
+	  If you would like to learn about our plans to add
+	  military grade security to reiser4, please read
+	  http://www.namesys.com/blackbox_security.html.
+
+	  To learn more about reiser4, go to http://www.namesys.com
+
+config REISER4_LARGE_KEY
+	bool "Use larger keys on reiser4 tree"
+	depends on REISER4_FS
+	default y
+	---help---
+      Make keys larger and use additional bits to order bodies of files within
+      a directory in the order of their names, which is what you want
+      normally. If you turn this off, file bodies will be ordered by creation
+      time, which is not optimal for most users.
+
+      Warning: flipping this option makes your file system binary
+      incompatible.
+
+config REISER4_CHECK
+	bool "Enable reiser4 debug options"
+	depends on REISER4_FS
+	---help---
+	  Don't use this unless you are a developer debugging reiser4.  If
+	  using a kernel made by a distro that thinks they are our competitor
+	  (sigh) rather than made by Linus, always check each release to make
+	  sure they have not turned this on to make us look slow as was done
+	  once in the past.  This checks everything imaginable while reiser4
+	  runs.
+
+	  When adding features to reiser4 you should set this, and then
+	  extensively test the code, and then send to us and we will test it
+	  again.  Include a description of what you did to test it.  All
+	  reiser4 code must be tested, reviewed, and signed off on by two
+	  persons before it will be accepted into a stable kernel by Hans.
+
+config REISER4_DEBUG
+	bool "Assertions"
+	depends on REISER4_CHECK && REISER4_FS!=m
+	help
+	  Turns on assertion checks. Eats a lot of CPU.
+
+config REISER4_DEBUG_MODIFY
+	bool "Dirtying"
+	depends on REISER4_CHECK
+	help
+	  Check that node is marked dirty each time it's modified. This is done
+	  through maintaining checksum of node content. CPU hog.
+
+config REISER4_DEBUG_MEMCPY
+	bool "Memory copying"
+	depends on REISER4_CHECK
+	help
+	  Use special non-inlined versions of memcpy, memset, and memmove in
+	  reiser4 to estimate amount of CPU time spent in data copying.
+
+config REISER4_DEBUG_NODE
+	bool "Node consistency"
+	depends on REISER4_CHECK
+	help
+	  Run consistency checks on nodes in balanced tree. CPU hog.
+
+config REISER4_ZERO_NEW_NODE
+	bool "Node zeroing"
+	depends on REISER4_CHECK
+	help
+	  Zero new node before use.
+
+config REISER4_TRACE
+	bool "Tracing"
+	depends on REISER4_CHECK
+	help
+	  Turn on tracing facility. This enables trace_flags mount option.
+
+config REISER4_EVENT_LOG
+	bool "Log events"
+	depends on REISER4_CHECK
+	help
+	  Log events into user supplied file. This enables trace_file mount option.
+
+config REISER4_STATS
+	bool "Statistics"
+	depends on REISER4_CHECK
+	help
+	  Turn on statistics collection. This increases size of in-memory super
+	  block considerably.
+
+config REISER4_DEBUG_OUTPUT
+	bool "Printing"
+	depends on REISER4_CHECK
+	help
+	  Enable compilation of functions that print internal kernel data
+	  structures in human readable form. Useful for debugging.
Index: linux-2.6.8.1-ck/fs/Makefile
===================================================================
--- linux-2.6.8.1-ck.orig/fs/Makefile	2004-08-22 19:35:18.617044470 +1000
+++ linux-2.6.8.1-ck/fs/Makefile	2004-08-22 19:35:33.604655407 +1000
@@ -46,6 +46,7 @@ obj-$(CONFIG_PROFILING)		+= dcookies.o
  
 # Do not add any filesystems before this line
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
+obj-$(CONFIG_REISER4_FS)	+= reiser4/
 obj-$(CONFIG_EXT3_FS)		+= ext3/ # Before ext2 so root fs can be ext3
 obj-$(CONFIG_JBD)		+= jbd/
 obj-$(CONFIG_EXT2_FS)		+= ext2/
Index: linux-2.6.8.1-ck/fs/namei.c
===================================================================
--- linux-2.6.8.1-ck.orig/fs/namei.c	2004-08-22 19:10:09.334926374 +1000
+++ linux-2.6.8.1-ck/fs/namei.c	2004-08-22 19:35:33.605655247 +1000
@@ -1165,7 +1165,7 @@ struct dentry *lock_rename(struct dentry
 {
 	struct dentry *p;
 
-	if (p1 == p2) {
+	if (p1->d_inode == p2->d_inode) {
 		down(&p1->d_inode->i_sem);
 		return NULL;
 	}
@@ -1196,7 +1196,7 @@ struct dentry *lock_rename(struct dentry
 void unlock_rename(struct dentry *p1, struct dentry *p2)
 {
 	up(&p1->d_inode->i_sem);
-	if (p1 != p2) {
+	if (p1->d_inode != p2->d_inode) {
 		up(&p2->d_inode->i_sem);
 		up(&p1->d_inode->i_sb->s_vfs_rename_sem);
 	}
Index: linux-2.6.8.1-ck/fs/reiser4/as_ops.c
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/as_ops.c	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/as_ops.c	2004-08-22 19:35:33.606655088 +1000
@@ -0,0 +1,688 @@
+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Interface to VFS. Reiser4 address_space_operations are defined here. */
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "coord.h"
+#include "plugin/item/item.h"
+#include "plugin/file/file.h"
+#include "plugin/security/perm.h"
+#include "plugin/disk_format/disk_format.h"
+#include "plugin/plugin.h"
+#include "plugin/plugin_set.h"
+#include "plugin/object.h"
+#include "txnmgr.h"
+#include "jnode.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree.h"
+#include "log.h"
+#include "vfs_ops.h"
+#include "inode.h"
+#include "page_cache.h"
+#include "ktxnmgrd.h"
+#include "super.h"
+#include "reiser4.h"
+#include "kattr.h"
+#include "entd.h"
+#include "emergency_flush.h"
+
+#include <linux/profile.h>
+#include <linux/types.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/mm.h>
+#include <linux/buffer_head.h>
+#include <linux/dcache.h>
+#include <linux/list.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/quotaops.h>
+#include <linux/security.h>
+
+/* address space operations */
+
+static int reiser4_readpage(struct file *, struct page *);
+
+static int reiser4_prepare_write(struct file *,
+				 struct page *, unsigned, unsigned);
+
+static int reiser4_commit_write(struct file *,
+				struct page *, unsigned, unsigned);
+
+static int reiser4_set_page_dirty (struct page *);
+static sector_t reiser4_bmap(struct address_space *, sector_t);
+/* static int reiser4_direct_IO(int, struct inode *,
+			     struct kiobuf *, unsigned long, int); */
+
+/* address space operations */
+
+/* Clear dirty bit of @page and its PAGECACHE_TAG_DIRTY and PAGECACHE_TAG_REISER4_MOVED tags. This is used in
+   uncapture_page. This resembles test_clear_page_dirty, except that page's mapping must exist (BUG otherwise) */
+reiser4_internal void
+reiser4_clear_page_dirty(struct page *page)
+{
+	struct address_space *mapping;
+	unsigned long flags;
+
+	mapping = page->mapping;
+	BUG_ON(mapping == NULL);
+
+	write_lock_irqsave(&mapping->tree_lock, flags);
+	if (TestClearPageDirty(page)) {
+		/* clear dirty tag in the radix tree; tags are modified, hence tree_lock is held for writing */
+		radix_tree_tag_clear(&mapping->page_tree, page->index,
+				     PAGECACHE_TAG_DIRTY);
+		/* FIXME: remove this when reiser4_set_page_dirty will skip setting this tag for captured pages */
+		radix_tree_tag_clear(&mapping->page_tree, page->index,
+				     PAGECACHE_TAG_REISER4_MOVED);
+
+		write_unlock_irqrestore(&mapping->tree_lock, flags);
+		if (!mapping->backing_dev_info->memory_backed)
+			dec_page_state(nr_dirty);
+		return;
+	}
+	write_unlock_irqrestore(&mapping->tree_lock, flags);
+}
+
+/* as_ops->set_page_dirty() VFS method in reiser4_address_space_operations.
+
+   It is used by others (except reiser4) to set reiser4 pages dirty. Reiser4
+   itself uses set_page_dirty_internal().
+
+   The difference is that reiser4_set_page_dirty sets MOVED tag on the page and clears DIRTY tag. Pages tagged as MOVED
+   get processed by reiser4_writepages() to do reiser4 specific work over dirty pages (allocation jnode, capturing, atom
+   creation) which cannot be done in the contexts where reiser4_set_page_dirty is called.
+   set_page_dirty_internal sets DIRTY tag and clears MOVED. Tags are changed under tree_lock taken for writing.
+*/
+static int reiser4_set_page_dirty(struct page *page /* page to mark dirty */)
+{
+	if (!TestSetPageDirty(page)) {
+		struct address_space *mapping = page->mapping;
+
+		if (mapping) {
+			write_lock_irq(&mapping->tree_lock);
+			/* check for race with truncate */
+			if (page->mapping) {
+				assert("vs-1652", page->mapping == mapping);
+				if (!mapping->backing_dev_info->memory_backed)
+					inc_page_state(nr_dirty);
+				radix_tree_tag_clear(&mapping->page_tree,
+						   page->index, PAGECACHE_TAG_DIRTY);
+				/* FIXME: it would be nice to not set this tag on pages which are captured already */
+				radix_tree_tag_set(&mapping->page_tree,
+						   page->index, PAGECACHE_TAG_REISER4_MOVED);
+			}
+			write_unlock_irq(&mapping->tree_lock);
+			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+		}
+	}
+	return 0;
+}
+
+/* ->readpage() VFS method in reiser4 address_space_operations, method serving file mmapping. Always returns 0:
+   failure of plugin's ->readpage() is reported by SetPageError() on @page, which is also unlocked in this case.
+*/
+static int
+reiser4_readpage(struct file *f /* file to read from */ ,
+		 struct page *page	/* page where to read data
+					 * into */ )
+{
+	struct inode *inode;
+	file_plugin *fplug;
+	int result;
+	reiser4_context ctx;
+
+	/*
+	 * basically calls ->readpage method of object plugin and handles
+	 * errors. Plugin without ->readpage is treated as -EINVAL failure.
+	 */
+
+	assert("umka-078", f != NULL);
+	assert("umka-079", page != NULL);
+	assert("nikita-2280", PageLocked(page));
+	assert("vs-976", !PageUptodate(page));
+
+	assert("vs-318", page->mapping && page->mapping->host);
+	assert("nikita-1352", (f == NULL) || (f->f_dentry->d_inode == page->mapping->host));
+
+	/* ->readpage can be called from page fault service routine */
+	assert("nikita-3174", schedulable());
+
+	inode = page->mapping->host;
+	init_context(&ctx, inode->i_sb);
+	fplug = inode_file_plugin(inode);
+	if (fplug->readpage != NULL)
+		result = fplug->readpage(f, page);
+	else
+		result = RETERR(-EINVAL);
+	if (result != 0) {
+		SetPageError(page);
+		unlock_page(page);
+	}
+
+	reiser4_exit_context(&ctx);
+	return 0;
+}
+
+/* ->readpages() VFS method in reiser4 address_space_operations
+   method serving page cache readahead
+
+   reiser4_readpages works in the following way: on input it has coord which is set on extent that addresses first of
+   pages for which read requests are to be issued. So, reiser4_readpages just walks forward through extent unit, finds
+   which blocks are to be read and start read for them.
+
+   reiser4_readpages can be called from two places:
+
+   1. sys_read->reiser4_read->read_unix_file->read_extent->page_cache_readahead
+      In this case coord is set by reiser4 read code. This case is detected by
+      if (is_in_reiser4_context()).
+
+   2. handling page fault:
+      handle_mm_fault->do_no_page->filemap_nopage->page_cache_readaround
+      In this case coord is not set and, currently, reiser4_readpages does
+      nothing.
+
+   @nr_pages is not used. Always returns 0.
+*/
+static int
+reiser4_readpages(struct file *file, struct address_space *mapping,
+		  struct list_head *pages, unsigned nr_pages)
+{
+	if (is_in_reiser4_context()) {
+		/* called from a reiser4 method which reads into page cache
+		   (read_extent, for example): let file plugin start reads */
+		file_plugin *plugin = inode_file_plugin(mapping->host);
+
+		if (plugin->readpages != NULL)
+			plugin->readpages(file, mapping, pages);
+	}
+	/* otherwise we are called from page fault, and, currently, no
+	 * readahead is done in this case. */
+
+	/* __do_page_cache_readahead expects filesystem's readpages method to
+	 * process every page on this list: drop whatever is still there */
+	while (!list_empty(pages)) {
+		struct page *last;
+
+		last = list_entry(pages->prev, struct page, lru);
+		list_del(&last->lru);
+		page_cache_release(last);
+	}
+	return 0;
+}
+
+/* prepares @page to be written. This means, that if we want to modify only some
+   part of page, page should be read first and then modified. Actually this function
+   is almost the same as reiser4_readpage(). The only difference is that it does not
+   unlock the page in the case of error. This is needed because loop back device
+   driver expects it locked. */
+static int reiser4_prepare_write(struct file *file, struct page *page,
+				 unsigned from, unsigned to)
+{
+	reiser4_context ctx;
+	struct inode *host;
+	file_plugin *plugin;
+	int ret;
+
+	host = page->mapping->host;
+	init_context(&ctx, host->i_sb);
+	plugin = inode_file_plugin(host);
+
+	if (plugin->prepare_write == NULL)
+		ret = RETERR(-EINVAL);
+	else
+		ret = plugin->prepare_write(file, page, from, to);
+
+	/* don't commit transaction under inode semaphore */
+	context_set_commit_async(&ctx);
+	reiser4_exit_context(&ctx);
+
+	return ret;
+}
+
+/* captures jnode of @page to current atom via ->capturepage() of file plugin. */
+static int reiser4_commit_write(struct file *file, struct page *page,
+				unsigned from, unsigned to)
+{
+	reiser4_context ctx;
+	struct inode *host;
+	file_plugin *plugin;
+	int ret;
+
+	assert("umka-3101", file != NULL);
+	assert("umka-3102", page != NULL);
+	assert("umka-3093", PageLocked(page));
+
+	SetPageUptodate(page);
+
+	host = page->mapping->host;
+	init_context(&ctx, host->i_sb);
+	plugin = inode_file_plugin(host);
+
+	if (plugin->capturepage == NULL)
+		ret = RETERR(-EINVAL);
+	else
+		ret = plugin->capturepage(page);
+
+	/* page has to be still locked at this point. */
+	assert("umka-3103", PageLocked(page));
+
+	/* don't commit transaction under inode semaphore */
+	context_set_commit_async(&ctx);
+	reiser4_exit_context(&ctx);
+	return ret;
+}
+
+/* ->writepages()
+   ->vm_writeback()
+   ->set_page_dirty()
+   ->prepare_write()
+   ->commit_write()
+*/
+
+/* helper of ->bmap(): store in *@blocknr what ->get_block() of file plugin maps @lblock to; 0 or -EINVAL */
+reiser4_internal int
+reiser4_lblock_to_blocknr(struct address_space *mapping,
+			  sector_t lblock, reiser4_block_nr *blocknr)
+{
+	reiser4_context ctx;
+	file_plugin *plugin;
+	int ret;
+
+	init_context(&ctx, mapping->host->i_sb);
+	reiser4_stat_inc(vfs_calls.bmap);
+	plugin = inode_file_plugin(mapping->host);
+	if (plugin == NULL || plugin->get_block == NULL)
+		ret = RETERR(-EINVAL);
+	else {
+		*blocknr = generic_block_bmap(mapping, lblock, plugin->get_block);
+		ret = 0;
+	}
+	reiser4_exit_context(&ctx);
+	return ret;
+}
+
+/* ->bmap() VFS method in reiser4 address_space_operations. Returns block number of @lblock, or 0 if there is none */
+static sector_t
+reiser4_bmap(struct address_space *mapping, sector_t lblock)
+{
+	reiser4_block_nr blocknr;
+
+	/* sector_t is unsigned: negative error code returned as is would
+	 * look like a huge, but valid, block number to the caller */
+	if (reiser4_lblock_to_blocknr(mapping, lblock, &blocknr) != 0)
+		return 0;
+
+	/* fake block numbers are filtered out only when sector_t and reiser4_block_nr differ in size */
+	if (sizeof blocknr == sizeof(sector_t) ||
+	    !blocknr_is_fake(&blocknr))
+		return blocknr;
+	return 0;
+}
+
+/* ->invalidatepage method for reiser4 */
+
+/*
+ * this is called for each truncated page from
+ * truncate_inode_pages()->truncate_{complete,partial}_page().
+ *
+ * At the moment of call, page is under lock, and outstanding io (if any) has
+ * completed. Returns result of capturing the page (0 for special inodes).
+ */
+
+reiser4_internal int
+reiser4_invalidatepage(struct page *page /* page to invalidate */,
+		       unsigned long offset /* starting offset for partial
+					     * invalidation */)
+{
+	int ret = 0;
+	reiser4_context ctx;
+	struct inode *inode;
+
+	/*
+	 * This is called to truncate file's page.
+	 *
+	 * Originally, reiser4 implemented truncate in a standard way
+	 * (vmtruncate() calls ->invalidatepage() on all truncated pages
+	 * first, then file system ->truncate() call-back is invoked).
+	 *
+	 * This led to the problem when ->invalidatepage() was called on a
+	 * page with jnode that was captured into atom in ASTAGE_PRE_COMMIT
+	 * process. That is, truncate was bypassing transactions. To avoid
+	 * this, try_capture_page_to_invalidate() call was added here.
+	 *
+	 * After many troubles with vmtruncate() based truncate (including
+	 * races with flush, tail conversion, etc.) it was re-written in the
+	 * top-to-bottom style: items are killed in cut_tree_object() and
+	 * pages belonging to extent are invalidated in kill_hook_extent(). So
+	 * probably now additional call to capture is not needed here.
+	 *
+	 */
+
+	assert("nikita-3137", PageLocked(page));
+	assert("nikita-3138", !PageWriteback(page));
+	inode = page->mapping->host;
+
+	/*
+	 * ->invalidatepage() should only be called for the unformatted
+	 * jnodes. Destruction of all other types of jnodes is performed
+	 * separately. But, during some corner cases (like handling errors
+	 * during mount) it is simpler to let ->invalidatepage to be called on
+	 * them. Check for this, and do nothing.
+	 */
+	if (get_super_fake(inode->i_sb) == inode)
+		return 0;
+	if (get_cc_fake(inode->i_sb) == inode)
+		return 0;
+	if (get_super_private(inode->i_sb)->bitmap == inode)
+		return 0;
+
+	assert("vs-1426", PagePrivate(page));
+	assert("vs-1427", page->mapping == jnode_get_mapping(jnode_by_page(page)));
+
+	init_context(&ctx, inode->i_sb);
+	/* capture page being truncated. Failure is only reported, invalidation goes on. */
+	ret = try_capture_page_to_invalidate(page);
+	if (ret != 0) {
+		warning("nikita-3141", "Cannot capture: %i", ret);
+		print_page("page", page);
+	}
+
+	if (offset == 0) {
+		jnode *node;
+
+		/* whole page goes away: remove jnode from transaction and detach it from page. */
+		node = jnode_by_page(page);
+		if (node != NULL) {
+			assert("vs-1435", !JF_ISSET(node, JNODE_CC));
+			jref(node);
+ 			JF_SET(node, JNODE_HEARD_BANSHEE);
+			/* page cannot be detached from jnode concurrently,
+			 * because it is locked */
+			uncapture_page(page);
+
+			/* this detaches page from jnode, so that jdelete will not try to lock page which is already locked */
+			UNDER_SPIN_VOID(jnode,
+					node,
+					page_clear_jnode(page, node));
+			unhash_unformatted_jnode(node);
+
+			jput(node);
+		}
+	}
+	reiser4_exit_context(&ctx);
+	return ret;
+}
+
+#define INC_STAT(page, node, counter)						\
+	reiser4_stat_inc_at((page)->mapping->host->i_sb,			\
+			    level[jnode_get_level(node)].counter)
+/* INC_STAT: reiser4_stat_inc_at() on @counter of @node's level, for sb of @page; INC_NSTAT: same with jnode_page() */
+#define INC_NSTAT(node, counter) INC_STAT(jnode_page(node), node, counter)
+
+int is_cced(const jnode *node);
+
+/* helper called from reiser4_releasepage() with jnode spin lock held. Returns
+ * true (1) if jnode can be detached from its page and page released. */
+static int
+releasable(const jnode *node /* node to check */)
+{
+	assert("nikita-2781", node != NULL);
+	assert("nikita-2783", spin_jnode_is_locked(node));
+
+	/* if some thread is currently using jnode page, the latter cannot be
+	 * detached */
+	if (atomic_read(&node->d_count) != 0) {
+		INC_NSTAT(node, vm.release.loaded);
+		return 0;
+	}
+
+	assert("vs-1214", !jnode_is_loaded(node));
+
+	/* this jnode is just a copy. Its page cannot be released, because
+	 * otherwise next jload() would load obsolete data from disk
+	 * (up-to-date version may still be in memory). */
+	if (is_cced(node)) {
+		INC_NSTAT(node, vm.release.copy);
+		return 0;
+	}
+
+	/* emergency flushed page can be released. This is what emergency
+	 * flush is all about after all. */
+	if (JF_ISSET(node, JNODE_EFLUSH)) {
+		INC_NSTAT(node, vm.release.eflushed);
+		return 1; /* yeah! */
+	}
+
+	/* can only release page if real block number is assigned to
+	   it. Simple check for ->atom wouldn't do, because it is possible for
+	   node to be clean, not in atom yet, and still having fake block
+	   number. For example, node just created in jinit_new(). */
+	if (blocknr_is_fake(jnode_get_block(node))) {
+		INC_NSTAT(node, vm.release.fake);
+		return 0;
+	}
+	/* dirty jnode cannot be released. It can however be submitted to disk
+	 * as part of early flushing, but only after getting flush-prepped. */
+	if (jnode_is_dirty(node)) {
+		INC_NSTAT(node, vm.release.dirty);
+		return 0;
+	}
+	/* overwrite set is only written by log writer. */
+	if (JF_ISSET(node, JNODE_OVRWR)) {
+		INC_NSTAT(node, vm.release.ovrwr);
+		return 0;
+	}
+	/* jnode is already under writeback */
+	if (JF_ISSET(node, JNODE_WRITEBACK)) {
+		INC_NSTAT(node, vm.release.writeback);
+		return 0;
+	}
+	/* page was modified through mmap, but its jnode is not yet
+	 * captured. Don't discard modified data. */
+	if (jnode_is_unformatted(node) && JF_ISSET(node, JNODE_KEEPME)) {
+		INC_NSTAT(node, vm.release.keepme);
+		return 0;
+	}
+	/* don't flush bitmaps or journal records */
+	if (!jnode_is_znode(node) && !jnode_is_unformatted(node)) {
+		INC_NSTAT(node, vm.release.bitmap);
+		return 0;
+	}
+	return 1;
+}
+
+#if REISER4_DEBUG
+int jnode_is_releasable(jnode *node)
+{
+	return UNDER_SPIN(jload, node, releasable(node));
+}
+#endif
+
+/*
+ * ->releasepage method for reiser4
+ *
+ * This is called by VM scanner when it comes across clean page.  What we have
+ * to do here is to check whether page can really be released (freed that is,
+ * see releasable()) and if so, detach jnode from it and remove page from the
+ * page cache. Returns 1 if jnode was detached from the page, and 0 if page
+ * has extra references, is dirty, or its jnode is not releasable.
+ */
+reiser4_internal int
+reiser4_releasepage(struct page *page, int gfp UNUSED_ARG)
+{
+	jnode *node;
+	void *oid;
+
+	assert("nikita-2257", PagePrivate(page));
+	assert("nikita-2259", PageLocked(page));
+	assert("nikita-2892", !PageWriteback(page));
+	assert("nikita-3019", schedulable());
+
+	/* NOTE-NIKITA: this can be called in the context of reiser4 call. It
+	   is not clear what to do in this case. A lot of deadlocks seem to be
+	   possible. */
+
+	node = jnode_by_page(page);
+	assert("nikita-2258", node != NULL);
+	assert("reiser4-4", page->mapping != NULL);
+	assert("reiser4-5", page->mapping->host != NULL);
+
+	INC_STAT(page, node, vm.release.try);
+
+	oid = (void *)(unsigned long)get_inode_oid(page->mapping->host);
+
+	/* is_page_cache_freeable() check
+	   (mapping + private + page_cache_get() by shrink_cache()) */
+	if (page_count(page) > 3)
+		return 0;
+
+	if (PageDirty(page))
+		return 0;
+
+	/* releasable() needs jnode lock, because it looks at the jnode fields
+	 * and we need jload_lock here to avoid races with jload(). */
+	LOCK_JNODE(node);
+	LOCK_JLOAD(node);
+	if (releasable(node)) {
+		struct address_space *mapping;
+
+		mapping = page->mapping;
+		INC_STAT(page, node, vm.release.ok);
+		jref(node);
+		if (jnode_is_znode(node))
+			ON_STATS(znode_at_read(JZNODE(node)));
+		/* there is no need to synchronize against
+		 * jnode_extent_write() here, because pages seen by
+		 * jnode_extent_write() are !releasable(). */
+		page_clear_jnode(page, node);
+		UNLOCK_JLOAD(node);
+		UNLOCK_JNODE(node);
+
+		/* we are under memory pressure so release jnode also. */
+		jput(node);
+
+		write_lock_irq(&mapping->tree_lock);
+		/* shrink_list() + radix-tree */
+		if (page_count(page) == 2) {
+			__remove_from_page_cache(page);
+			__put_page(page);
+		}
+		write_unlock_irq(&mapping->tree_lock);
+
+		return 1;
+	} else {
+		UNLOCK_JLOAD(node);
+		UNLOCK_JNODE(node);
+		assert("nikita-3020", schedulable());
+		return 0;
+	}
+}
+
+#undef INC_NSTAT
+#undef INC_STAT
+
+reiser4_internal void
+move_inode_out_from_sync_inodes_loop(struct address_space * mapping)
+{
+	struct inode *host = mapping->host;
+
+	/* work around infinite loop in pdflush->sync_sb_inodes: ->writepages()
+	 * is supposed to submit io for ->io_pages and to clean this list. */
+	host->dirtied_when = jiffies;
+	spin_lock(&inode_lock);
+	list_move(&host->i_list, &host->i_sb->s_dirty);
+	spin_unlock(&inode_lock);
+}
+
+/* reiser4 ->writepages() address space operation.  It captures anonymous pages
+   and anonymous jnodes.  Anonymous pages are pages which are dirtied via
+   mmapping.  Anonymous jnodes are ones which were created by reiser4_writepage
+ */
+reiser4_internal int
+reiser4_writepages(struct address_space *mapping,
+		   struct writeback_control *wbc)
+{
+	struct inode *host;
+	file_plugin *plug;
+	long nr_captured = 0;
+	int result = 0;
+
+	host = mapping->host;
+	plug = inode_file_plugin(host);
+
+	/* if the file plugin has a ->capture method, let it capture
+	 * anonymous pages and anonymous jnodes of this inode */
+	if (plug != NULL && plug->capture != NULL)
+		result = plug->capture(host, wbc, &nr_captured);
+
+	/* done unconditionally, whatever ->capture returned */
+	move_inode_out_from_sync_inodes_loop(mapping);
+	return result;
+}
+
+/* start actual IO on @page: passes it to block_sync_page(); always returns 0 */
+reiser4_internal int reiser4_start_up_io(struct page *page)
+{
+	block_sync_page(page);
+	return 0;
+}
+
+/*
+ * reiser4 methods for VM: the address space operations table
+ */
+struct address_space_operations reiser4_as_operations = {
+	/* called during memory pressure by kswapd */
+	.writepage = reiser4_writepage,
+	/* called to read page from the storage when page is added into page
+	   cache. This is done by page-fault handler. */
+	.readpage = reiser4_readpage,
+	/* Start IO on page. This is called from wait_on_page_bit() and
+	   lock_page() and its purpose is to actually start io by jabbing
+	   device drivers. */
+	.sync_page = reiser4_start_up_io,
+	/* called from
+	 * reiser4_sync_inodes()->generic_sync_sb_inodes()->...->do_writepages()
+	 *
+	 * captures anonymous pages for given inode
+	 */
+	.writepages = reiser4_writepages,
+	/* marks page dirty. Note that this is never called by reiser4
+	 * directly. Reiser4 uses set_page_dirty_internal(). Reiser4 set page
+	 * dirty is called for pages dirtied through mmap and moves dirty page
+	 * to the special ->moved_list in its mapping. */
+	.set_page_dirty = reiser4_set_page_dirty,
+	/* called during read-ahead */
+	.readpages = reiser4_readpages,
+	.prepare_write = reiser4_prepare_write, /* loop back device driver and generic_file_write() call-back */
+	.commit_write = reiser4_commit_write,  /* loop back device driver and generic_file_write() call-back */
+	/* map logical block number to disk block number. Used by FIBMAP ioctl
+	 * and ..bmap pseudo file. */
+	.bmap = reiser4_bmap,
+	/* called just before page is taken out from address space (on
+	   truncate, umount, or similar).  */
+	.invalidatepage = reiser4_invalidatepage,
+	/* called when VM is about to take page from address space (due to
+	   memory pressure). */
+	.releasepage = reiser4_releasepage,
+	/* not yet implemented */
+	.direct_IO = NULL
+};
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
Index: linux-2.6.8.1-ck/fs/reiser4/block_alloc.c
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/block_alloc.c	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/block_alloc.c	2004-08-22 19:35:33.608654769 +1000
@@ -0,0 +1,1196 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "debug.h"
+#include "dformat.h"
+#include "plugin/plugin.h"
+#include "txnmgr.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree.h"
+#include "super.h"
+#include "lib.h"
+
+#include <linux/types.h>	/* for __u??  */
+#include <linux/fs.h>		/* for struct super_block  */
+#include <linux/spinlock.h>
+
+/* THE REISER4 DISK SPACE RESERVATION SCHEME. */
+
+/* We need to be able to reserve enough disk space to ensure that an atomic
+   operation will have enough disk space to flush (see flush.c and
+   http://namesys.com/v4/v4.html) and commit it once it is started.
+
+   In our design a call for reserving disk space may fail but not an actual
+   block allocation.
+
+   All free blocks, already allocated blocks, and all kinds of reserved blocks
+   are counted in different per-fs block counters.
+
+   A reiser4 super block's set of block counters currently is:
+
+   free -- free blocks,
+   used -- already allocated blocks,
+
+   grabbed -- initially reserved for performing an fs operation, those blocks
+          are taken from free blocks, then grabbed disk space leaks from grabbed
+          blocks counter to other counters like "fake allocated", "flush
+          reserved", "used", the rest of not used grabbed space is returned to
+          free space at the end of fs operation;
+
+   fake allocated -- counts all nodes without real disk block numbers assigned,
+                     we have separate accounting for formatted and unformatted
+                     nodes (for easier debugging);
+
+   flush reserved -- disk space needed for flushing and committing an atom.
+                     Each dirty already allocated block could be written as a
+                     part of atom's overwrite set or as a part of atom's
+                     relocate set.  In both cases one additional block is needed,
+                     it is used as a wandered block if we do overwrite or as a
+		     new location for a relocated block.
+
+   In addition, blocks in some states are counted on per-thread and per-atom
+   basis.  A reiser4 context has a counter of blocks grabbed by this transaction
+   and the sb's grabbed blocks counter is a sum of grabbed blocks counter values
+   of each reiser4 context.  Each reiser4 atom has a counter of "flush reserved"
+   blocks, which are reserved for flush processing and atom commit. */
+
+/* AN EXAMPLE: suppose we insert new item to the reiser4 tree.  We estimate
+   number of blocks to grab for most expensive case of balancing when the leaf
+   node we insert new item to gets split and new leaf node is allocated.
+
+   So, we need to grab blocks for
+
+   1) one block for possible dirtying the node we insert an item to. That block
+      would be used for node relocation at flush time or for allocating of a
+      wandered one, it depends what will be a result (what set, relocate or
+      overwrite the node gets assigned to) of the node processing by the flush
+      algorithm.
+
+   2) one block for either allocating a new node, or dirtying of right or left
+      clean neighbor, only one case may happen.
+
+   VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying of left neighbor, right neighbor, current
+   node, and creation of new node.  have I forgotten something?  email me.
+
+   These grabbed blocks are counted in both reiser4 context "grabbed blocks"
+   counter and in the fs-wide one (both ctx->grabbed_blocks and
+   sbinfo->blocks_grabbed get incremented by 2), sb's free blocks counter is
+   decremented by 2.
+
+   Suppose both blocks were spent: one for dirtying an already allocated clean
+   node (it went from "grabbed" to "flush reserved") and one for allocating a
+   new block (it went from "grabbed" to "fake allocated formatted").
+
+   Inserting of a child pointer to the parent node caused parent node to be
+   split, the balancing code takes care of this by grabbing the necessary space
+   immediately, calling reiser4_grab with BA_RESERVED flag set which means
+   "can use the 5% reserved disk space".
+
+   At this moment insertion completes and grabbed blocks (if they were not used)
+   should be returned to the free space counter.
+
+   However the atom life-cycle is not completed.  The atom had one "flush
+   reserved" block added by our insertion and the new fake allocated node is
+   counted as a "fake allocated formatted" one.  The atom has to be fully
+   processed by flush before commit.  Suppose that the flush moved the first,
+   already allocated node to the atom's overwrite list, the new fake allocated
+   node, obviously, went into the atom relocate set.  The reiser4 flush
+   allocates the new node using one unit from "fake allocated formatted"
+   counter, the log writer uses one from "flush reserved" for wandered block
+   allocation.
+
+   And, it is not the end.  When the wandered block is deallocated after the
+   atom gets fully played (see wander.c for term description), the disk space
+   occupied by it is returned to free blocks. */
+
+/* BLOCK NUMBERS */
+
+/* Any reiser4 node has a block number assigned to it.  We use these numbers for
+   indexing in hash tables, so if a block has not yet been assigned a location
+   on disk we need to give it a temporary fake block number.
+
+   Current implementation of reiser4 uses 64-bit integers for block numbers. We
+   use highest bit in 64-bit block number to distinguish fake and real block
+   numbers. So, only 63 bits may be used for addressing real device
+   blocks. The "fake" block number space is divided into subspaces of fake
+   block numbers for data blocks and for shadow (working) bitmap blocks.
+
+   Fake block numbers for data blocks are generated by a cyclic counter, which
+   gets incremented after each real block allocation. We assume that it is
+   impossible to overflow this counter during one transaction's lifetime. */
+
+/* Reset every byte of the blocknr hint @hint to zero. */
+reiser4_internal void
+blocknr_hint_init(reiser4_blocknr_hint * hint)
+{
+	xmemset(hint, 0, sizeof *hint);
+}
+
+/* Release any resources of a blocknr hint.  Currently a no-op: @hint is unused. */
+reiser4_internal void
+blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG)
+{
+	/* Nothing to free: the current blocknr_hint implementation owns no resources. */
+}
+
+/* Tell whether *@da is a fake block number (see above for an explanation). */
+/* Audited by: green(2002.06.11) */
+reiser4_internal int
+blocknr_is_fake(const reiser4_block_nr * da)
+{
+	/* The masked value cannot be returned as is: the return type is a
+	   (possibly 32bit) int while reiser4_block_nr is at least 64 bits
+	   long, so the high bit (the only bit that can survive the masking)
+	   would be stripped off.  Compare against zero instead. */
+	return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) != 0;
+}
+
+/* Static functions for <reiser4 super block>/<reiser4 context> block counters
+   arithmetic. Mostly, they are isolated so that the same assertions do not
+   have to be coded in several places. */
+static void
+sub_from_ctx_grabbed(reiser4_context *ctx, __u64 count)
+{
+	if (ctx->grabbed_blocks < count)
+		print_clog();	/* reached only on underflow, right before BUG_ON() below fires */
+	BUG_ON(ctx->grabbed_blocks < count);
+	assert("zam-527", ctx->grabbed_blocks >= count);
+	ctx->grabbed_blocks -= count;
+}
+
+
+/* Take @count blocks off the per-fs "grabbed" counter; callers in this file hold the super block spin lock. */
+static void sub_from_sb_grabbed(reiser4_super_info_data *sbinfo, __u64 count)
+{
+	assert("zam-525", sbinfo->blocks_grabbed >= count);
+	sbinfo->blocks_grabbed -= count;
+}
+
+/* Decrease the super block counter of blocks reserved for flush by @count (underflow is asserted against). */
+static void
+sub_from_sb_flush_reserved (reiser4_super_info_data *sbinfo, __u64 count)
+{
+	assert ("vpf-291", sbinfo->blocks_flush_reserved >= count);
+	sbinfo->blocks_flush_reserved -= count;
+}
+
+/* Decrease by @count the formatted (BA_FORMATTED set in @flags) or the unformatted "fake allocated" counter. */
+static void sub_from_sb_fake_allocated(reiser4_super_info_data *sbinfo, __u64 count, reiser4_ba_flags_t flags)
+{
+	if (!(flags & BA_FORMATTED)) {
+		assert("zam-528", sbinfo->blocks_fake_allocated_unformatted >= count);
+		sbinfo->blocks_fake_allocated_unformatted -= count;
+	} else {
+		assert("zam-806", sbinfo->blocks_fake_allocated >= count);
+		sbinfo->blocks_fake_allocated -= count;
+	}
+}
+
+/* Decrease the "used" counter by @count; asserts that it stays at or above min_blocks_used. */
+static void sub_from_sb_used(reiser4_super_info_data *sbinfo, __u64 count)
+{
+	assert("zam-530", sbinfo->blocks_used >= count + sbinfo->min_blocks_used);
+	sbinfo->blocks_used -= count;
+}
+
+/* Decrease the "clustered" counter of the super block by @count (underflow is asserted against). */
+static void sub_from_cluster_reserved(reiser4_super_info_data *sbinfo, __u64 count)
+{
+	assert("edward-501", sbinfo->blocks_clustered >= count);
+	sbinfo->blocks_clustered -= count;
+}
+
+/* Increase the counter of blocks reserved for flush in @atom; the caller holds @atom's spin lock (asserted). */
+static void
+add_to_atom_flush_reserved_nolock (txn_atom * atom, __u32 count)
+{
+	assert ("zam-772", atom != NULL);
+	assert ("zam-773", spin_atom_is_locked (atom));
+	atom->flush_reserved += count;
+}
+
+/* Decrease the counter of blocks reserved for flush in @atom; the caller holds @atom's spin lock (asserted). */
+static void
+sub_from_atom_flush_reserved_nolock (txn_atom * atom, __u32 count)
+{
+	assert ("zam-774", atom != NULL);
+	assert ("zam-775", spin_atom_is_locked (atom));
+	assert ("nikita-2790", atom->flush_reserved >= count);	/* no underflow */
+	atom->flush_reserved -= count;
+}
+
+/* The super block keeps seven block counters: free, used, grabbed, fake
+   allocated (formatted and unformatted), flush reserved and clustered.  Their
+   sum must equal the number of blocks on the device; returns 1 if so, else 0 */
+reiser4_internal int
+check_block_counters(const struct super_block *super)
+{
+	__u64 sum;
+
+	sum = reiser4_grabbed_blocks(super) + reiser4_free_blocks(super) +
+		reiser4_data_blocks(super) + reiser4_fake_allocated(super) +
+		reiser4_fake_allocated_unformatted(super) + flush_reserved(super) +
+		reiser4_clustered_blocks(super);
+	if (reiser4_block_count(super) != sum) {
+		printk("super block counters: "
+		       "used %llu, free %llu, "
+		       "grabbed %llu, fake allocated (formatted %llu, unformatted %llu), "
+		       "reserved %llu, clustered %llu, sum %llu, must be (block count) %llu\n",
+		       reiser4_data_blocks(super),
+		       reiser4_free_blocks(super),
+		       reiser4_grabbed_blocks(super),
+		       reiser4_fake_allocated(super),
+		       reiser4_fake_allocated_unformatted(super),
+		       flush_reserved(super),
+		       reiser4_clustered_blocks(super),
+		       sum, reiser4_block_count(super));
+		return 0;
+	}
+	return 1;
+}
+
+#if REISER4_DEBUG_OUTPUT
+/* Debugging aid: print per-fs (G grabbed, F free, D data, U fake allocated formatted + unformatted, R flush
+   reserved, C clustered, T block count), per-context and per-atom counters.  A non-NULL atom is unlocked here. */
+reiser4_internal void print_block_counters(const char *prefix, const struct super_block *super, txn_atom *atom)
+{
+	if (!super)
+		super = reiser4_get_current_sb();
+	printk("%s:\tsuper: G: %llu, F: %llu, D: %llu, U: %llu + %llu, R: %llu, C: %llu, T: %llu\n",
+	       prefix,
+	       reiser4_grabbed_blocks(super),
+	       reiser4_free_blocks(super),
+	       reiser4_data_blocks(super),
+	       reiser4_fake_allocated(super),
+	       reiser4_fake_allocated_unformatted(super),
+	       flush_reserved(super),
+	       reiser4_clustered_blocks(super),
+	       reiser4_block_count(super));
+	printk("\tcontext: G: %llu",
+	       get_current_context()->grabbed_blocks);
+	if (!atom)
+		atom = get_current_atom_locked_nocheck();
+	if (atom) {
+		printk("\tatom: R: %llu", atom->flush_reserved);
+		UNLOCK_ATOM(atom);
+	}
+	printk("\n");
+}
+#endif
+
+/* Adjust "working" free blocks counter for number of blocks we are going to
+   allocate.  Record number of grabbed blocks in fs-wide and per-thread
+   counters.  This function should be called before bitmap scanning or
+   allocating fake block numbers
+
+   @ctx             -- current reiser4 context (ctx->super is the super block);
+   @count           -- number of blocks we reserve;
+   @flags           -- BA_RESERVED allows dipping into sbinfo->blocks_reserved;
+   @return          -- 0 if success,  -ENOSPC, if all
+                       free blocks are preserved or already allocated.
+*/
+
+static int
+reiser4_grab(reiser4_context *ctx, __u64 count, reiser4_ba_flags_t flags)
+{
+	__u64 free_blocks;
+	int ret = 0, use_reserved = flags & BA_RESERVED;
+	reiser4_super_info_data *sbinfo;
+
+	assert("vs-1276", ctx == get_current_context());
+
+	sbinfo = get_super_private(ctx->super);
+
+	reiser4_spin_lock_sb(sbinfo);
+
+	free_blocks = sbinfo->blocks_free;
+
+	ON_TRACE(TRACE_ALLOC, "reiser4_grab: free_blocks %llu\n", free_blocks);
+
+	if ((use_reserved && free_blocks < count) ||
+	    (!use_reserved && free_blocks < count + sbinfo->blocks_reserved)) {
+		ret = RETERR(-ENOSPC);
+
+		ON_TRACE(TRACE_ALLOC, "reiser4_grab: ENOSPC: count %llu\n", count);
+
+		goto unlock_and_ret;	/* counters and grab_enabled are left untouched */
+	}
+
+	ctx->grabbed_blocks += count;
+
+	sbinfo->blocks_grabbed += count;
+	sbinfo->blocks_free -= count;
+
+#if REISER4_DEBUG
+	ctx->grabbed_initially = count;
+	fill_backtrace(&ctx->grabbed_at, REISER4_BACKTRACE_DEPTH, 0);
+#endif
+
+	assert("nikita-2986", check_block_counters(ctx->super));
+
+	ON_TRACE(TRACE_ALLOC, "%s: grabbed %llu, free blocks left %llu\n",
+		 __FUNCTION__, count, reiser4_free_blocks (ctx->super));
+
+	/* disable grab space in current context */
+	ctx->grab_enabled = 0;
+
+unlock_and_ret:
+	reiser4_spin_unlock_sb(sbinfo);
+
+	return ret;
+}
+
+/* Grab @count blocks for the current context.  Does nothing and returns 0 if
+   grabbing is disabled in the context and BA_FORCE is not set.  On -ENOSPC
+   with BA_CAN_COMMIT, forces commit of all transactions and retries once. */
+reiser4_internal int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags)
+{
+	reiser4_context *ctx;
+	int ret;
+
+	assert("nikita-2964", ergo(flags & BA_CAN_COMMIT,
+				   lock_stack_isclean(get_current_lock_stack())));
+	ON_TRACE(TRACE_RESERVE, "grab_space: %llu block(s).", count);
+
+	ctx = get_current_context();
+	if (!(flags & BA_FORCE) && !is_grab_enabled(ctx)) {
+		ON_TRACE(TRACE_RESERVE, "grab disabled and not forced!\n");
+		return 0;
+	}
+
+	ret = reiser4_grab(ctx, count, flags);
+	if (ret == -ENOSPC && (flags & BA_CAN_COMMIT)) {
+		/* no space: commit all transactions, enable grabbing in
+		 * this context again and retry once */
+		ON_TRACE(TRACE_RESERVE, "force commit!..");
+
+		txnmgr_force_commit_all(ctx->super, 0);
+
+		ctx->grab_enabled = 1;
+		ret = reiser4_grab(ctx, count, flags);
+	}
+
+	ON_TRACE(TRACE_RESERVE, "%s(%d)\n", (ret == 0) ? "ok" : "failed", ret);
+	/*
+	 * allocation from reserved pool cannot fail. This is severe error.
+	 */
+	assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0));
+	return ret;
+}
+
+/*
+ * SPACE RESERVED FOR UNLINK/TRUNCATE
+ *
+ * Unlink and truncate require space in transaction (to update stat data, at
+ * least). But we don't want rm(1) to fail with "No space on device" error.
+ *
+ * Solution is to reserve 5% of disk space for truncates and
+ * unlinks. Specifically, normal space grabbing requests don't grab space from
+ * reserved area. Only requests with BA_RESERVED bit in flags are allowed to
+ * drain it. Per super block delete_sema semaphore is used to allow only one
+ * thread at a time to grab from reserved area.
+ *
+ * Grabbing from reserved area should always be performed with BA_CAN_COMMIT
+ * flag.
+ * Returns 0 if space was grabbed, -ENOSPC (via RETERR) otherwise.
+ */
+
+reiser4_internal int reiser4_grab_reserved(struct super_block *super,
+					   __u64 count, reiser4_ba_flags_t flags)
+{
+	reiser4_super_info_data *sbinfo = get_super_private(super);
+
+	assert("nikita-3175", flags & BA_CAN_COMMIT);
+
+	/* Check the delete semaphore already taken by us, we assume that
+	 * reading of machine word is atomic. */
+	if (sbinfo->delete_sema_owner == current) {
+		if (reiser4_grab_space(count, (flags | BA_RESERVED) & ~BA_CAN_COMMIT)) {
+			warning("zam-1003", "nested call of grab_reserved fails count=(%llu)",
+				(unsigned long long)count);
+			reiser4_release_reserved(super);	/* drops delete_sema owned by this task */
+			return RETERR(-ENOSPC);
+		}
+		return 0;
+	}
+
+	if (reiser4_grab_space(count, flags)) {	/* ordinary grab failed: fall back to the reserved area */
+		down(&sbinfo->delete_sema);
+		assert("nikita-2929", sbinfo->delete_sema_owner == NULL);
+		sbinfo->delete_sema_owner = current;
+
+		if (reiser4_grab_space(count, flags | BA_RESERVED)) {
+			warning("zam-833",
+				"reserved space is not enough (%llu)", (unsigned long long)count);
+			reiser4_release_reserved(super);
+			return RETERR(-ENOSPC);
+		}
+	}
+	return 0;	/* delete_sema stays held here iff the reserved area was used */
+}
+
+/* Drop the delete_sema of @super if, and only if, the current task is recorded as its owner. */
+reiser4_internal void reiser4_release_reserved(struct super_block *super)
+{
+	reiser4_super_info_data *sbinfo = get_super_private(super);
+
+	if (sbinfo->delete_sema_owner != current)
+		return;
+
+	sbinfo->delete_sema_owner = NULL;
+	up(&sbinfo->delete_sema);
+}
+
+/* Common first half of grabbed2fake_allocated_{formatted,unformatted}(): take
+   one block off the per-context and per-fs "grabbed" counters. */
+static reiser4_super_info_data *grabbed2fake_allocated_head(void)
+{
+	reiser4_context *ctx = get_current_context();
+	reiser4_super_info_data *sbinfo;
+
+	sub_from_ctx_grabbed(ctx, 1);
+
+	sbinfo = get_super_private(ctx->super);
+	reiser4_spin_lock_sb(sbinfo);
+	sub_from_sb_grabbed(sbinfo, 1);
+
+	/* sbinfo is returned locked: the caller must reiser4_spin_unlock_sb() it */
+	return sbinfo;
+}
+
+/* is called after a fake block number has been assigned to a formatted node:
+   moves one block from "grabbed" to "fake allocated" (formatted). */
+static void
+grabbed2fake_allocated_formatted(void)
+{
+	reiser4_super_info_data *sbinfo;
+
+	sbinfo = grabbed2fake_allocated_head();	/* comes back with the sb spin lock held */
+	sbinfo->blocks_fake_allocated ++;
+
+	assert("vs-922", check_block_counters(reiser4_get_current_sb()));
+
+	reiser4_spin_unlock_sb(sbinfo);
+}
+
+/* Move one block from "grabbed" to "fake allocated unformatted". */
+static void grabbed2fake_allocated_unformatted(void)
+{
+	reiser4_super_info_data *sbinfo = grabbed2fake_allocated_head();
+
+	/* sbinfo comes back locked from grabbed2fake_allocated_head() */
+	sbinfo->blocks_fake_allocated_unformatted += 1;
+
+	assert("vs-9221", check_block_counters(reiser4_get_current_sb()));
+
+	reiser4_spin_unlock_sb(sbinfo);
+}
+
+/* Move @count blocks from "grabbed" (per-context and per-fs) to "clustered". */
+reiser4_internal void grabbed2cluster_reserved(int count)
+{
+	reiser4_context *ctx = get_current_context();
+	reiser4_super_info_data *sbinfo;
+
+	/* per-context counter first, before the super block lock is taken */
+	sub_from_ctx_grabbed(ctx, count);
+
+	sbinfo = get_super_private(ctx->super);
+	reiser4_spin_lock_sb(sbinfo);
+
+	sub_from_sb_grabbed(sbinfo, count);
+	sbinfo->blocks_clustered += count;
+
+	assert("edward-504", check_block_counters(ctx->super));
+
+	reiser4_spin_unlock_sb(sbinfo);
+}
+
+/* Move @count blocks from "clustered" back to "grabbed", both in the super
+   block and in the current context. */
+reiser4_internal void cluster_reserved2grabbed(int count)
+{
+	reiser4_context *ctx = get_current_context();
+	reiser4_super_info_data *sbinfo = get_super_private(ctx->super);
+
+	reiser4_spin_lock_sb(sbinfo);
+
+	sub_from_cluster_reserved(sbinfo, count);
+	sbinfo->blocks_grabbed += count;
+
+	assert("edward-505", check_block_counters(ctx->super));
+
+	reiser4_spin_unlock_sb(sbinfo);
+
+	/* the per-context counter is bumped after the lock is dropped */
+	ctx->grabbed_blocks += count;
+}
+
+/* Return @count "clustered" blocks to free space.  The current context is
+   expected (asserted) to hold no grabbed blocks at this point. */
+reiser4_internal void cluster_reserved2free(int count)
+{
+	reiser4_context *ctx = get_current_context();
+	reiser4_super_info_data *sbinfo;
+
+	assert("edward-503", ctx->grabbed_blocks == 0);
+
+	sbinfo = get_super_private(ctx->super);
+	reiser4_spin_lock_sb(sbinfo);
+
+	sub_from_cluster_reserved(sbinfo, count);
+	sbinfo->blocks_free += count;
+
+	assert("edward-502", check_block_counters(ctx->super));
+
+	reiser4_spin_unlock_sb(sbinfo);
+}
+
+static spinlock_t fake_lock = SPIN_LOCK_UNLOCKED;	/* held around updates of fake_gen */
+static reiser4_block_nr fake_gen = 0;			/* source of fake block numbers */
+
+/* obtain a fake block number which will be used to refer to a newly created
+   node (formatted or unformatted) until real allocation is done */
+static inline void assign_fake_blocknr(reiser4_block_nr *blocknr)
+{
+	spin_lock(&fake_lock);
+	*blocknr = fake_gen++;
+	spin_unlock(&fake_lock);
+
+	*blocknr &= ~REISER4_BLOCKNR_STATUS_BIT_MASK;	/* drop whatever the counter put into the status bits */
+	*blocknr |= REISER4_UNALLOCATED_STATUS_VALUE;	/* presumably tags the number as fake/unallocated */
+	assert("zam-394", zlook(current_tree, blocknr) == NULL);
+}
+
+/* Store a fresh fake block number in *@blocknr and move one grabbed block to "fake allocated"; returns 0. */
+reiser4_internal int assign_fake_blocknr_formatted(reiser4_block_nr *blocknr)
+{
+	ON_TRACE(TRACE_RESERVE, "assign_fake_blocknr_formatted: moving 1 grabbed block to fake allocated formatted\n");
+
+	assign_fake_blocknr(blocknr);
+	grabbed2fake_allocated_formatted();
+
+	return 0;
+}
+
+/* return fake blocknr which will be used for unformatted nodes; one grabbed block becomes fake allocated unformatted */
+reiser4_internal reiser4_block_nr
+fake_blocknr_unformatted(void)
+{
+	reiser4_block_nr blocknr;
+
+	ON_TRACE(TRACE_RESERVE, "fake_blocknr_unformatted: moving 1 grabbed block to fake allocated unformatted\n");
+
+	assign_fake_blocknr(&blocknr);
+	grabbed2fake_allocated_unformatted();
+
+	/*XXXXX*/inc_unalloc_unfm_ptr();
+	return blocknr;
+}
+
+
+/* adjust sb block counters, if real (on-disk) block allocation immediately
+   follows grabbing of free disk space: @count blocks go "grabbed" -> "used". */
+static void
+grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo, __u64 count)
+{
+	sub_from_ctx_grabbed(ctx, count);
+
+	reiser4_spin_lock_sb(sbinfo);
+
+	sub_from_sb_grabbed(sbinfo, count);
+	sbinfo->blocks_used += count;
+
+	assert("nikita-2679", check_block_counters(ctx->super));
+
+	reiser4_spin_unlock_sb(sbinfo);
+}
+
+/* adjust sb block counters when @count unallocated blocks get mapped to disk; BA_FORMATTED in @flags picks the counter */
+static void
+fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count, reiser4_ba_flags_t flags)
+{
+	reiser4_spin_lock_sb(sbinfo);
+
+	sub_from_sb_fake_allocated(sbinfo, count, flags);
+	sbinfo->blocks_used += count;
+
+	assert("nikita-2680", check_block_counters(reiser4_get_current_sb()));
+
+	reiser4_spin_unlock_sb(sbinfo);
+}
+
+/* Move @count blocks from "flush reserved" (in the locked @atom and in the super block) to "used". */
+static void flush_reserved2used(txn_atom * atom, __u64 count)
+{
+	reiser4_super_info_data *sbinfo;
+
+	assert("zam-787", atom != NULL);
+	assert("zam-788", spin_atom_is_locked(atom));
+
+	sub_from_atom_flush_reserved_nolock(atom, (__u32)count); /* NOTE(review): @count is narrowed to 32 bits */
+
+	sbinfo = get_current_super_private();
+	reiser4_spin_lock_sb(sbinfo);
+
+	sub_from_sb_flush_reserved(sbinfo, count);
+	sbinfo->blocks_used += count;
+
+	assert("zam-789", check_block_counters(reiser4_get_current_sb()));
+
+	reiser4_spin_unlock_sb(sbinfo);
+}
+
+/* update the per fs blocknr hint default value; a @block at or beyond the fs block count is refused with a warning. */
+reiser4_internal void
+update_blocknr_hint_default(const struct super_block *s, const reiser4_block_nr * block)
+{
+	reiser4_super_info_data *sbinfo = get_super_private(s);
+
+	assert("nikita-3342", !blocknr_is_fake(block));
+
+	reiser4_spin_lock_sb(sbinfo);
+	if (*block >= sbinfo->block_count) {
+		warning("zam-676",
+			"block number %llu is too large to be used in a blocknr hint\n", (unsigned long long) *block);
+		dump_stack();
+		DEBUGON(1);
+	} else {
+		sbinfo->blocknr_hint_default = *block;
+	}
+	reiser4_spin_unlock_sb(sbinfo);
+}
+
+/* get current value of the default blocknr hint of the current super block; it is stored into *@result. */
+reiser4_internal void get_blocknr_hint_default(reiser4_block_nr * result)
+{
+	reiser4_super_info_data * sbinfo = get_current_super_private();
+
+	reiser4_spin_lock_sb(sbinfo);
+	*result = sbinfo->blocknr_hint_default;
+	assert("zam-677", *result < sbinfo->block_count);
+	reiser4_spin_unlock_sb(sbinfo);
+}
+
+/* Allocate "real" disk blocks by calling a proper space allocation plugin
+ * method. Blocks are allocated in one contiguous disk region. The plugin
+ * independent part accounts blocks by subtracting allocated amount from grabbed
+ * or fake block counter and adding the same amount to the counter of allocated
+ * blocks.
+ *
+ * @hint -- a reiser4 blocknr hint object which contains further block
+ *          allocation hints and parameters (search start, a stage of block
+ *          which will be mapped to disk, etc.),
+ * @blk  -- an out parameter for the beginning of the allocated region,
+ * @len  -- in/out parameter, it should contain the maximum number of allocated
+ *          blocks, after block allocation completes, it contains the length of
+ *          allocated disk region.
+ * @flags -- see reiser4_ba_flags_t description.
+ *
+ * @return -- 0 if success, error code otherwise.
+ */
+reiser4_internal int
+reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk,
+		     reiser4_block_nr * len, reiser4_ba_flags_t flags)
+{
+	__u64 needed = *len;	/* requested length, remembered because *len is an in/out parameter */
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+	int ret;
+
+	assert ("zam-986", hint != NULL);
+
+	ctx = get_current_context();
+	sbinfo = get_super_private(ctx->super);
+
+	ON_TRACE(TRACE_RESERVE, "reiser4_alloc_blocks: needed %llu..", needed);
+
+	assert("vpf-339", hint != NULL);	/* NOTE(review): same check as "zam-986" above */
+
+	ON_TRACE(TRACE_ALLOC,
+		 "alloc_blocks: requested %llu, search from %llu\n",
+		 (unsigned long long) *len, (unsigned long long) (hint ? hint->blk : ~0ull));
+
+	/* For write-optimized data we use default search start value, which is
+	 * close to last write location. */
+	if (flags & BA_USE_DEFAULT_SEARCH_START) {
+		reiser4_stat_inc(block_alloc.nohint);
+		get_blocknr_hint_default(&hint->blk);
+	}
+
+	/* VITALY: allocator should grab this for internal/tx-lists/similar only. */
+/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)? */
+	if (hint->block_stage == BLOCK_NOT_COUNTED) {
+		ret = reiser4_grab_space_force(*len, flags);
+		if (ret != 0)
+			return ret;
+	}
+	/* NOTE(review): @needed (__u64) is narrowed to int for the allocator call below */
+	ret = sa_alloc_blocks(get_space_allocator(ctx->super), hint, (int) needed, blk, len);
+
+	if (!ret) {
+		assert("zam-680", *blk < reiser4_block_count(ctx->super));
+		assert("zam-681", *blk + *len <= reiser4_block_count(ctx->super));
+
+		if (flags & BA_PERMANENT) {
+			/* we assume that current atom exists at this moment */
+			txn_atom * atom = get_current_atom_locked ();
+			atom -> nr_blocks_allocated += *len;
+			UNLOCK_ATOM (atom);
+		}
+
+		/* move the *len allocated blocks from their previous stage to "used" */
+		switch (hint->block_stage) {
+		case BLOCK_NOT_COUNTED:
+		case BLOCK_GRABBED:
+			ON_TRACE(TRACE_RESERVE, "ok. %llu blocks grabbed to used.\n", *len);
+			grabbed2used(ctx, sbinfo, *len);
+			break;
+		case BLOCK_UNALLOCATED:
+			ON_TRACE(TRACE_RESERVE, "ok. %llu blocks fake allocated to used.\n", *len);
+			fake_allocated2used(sbinfo, *len, flags);
+			break;
+		case BLOCK_FLUSH_RESERVED:
+			ON_TRACE(TRACE_RESERVE, "ok. %llu flush reserved to used (get wandered?)\n", *len);
+			{
+				txn_atom * atom = get_current_atom_locked ();
+				flush_reserved2used(atom, *len);
+				UNLOCK_ATOM (atom);
+			}
+			break;
+		default:
+			impossible("zam-531", "wrong block stage");
+		}
+	} else {
+		assert ("zam-821", ergo(hint->max_dist == 0 && !hint->backward, ret != -ENOSPC));
+		if (hint->block_stage == BLOCK_NOT_COUNTED)
+			grabbed2free(ctx, sbinfo, needed);	/* give back what was force-grabbed above */
+	}
+
+	return ret;
+}
+
+/* used -> fake_allocated -> grabbed -> free */
+
+/* adjust sb block counters when @count unallocated blocks get unmapped from
+   disk: "used" -> "fake allocated" (formatted if @formatted is non-zero) */
+static void
+used2fake_allocated(reiser4_super_info_data *sbinfo, __u64 count, int formatted)
+{
+	reiser4_spin_lock_sb(sbinfo);
+
+	if (formatted)
+		sbinfo->blocks_fake_allocated += count;
+	else
+		sbinfo->blocks_fake_allocated_unformatted += count;
+
+	sub_from_sb_used(sbinfo, count);
+
+	assert("nikita-2681", check_block_counters(reiser4_get_current_sb()));
+
+	reiser4_spin_unlock_sb(sbinfo);
+}
+
+/* Move @count blocks from "used" to "flush reserved", in the locked @atom and in the super block; @flags is unused. */
+static void used2flush_reserved(reiser4_super_info_data *sbinfo, txn_atom * atom,
+				 __u64 count, reiser4_ba_flags_t flags UNUSED_ARG)
+{
+	assert("nikita-2791", atom != NULL);
+	assert("nikita-2792", spin_atom_is_locked(atom));
+
+	add_to_atom_flush_reserved_nolock(atom, (__u32)count);
+
+	reiser4_spin_lock_sb(sbinfo);
+
+	sbinfo->blocks_flush_reserved += count;
+	sub_from_sb_used(sbinfo, count);
+
+	/* NOTE(review): label "nikita-2681" is also used by used2fake_allocated() */
+	assert("nikita-2681", check_block_counters(reiser4_get_current_sb()));
+
+	reiser4_spin_unlock_sb(sbinfo);
+}
+
+/* disk space virtually used by @count fake block numbers is counted as "grabbed" again (per-context and per-fs). */
+static void
+fake_allocated2grabbed(reiser4_context *ctx, reiser4_super_info_data *sbinfo, __u64 count, reiser4_ba_flags_t flags)
+{
+	ctx->grabbed_blocks += count;
+
+	reiser4_spin_lock_sb(sbinfo);
+
+	assert("nikita-2682", check_block_counters(ctx->super));
+
+	sbinfo->blocks_grabbed += count;
+	sub_from_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED);
+
+	assert("nikita-2683", check_block_counters(ctx->super));
+
+	reiser4_spin_unlock_sb(sbinfo);
+}
+
+/* Return @count fake allocated blocks (formatted if BA_FORMATTED is set in
+   @flags, unformatted otherwise) to free space. */
+reiser4_internal void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags)
+{
+	reiser4_context *ctx = get_current_context();
+	reiser4_super_info_data *sbinfo = get_super_private(ctx->super);
+
+	ON_TRACE(TRACE_RESERVE, "fake_allocated2free %llu blocks\n", count);
+
+	/* done in two steps: fake allocated -> grabbed, and then
+	 * grabbed -> free */
+	fake_allocated2grabbed(ctx, sbinfo, count, flags);
+	grabbed2free(ctx, sbinfo, count);
+}
+
+reiser4_internal void
+grabbed2free_mark(__u64 mark)
+{
+	reiser4_context *ctx = get_current_context();
+	reiser4_super_info_data *sb_private = get_super_private(ctx->super);
+
+	/* presumably @mark is a value of ->grabbed_blocks saved earlier by the
+	   caller (TODO confirm); it may not exceed the current value */
+	assert("nikita-3007", (__s64)mark >= 0);
+	assert("nikita-3006", ctx->grabbed_blocks >= mark);
+
+	/* release everything this context has grabbed above @mark */
+	grabbed2free(ctx, sb_private, ctx->grabbed_blocks - mark);
+}
+
+/* grabbed -> free: return @count blocks grabbed by @ctx, but not used, to the free blocks counter. */
+reiser4_internal void
+grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
+	       __u64 count)
+{
+	ON_TRACE(TRACE_RESERVE, "grabbed2free: %llu\n", count);
+	/* per-context counter of grabbed blocks */
+	sub_from_ctx_grabbed(ctx, count);
+
+	/* super block counters, changed with the sb spin lock held */
+	reiser4_spin_lock_sb(sbinfo);
+
+	sub_from_sb_grabbed(sbinfo, count);
+	sbinfo->blocks_free += count;
+	assert("nikita-2684", check_block_counters(ctx->super));
+
+	reiser4_spin_unlock_sb(sbinfo);
+}
+
+reiser4_internal void
+grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count)
+{
+	reiser4_super_info_data *sb_private;
+	reiser4_context *ctx;
+
+	assert("vs-1095", atom);
+
+	ctx = get_current_context();
+	sb_private = get_super_private(ctx->super);
+
+	/* move @count blocks from the context's grabbed counter to the atom's
+	   flush reserved counter; no atom locking is done in this function */
+	sub_from_ctx_grabbed(ctx, count);
+	add_to_atom_flush_reserved_nolock(atom, count);
+
+	/* mirror the move in the sb counters, with the sb spin lock held */
+	reiser4_spin_lock_sb(sb_private);
+	sb_private->blocks_flush_reserved += count;
+	sub_from_sb_grabbed(sb_private, count);
+
+	assert("vpf-292", check_block_counters(ctx->super));
+
+	ON_TRACE(TRACE_RESERVE, "__grabbed2flush_reserved_nolock %llu blocks: atom %u has %llu flush reserved blocks\n",
+		 count, atom->atom_id, atom->flush_reserved);
+	reiser4_spin_unlock_sb(sb_private);
+}
+
+reiser4_internal void
+grabbed2flush_reserved(__u64 count)
+{
+	txn_atom *atom;
+
+	/* lock the current atom around the _nolock variant */
+	atom = get_current_atom_locked();
+	ON_TRACE(TRACE_RESERVE, "__grabbed2flush_reserved\n");
+	grabbed2flush_reserved_nolock(atom, count);
+	UNLOCK_ATOM(atom);
+}
+
+reiser4_internal void flush_reserved2grabbed(txn_atom * atom, __u64 count)
+{
+	reiser4_super_info_data *sb_private;
+	reiser4_context *ctx;
+
+	/* the caller passes a locked atom */
+	assert("nikita-2788", atom != NULL);
+	assert("nikita-2789", spin_atom_is_locked(atom));
+
+	ctx = get_current_context();
+	sb_private = get_super_private(ctx->super);
+
+	/* atom's flush reserved counter -> context's grabbed counter */
+	ctx->grabbed_blocks += count;
+	sub_from_atom_flush_reserved_nolock(atom, (__u32)count);
+
+	/* the same move in the sb counters, under the sb spin lock */
+	reiser4_spin_lock_sb(sb_private);
+	sb_private->blocks_grabbed += count;
+	sub_from_sb_flush_reserved(sb_private, count);
+
+	assert("vpf-292", check_block_counters (ctx->super));
+	reiser4_spin_unlock_sb(sb_private);
+}
+
+/* release all blocks grabbed in context which were not used. */
+reiser4_internal void all_grabbed2free(void)
+{
+	reiser4_context *context = get_current_context();
+	reiser4_super_info_data *sb_private = get_super_private(context->super);
+
+	grabbed2free(context, sb_private, context->grabbed_blocks);
+}
+
+/* used -> grabbed: @count freed blocks stop being counted as "used" and are
+   counted as "grabbed" instead, both in @ctx and in the sb counters. */
+static void
+used2grabbed(reiser4_context *ctx, reiser4_super_info_data *sbinfo, __u64 count)
+{
+	ctx->grabbed_blocks += count;
+	/* sb counters are changed with the sb spin lock held */
+	reiser4_spin_lock_sb(sbinfo);
+
+	sbinfo->blocks_grabbed += count;
+	sub_from_sb_used(sbinfo, count);
+
+	assert("nikita-2685", check_block_counters(ctx->super));
+
+	reiser4_spin_unlock_sb(sbinfo);
+}
+
+/* used -> free: move @count blocks between the sb counters; this used to be done through used2grabbed and grabbed2free */
+static void
+used2free(reiser4_super_info_data *sbinfo, __u64 count)
+{
+	reiser4_spin_lock_sb(sbinfo);
+	/* only sb counters change here, no per-context counter is touched */
+	sbinfo->blocks_free += count;
+	sub_from_sb_used(sbinfo, count);
+
+	assert("nikita-2685", check_block_counters(reiser4_get_current_sb()));
+
+	reiser4_spin_unlock_sb(sbinfo);
+}
+
+#if REISER4_DEBUG
+
+/* check "allocated" state of given block range: forwards @start, @len and @desired to sa_check_blocks() */
+void
+reiser4_check_blocks(const reiser4_block_nr * start, const reiser4_block_nr * len, int desired)
+{
+	sa_check_blocks(start, len, desired);
+}
+
+/* check "allocated" state of a single block, i.e. of a range of length one */
+void
+reiser4_check_block(const reiser4_block_nr * block, int desired)
+{
+	const reiser4_block_nr range_len = 1;
+
+	reiser4_check_blocks(block, &range_len, desired);
+}
+
+#endif
+
+/* The block deallocation function either performs the actual deallocation
+   through the space allocator plugin, or stores the deleted block numbers in
+   the atom's delete_set data structure, depending on the BA_DEFER bit in @flags. */
+
+/* if BA_DEFER bit is not turned on, @target_stage means the stage of blocks which
+   will be deleted from WORKING bitmap. They might be just unmapped from disk, or
+   freed but disk space is still grabbed by current thread, or these blocks must
+   not be counted in any reiser4 sb block counters, see block_stage_t comment */
+
+/* BA_FORMATTED bit is only used when BA_DEFER is not present: it is used to
+   distinguish blocks allocated for unformatted and formatted nodes */
+
+reiser4_internal int
+reiser4_dealloc_blocks(const reiser4_block_nr * start,
+		       const reiser4_block_nr * len,
+		       block_stage_t target_stage, reiser4_ba_flags_t flags)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+	txn_atom *atom = NULL;
+	int ret;
+
+	ON_TRACE(TRACE_RESERVE, "reiser4_dealloc_blocks: %llu blocks", *len);
+
+	ctx = get_current_context();
+	sbinfo = get_super_private(ctx->super);
+
+	/* sanity checks: a non-empty extent of real (not fake) block numbers
+	   which starts below the block count of the file system */
+	if (REISER4_DEBUG) {
+		assert("zam-431", *len != 0);
+		assert("zam-432", *start != 0);
+		assert("zam-558", !blocknr_is_fake(start));
+
+		reiser4_spin_lock_sb(sbinfo);
+		assert("zam-562", *start < sbinfo->block_count);
+		reiser4_spin_unlock_sb(sbinfo);
+	}
+
+	if (flags & BA_DEFER) {
+		blocknr_set_entry *bsep = NULL;
+
+		ON_TRACE(TRACE_RESERVE, "put on delete set\n");
+
+		/* deferred mode: only remember the extent in the delete set of
+		   the current atom; the space allocator and the sb counters
+		   are not touched here.  blocknr_set_add_extent() drops the
+		   atom lock before asking for a retry or failing with -ENOMEM,
+		   hence the atom is re-locked on each iteration.  This loop
+		   might spin at most two times */
+		for (;;) {
+			atom = get_current_atom_locked();
+			assert("zam-430", atom != NULL);
+
+			ret = blocknr_set_add_extent(atom, &atom->delete_set, &bsep, start, len);
+			if (ret == -ENOMEM)
+				return ret;
+			if (ret != -E_REPEAT)
+				break;
+		}
+
+		assert("zam-477", ret == 0);
+		assert("zam-433", atom != NULL);
+
+		UNLOCK_ATOM(atom);
+		return 0;
+	}
+
+	/* immediate mode: free the extent in the space allocator ... */
+	assert("zam-425", get_current_super_private() != NULL);
+	sa_dealloc_blocks(get_space_allocator(ctx->super), *start, *len);
+
+	if (flags & BA_PERMANENT) {
+		/* These blocks were counted as allocated, we have to revert it
+		 * back if allocation is discarded. */
+		atom = get_current_atom_locked();
+		atom->nr_blocks_allocated -= *len;
+		UNLOCK_ATOM(atom);
+	}
+
+	/* ... and move it out of the "used" sb counter into the counter
+	   selected by @target_stage */
+	switch (target_stage) {
+	case BLOCK_NOT_COUNTED:
+		assert("vs-960", flags & BA_FORMATTED);
+		ON_TRACE(TRACE_RESERVE, "moved from used to free\n");
+		/* VITALY: This is what was grabbed for internal/tx-lists/similar only */
+		used2free(sbinfo, *len);
+		break;
+
+	case BLOCK_GRABBED:
+		ON_TRACE(TRACE_RESERVE, "moved from used to grabbed\n");
+		used2grabbed(ctx, sbinfo, *len);
+		break;
+
+	case BLOCK_UNALLOCATED:
+		ON_TRACE(TRACE_RESERVE, "moved from used to fake allocated\n");
+		used2fake_allocated(sbinfo, *len, flags & BA_FORMATTED);
+		break;
+
+	case BLOCK_FLUSH_RESERVED:
+		ON_TRACE(TRACE_RESERVE, "moved from used to flush reserved\n");
+		/* used2flush_reserved() asserts that the atom is locked */
+		atom = get_current_atom_locked();
+		used2flush_reserved(sbinfo, atom, *len, flags & BA_FORMATTED);
+		UNLOCK_ATOM(atom);
+		break;
+
+	default:
+		impossible("zam-532", "wrong block stage");
+	}
+
+	return 0;
+}
+
+/* wrappers for block allocator plugin methods; this one calls sa_pre_commit_hook() */
+reiser4_internal int
+pre_commit_hook(void)
+{
+	assert("zam-502", get_current_super_private() != NULL);
+	sa_pre_commit_hook();
+	return 0;	/* no failure is reported from here */
+}
+
+/* blocknr_set actor which applies one delete set element to the block allocator data:
+   a single block *@a when @b is NULL, an extent starting at *@a of length *@b otherwise */
+static int
+apply_dset(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a, const reiser4_block_nr * b, void *data UNUSED_ARG)
+{
+	reiser4_context *ctx = get_current_context();
+	reiser4_super_info_data *sbinfo = get_super_private(ctx->super);
+	__u64 len;
+
+	assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT);
+	assert("zam-552", sbinfo != NULL);
+
+	/* a single block number is treated as an extent of length one */
+	len = (b != NULL) ? *b : 1;
+
+	if (REISER4_DEBUG) {
+		/* the extent must lie within the block count of the file system */
+		reiser4_spin_lock_sb(sbinfo);
+
+		assert("zam-554", *a < reiser4_block_count(ctx->super));
+		assert("zam-555", *a + len <= reiser4_block_count(ctx->super));
+		reiser4_spin_unlock_sb(sbinfo);
+	}
+
+	/* free the blocks in the space allocator, then adjust the sb block
+	   counters: used -> free */
+	sa_dealloc_blocks(&sbinfo->space_allocator, *a, len);
+	used2free(sbinfo, len);
+
+	/* the result is always 0 */
+	return 0;
+}
+
+reiser4_internal void
+post_commit_hook(void)
+{
+	txn_atom *atom;
+	/* the atom lock is taken only around the stage check */
+	atom = get_current_atom_locked();
+	assert("zam-452", atom->stage == ASTAGE_POST_COMMIT);
+	UNLOCK_ATOM(atom);
+
+	/* do the block deallocation which was deferred until commit is done; the
+	   last argument (1) makes the iterator free delete set entries as it goes */
+	blocknr_set_iterator(atom, &atom->delete_set, apply_dset, NULL, 1);
+	/* the iterator result is not checked: with the delete flag set it returns 0 */
+	assert("zam-504", get_current_super_private() != NULL);
+	sa_post_commit_hook();
+}
+
+reiser4_internal void
+post_write_back_hook(void)
+{
+	assert("zam-504", get_current_super_private() != NULL);
+	/* NOTE(review): this is the same allocator hook post_commit_hook() calls - confirm that is intended */
+	sa_post_commit_hook();
+}
+
+/*
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
Index: linux-2.6.8.1-ck/fs/reiser4/block_alloc.h
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/block_alloc.h	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/block_alloc.h	2004-08-22 19:35:33.609654610 +1000
@@ -0,0 +1,185 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#if !defined (__FS_REISER4_BLOCK_ALLOC_H__)
+#define __FS_REISER4_BLOCK_ALLOC_H__
+
+#include "dformat.h"
+#include "forward.h"
+
+#include <linux/types.h>	/* for __u??  */
+#include <linux/fs.h>
+
+/* Mask which, when applied to a block number, shows whether that block number is a fake one */
+#define REISER4_FAKE_BLOCKNR_BIT_MASK   0x8000000000000000ULL
+/* Mask which isolates a type of object this fake block number was assigned to */
+#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL
+
+/* The result of applying REISER4_BLOCKNR_STATUS_BIT_MASK should be compared
+   against these two values to tell whether the object is an unallocated one or a
+   bitmap shadow object (WORKING BITMAP block, look at the plugin/space/bitmap.c) */
+#define REISER4_UNALLOCATED_STATUS_VALUE    0xC000000000000000ULL
+#define REISER4_BITMAP_BLOCKS_STATUS_VALUE  0x8000000000000000ULL
+
+/* block stage: specifies how a block is currently counted in the sb block counters */
+typedef enum {
+	BLOCK_NOT_COUNTED	= 0,	/* reiser4 has no info about this block yet */
+	BLOCK_GRABBED		= 1,	/* free space grabbed for further allocation
+					   of this block */
+	BLOCK_FLUSH_RESERVED	= 2,	/* block is reserved for flush needs. */
+	BLOCK_UNALLOCATED	= 3,	/* block is used for existing in-memory object
+					   ( unallocated formatted or unformatted
+					   node) */
+	BLOCK_ALLOCATED		= 4	/* block is mapped to disk, real on-disk block
+					   number assigned */
+} block_stage_t;
+
+/* a hint for block allocator */
+struct reiser4_blocknr_hint {
+	/* FIXME: I think we want to add a longterm lock on the bitmap block here.  This
+	   is to prevent jnode_flush() calls from interleaving allocations on the same
+	   bitmap, once a hint is established. */
+
+	/* search start hint */
+	reiser4_block_nr blk;
+	/* if not zero, it is a region size we search for free blocks in */
+	reiser4_block_nr max_dist;
+	/* level for allocation, may be useful have branch-level and higher
+	   write-optimized. */
+	tree_level level;
+	/* block allocator assumes that blocks, which will be mapped to disk,
+	   are in this specified block_stage */
+	block_stage_t block_stage;
+	/* If backward = 1 allocate blocks in backward direction from the end
+	 * of disk to the beginning of disk.  Declared unsigned: a signed
+	 * one-bit field holds only 0 and -1, so a stored 1 would read back as -1. */
+	unsigned int backward:1;
+};
+
+/* These flags control block allocation/deallocation behavior */
+enum reiser4_ba_flags {
+	/* do allocations from reserved (5%) area */
+	BA_RESERVED	    = (1 << 0),
+
+	/* block allocator can do commit trying to recover free space */
+	BA_CAN_COMMIT	    = (1 << 1),
+
+	/* if operation will be applied to formatted block */
+	BA_FORMATTED	    = (1 << 2),
+
+	/* defer actual block freeing until transaction commit */
+	BA_DEFER	    = (1 << 3),
+
+	/* allocate blocks for permanent fs objects (formatted or unformatted), not
+	   wandered or log blocks */
+	BA_PERMANENT        = (1 << 4),
+
+	/* grab space even if it was disabled */
+	BA_FORCE            = (1 << 5),
+
+	/* use default start value for free blocks search. */
+	BA_USE_DEFAULT_SEARCH_START = (1 << 6)
+};
+
+typedef enum reiser4_ba_flags reiser4_ba_flags_t;
+
+extern void blocknr_hint_init(reiser4_blocknr_hint * hint);
+extern void blocknr_hint_done(reiser4_blocknr_hint * hint);
+extern void update_blocknr_hint_default(const struct super_block *, const reiser4_block_nr *);
+extern void get_blocknr_hint_default(reiser4_block_nr *);
+
+extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block * super);
+
+int assign_fake_blocknr_formatted(reiser4_block_nr *);
+reiser4_block_nr fake_blocknr_unformatted(void);
+
+
+/* free -> grabbed -> fake_allocated -> used */
+
+
+int  reiser4_grab_space           (__u64 count, reiser4_ba_flags_t flags);
+void all_grabbed2free             (void);
+void grabbed2free                 (reiser4_context *,
+				   reiser4_super_info_data *, __u64 count);
+void fake_allocated2free          (__u64 count, reiser4_ba_flags_t flags);
+void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count);
+void grabbed2flush_reserved       (__u64 count);
+int  reiser4_alloc_blocks         (reiser4_blocknr_hint * hint,
+				   reiser4_block_nr * start,
+				   reiser4_block_nr * len,
+				   reiser4_ba_flags_t flags);
+int reiser4_dealloc_blocks        (const reiser4_block_nr *,
+				   const reiser4_block_nr *,
+				   block_stage_t, reiser4_ba_flags_t flags);
+
+static inline int
+reiser4_alloc_block(reiser4_blocknr_hint *hint, reiser4_block_nr *start, reiser4_ba_flags_t flags)
+{
+	reiser4_block_nr wanted = 1;	/* ask for exactly one block */
+	return reiser4_alloc_blocks(hint, start, &wanted, flags);
+}
+
+static inline int reiser4_dealloc_block(const reiser4_block_nr *block, block_stage_t stage, reiser4_ba_flags_t flags)
+{
+	const reiser4_block_nr extent_len = 1;	/* a one-block extent starting at *@block */
+	return reiser4_dealloc_blocks(block, &extent_len, stage, flags);
+}
+
+#define reiser4_grab_space_force(count, flags)		\
+	reiser4_grab_space((count), (flags) | BA_FORCE)	/* arguments parenthesized: safe for expression arguments */
+
+extern void grabbed2free_mark(__u64 mark);
+extern int  reiser4_grab_reserved(struct super_block *,
+				  __u64, reiser4_ba_flags_t);
+extern void reiser4_release_reserved(struct super_block *super);
+
+/* grabbed -> fake_allocated */
+
+/* fake_allocated -> used */
+
+/* used -> fake_allocated -> grabbed -> free */
+
+extern void flush_reserved2grabbed(txn_atom * atom, __u64 count);
+
+extern int blocknr_is_fake(const reiser4_block_nr * da);
+
+extern void grabbed2cluster_reserved(int count);
+extern void cluster_reserved2grabbed(int count);
+extern void cluster_reserved2free(int count);
+
+extern int check_block_counters(const struct super_block *);
+
+#if REISER4_DEBUG
+
+extern void reiser4_check_blocks(const reiser4_block_nr *, const reiser4_block_nr *, int);
+extern void reiser4_check_block(const reiser4_block_nr *, int);
+
+#else
+
+#  define reiser4_check_blocks(beg, len, val)  noop
+#  define reiser4_check_block(beg, val)        noop
+
+#endif
+
+#if REISER4_DEBUG_OUTPUT
+extern void print_block_counters(const char *,
+				 const struct super_block *,
+				 txn_atom *atom);
+#else
+#define print_block_counters(p, s, a) noop
+#endif
+
+extern int pre_commit_hook(void);
+extern void post_commit_hook(void);
+extern void post_write_back_hook(void);
+
+#endif				/* __FS_REISER4_BLOCK_ALLOC_H__ */
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
Index: linux-2.6.8.1-ck/fs/reiser4/blocknrset.c
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/blocknrset.c	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/blocknrset.c	2004-08-22 19:35:33.609654610 +1000
@@ -0,0 +1,365 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* This file contains code for various block number sets used by the atom to
+   track the deleted set and wandered block mappings. */
+
+#include "debug.h"
+#include "dformat.h"
+#include "type_safe_list.h"
+#include "txnmgr.h"
+
+#include <linux/slab.h>
+
+/* The proposed data structure for storing unordered block number sets is a
+   list of elements, each of which contains an array of block numbers and/or
+   an array of block number pairs. Such an element, called blocknr_set_entry,
+   stores single block numbers from the beginning and pairs (extents) from the
+   end of its ->entries array. The ->nr_singles and ->nr_pairs fields count
+   the stored single block numbers and pairs.
+
+   +---------------- blocknr_set_entry->entries ----------------+
+   |block1|block2| ...    <free space>    ... |pair3|pair2|pair1|
+   +------------------------------------------------------------+
+
+   When current blocknr_set_entry is full, allocate a new one. */
+
+/* Usage examples: blocknr sets are used in reiser4 for storing atom's delete
+ * set (single blocks and block extents), in that case blocknr pair represent an
+ * extent; atom's wandered map is also stored as a blocknr set, blocknr pairs
+ * there represent a (real block) -> (wandered block) mapping. */
+
+typedef struct blocknr_pair blocknr_pair;
+
+/* The total size of a blocknr_set_entry. */
+#define BLOCKNR_SET_ENTRY_SIZE 128
+
+/* The number of block number slots left in an entry after its two unsigned counters and its list link. */
+#define BLOCKNR_SET_ENTRIES_NUMBER               \
+       ((BLOCKNR_SET_ENTRY_SIZE -           \
+         2 * sizeof (unsigned) -            \
+         sizeof (blocknr_set_list_link)) /  \
+        sizeof (reiser4_block_nr))
+
+/* An entry of the blocknr_set */
+struct blocknr_set_entry {
+	unsigned nr_singles;	/* number of single block numbers, stored from entries[0] upwards */
+	unsigned nr_pairs;	/* number of pairs, stored from the end of entries[] downwards */
+	blocknr_set_list_link link;	/* link used by the blocknr_set list definition */
+	reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER];
+};
+
+/* A pair of blocks as recorded in the blocknr_set_entry data. */
+struct blocknr_pair {
+	reiser4_block_nr a;	/* first element; the extent start when the pair is an extent */
+	reiser4_block_nr b;	/* second element; the extent length when the pair is an extent */
+};
+
+/* The list definition. */
+TYPE_SAFE_LIST_DEFINE(blocknr_set, blocknr_set_entry, link);
+
+/* Return the number of blocknr slots available in a blocknr_set_entry. */
+/* Audited by: green(2002.06.11) */
+static unsigned
+bse_avail(blocknr_set_entry * bse)
+{
+	unsigned used;
+
+	cassert(sizeof (blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE);
+	used = bse->nr_singles + 2 * bse->nr_pairs;	/* a pair occupies two slots */
+	assert("jmacd-5088", BLOCKNR_SET_ENTRIES_NUMBER >= used);
+	return BLOCKNR_SET_ENTRIES_NUMBER - used;
+}
+
+/* Initialize a blocknr_set_entry. */
+/* Audited by: green(2002.06.11) */
+static void
+bse_init(blocknr_set_entry * bse)
+{
+	/* a fresh entry: list link cleaned, no singles and no pairs stored */
+	blocknr_set_list_clean(bse);
+	bse->nr_singles = bse->nr_pairs = 0;
+}
+
+/* Allocate and initialize a blocknr_set_entry. */
+/* Audited by: green(2002.06.11) */
+static blocknr_set_entry *
+bse_alloc(void)
+{
+	blocknr_set_entry *fresh;
+
+	/* GFP_KERNEL allocation */
+	fresh = kmalloc(sizeof *fresh, GFP_KERNEL);
+	if (fresh != NULL)
+		bse_init(fresh);
+
+	/* NULL is handed to the caller when the allocation failed */
+	return fresh;
+}
+
+/* Free a blocknr_set_entry with kfree(); the entry is not removed from any list here. */
+/* Audited by: green(2002.06.11) */
+static void
+bse_free(blocknr_set_entry * bse)
+{
+	kfree(bse);
+}
+
+/* Add a block number to a blocknr_set_entry */
+/* Audited by: green(2002.06.11) */
+static void
+bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block)
+{
+	assert("jmacd-5099", bse_avail(bse) >= 1);
+	bse->entries[bse->nr_singles] = *block;	/* singles fill ->entries from its beginning */
+	bse->nr_singles += 1;
+}
+
+/* Get a pair of block numbers */
+/* Audited by: green(2002.06.11) */
+static inline blocknr_pair *
+bse_get_pair(blocknr_set_entry * bse, unsigned pno)
+{
+	assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= 2 * (pno + 1));
+	/* pairs grow downwards from the end of ->entries: pair 0 takes the last two slots */
+	return (blocknr_pair *) &bse->entries[BLOCKNR_SET_ENTRIES_NUMBER - 2 * (pno + 1)];
+}
+
+/* Add a pair of block numbers to a blocknr_set_entry */
+/* Audited by: green(2002.06.11) */
+static void
+bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a, const reiser4_block_nr * b)
+{
+	blocknr_pair *slot;
+
+	assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL);
+	/* take the next free pair slot and account for it */
+	slot = bse_get_pair(bse, bse->nr_pairs);
+	bse->nr_pairs += 1;
+	slot->a = *a;
+	slot->b = *b;
+}
+
+/* Add either a block or pair of blocks to the block number set.  The first
+   blocknr (@a) must be non-NULL.  If @b is NULL a single blocknr is added, if
+   @b is non-NULL a pair is added.  The block number set belongs to atom, and
+   the call is made with the atom lock held.  There may not be enough space in
+   the current blocknr_set_entry.  If new_bsep points to a non-NULL
+   blocknr_set_entry then it will be added to the blocknr_set and new_bsep
+   will be set to NULL.  If new_bsep contains NULL then the atom lock will be
+   released and a new bse will be allocated in new_bsep.  E_REPEAT will be
+   returned with the atom unlocked for the operation to be tried again.  If
+   the operation succeeds, 0 is returned.  If new_bsep is non-NULL and not
+   used during the call, it will be freed automatically. */
+/* Audited by: green(2002.06.11) */
+static int
+blocknr_set_add(txn_atom * atom,
+		blocknr_set * bset,
+		blocknr_set_entry ** new_bsep, const reiser4_block_nr * a, const reiser4_block_nr * b)
+{
+	blocknr_set_entry *head;
+	unsigned slots_wanted;
+
+	assert("jmacd-5101", a != NULL);
+
+	/* a pair takes two slots of an entry, a single block number takes one */
+	slots_wanted = (b != NULL) ? 2 : 1;
+	if (blocknr_set_list_empty(&bset->entries) ||
+	    bse_avail(blocknr_set_list_front(&bset->entries)) < slots_wanted) {
+		if (*new_bsep == NULL) {
+			/* No spare entry: drop the atom lock, allocate one, let the caller retry. */
+			UNLOCK_ATOM(atom);
+			*new_bsep = bse_alloc();
+			if (*new_bsep == NULL)
+				return RETERR(-ENOMEM);
+			return -E_REPEAT;
+		}
+
+		/* The spare entry becomes the new head of the list. */
+		blocknr_set_list_push_front(&bset->entries, *new_bsep);
+		*new_bsep = NULL;
+	}
+
+	/* The head entry now has room for the single or pair. */
+	head = blocknr_set_list_front(&bset->entries);
+	if (b != NULL)
+		bse_put_pair(head, a, b);
+	else
+		bse_put_single(head, a);
+
+	/* If new_bsep is non-NULL then there was an allocation race, free this copy. */
+	if (*new_bsep != NULL) {
+		bse_free(*new_bsep);
+		*new_bsep = NULL;
+	}
+	return 0;
+}
+
+/* Add an extent to the block set.  An extent of length 1 is stored as a single block number. */
+/* Audited by: green(2002.06.11) */
+/* Auditor note: Entire call chain cannot hold any spinlocks, because kmalloc might
+   schedule. The only exception is atom spinlock, which is properly freed. */
+reiser4_internal int
+blocknr_set_add_extent(txn_atom * atom,
+		       blocknr_set * bset,
+		       blocknr_set_entry ** new_bsep, const reiser4_block_nr * start, const reiser4_block_nr * len)
+{
+	assert("jmacd-5102", start != NULL && len != NULL && *len > 0);
+	if (*len == 1)
+		return blocknr_set_add(atom, bset, new_bsep, start, NULL);
+	return blocknr_set_add(atom, bset, new_bsep, start, len);
+}
+
+/* Add a block pair to the block set: exactly a pair is added, both @a and @b
+ * must be non-NULL (asserted); blocknr_set_add() does the work. */
+/* Audited by: green(2002.06.11) */
+/* Auditor note: Entire call chain cannot hold any spinlocks, because
+   kmalloc might schedule. The only exception is atom spinlock, which is
+   properly freed. */
+reiser4_internal int
+blocknr_set_add_pair(txn_atom * atom,
+		     blocknr_set * bset,
+		     blocknr_set_entry ** new_bsep, const reiser4_block_nr * a, const reiser4_block_nr * b)
+{
+	assert("jmacd-5103", a != NULL && b != NULL);
+	return blocknr_set_add(atom, bset, new_bsep, a, b);
+}
+
+/* Initialize a blocknr_set: its list of entries is initialized, nothing is allocated. */
+/* Audited by: green(2002.06.11) */
+reiser4_internal void
+blocknr_set_init(blocknr_set * bset)
+{
+	blocknr_set_list_init(&bset->entries);
+}
+
+/* Free every entry of a blocknr_set, leaving its list empty.  Audited by: green(2002.06.11) */
+reiser4_internal void
+blocknr_set_destroy(blocknr_set * bset)
+{
+	while (!blocknr_set_list_empty(&bset->entries)) {
+		blocknr_set_entry *victim = blocknr_set_list_pop_front(&bset->entries);
+		bse_free(victim);
+	}
+}
+
+/* Merge blocknr_set entries out of @from into @into. */
+/* Audited by: green(2002.06.11) */
+/* Auditor comments: This merge does not know if merged sets contain
+   blocks pairs (As for wandered sets) or extents, so it cannot really merge
+   overlapping ranges if there is some. So I believe it may lead to
+   some blocks being presented several times in one blocknr_set. To help
+   debugging such problems it might help to check for duplicate entries on
+   actual processing of this set. Testing this kind of stuff right here is
+   also complicated by the fact that these sets are not sorted and going
+   through whole set on each element addition is going to be CPU-heavy task */
+reiser4_internal void
+blocknr_set_merge(blocknr_set * from, blocknr_set * into)
+{
+	blocknr_set_entry *bse_into = NULL;
+	/* bse_into, when set, is the partially filled entry put at the head of @into at the end */
+	/* If @from is empty, no work to perform. */
+	if (blocknr_set_list_empty(&from->entries)) {
+		return;
+	}
+
+	/* If @into is not empty, try merging partial-entries. */
+	if (!blocknr_set_list_empty(&into->entries)) {
+
+		/* Neither set is empty, pop the front members of both and try to combine them. */
+		blocknr_set_entry *bse_from;
+		unsigned into_avail;
+
+		bse_into = blocknr_set_list_pop_front(&into->entries);
+		bse_from = blocknr_set_list_pop_front(&from->entries);
+
+		/* Combine singles: taken from the tail of bse_from's singles while bse_into has a free slot. */
+		for (into_avail = bse_avail(bse_into); into_avail != 0 && bse_from->nr_singles != 0; into_avail -= 1) {
+			bse_put_single(bse_into, &bse_from->entries[--bse_from->nr_singles]);
+		}
+
+		/* Combine pairs: each one needs two free slots in bse_into. */
+		for (; into_avail > 1 && bse_from->nr_pairs != 0; into_avail -= 2) {
+			blocknr_pair *pair = bse_get_pair(bse_from, --bse_from->nr_pairs);
+			bse_put_pair(bse_into, &pair->a, &pair->b);
+		}
+
+		/* If bse_from is empty, delete it now. */
+		if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) {
+			bse_free(bse_from);
+		} else {
+			/* Otherwise, bse_into is full or nearly full (e.g.,
+			   it could have one slot avail and bse_from has one
+			   pair left).  Push it back onto the list.  bse_from
+			   becomes bse_into, which will be the new partial. */
+			blocknr_set_list_push_front(&into->entries, bse_into);
+			bse_into = bse_from;
+		}
+	}
+
+	/* Splice lists together (presumably moves the remaining entries of @from into @into). */
+	blocknr_set_list_splice(&into->entries, &from->entries);
+
+	/* Add the partial entry back to the head of the list. */
+	if (bse_into != NULL) {
+		blocknr_set_list_push_front(&into->entries, bse_into);
+	}
+}
+
+/* Iterate over all blocknr set elements. */
+reiser4_internal int
+blocknr_set_iterator(txn_atom * atom, blocknr_set * bset, blocknr_set_actor_f actor, void *data, int delete)
+{
+	blocknr_set_entry *cur;
+	blocknr_set_entry *following;
+
+	assert("zam-429", atom != NULL);
+	assert("zam-430", atom_is_protected(atom));
+	assert("zam-431", bset != 0);
+	assert("zam-432", actor != NULL);
+
+	for (cur = blocknr_set_list_front(&bset->entries);
+	     !blocknr_set_list_end(&bset->entries, cur); cur = following) {
+		unsigned int slot;
+		int err;
+
+		/* remember the successor now: @cur may be freed below */
+		following = blocknr_set_list_next(cur);
+
+		/* single block numbers: the second actor argument is NULL */
+		for (slot = 0; slot < cur->nr_singles; slot++) {
+			err = actor(atom, &cur->entries[slot], NULL, data);
+			/* with @delete set the walk is never cut short:
+			   actor failures are ignored in that mode */
+			if (!delete && err != 0)
+				return err;
+		}
+
+		/* pairs of block numbers */
+		for (slot = 0; slot < cur->nr_pairs; slot++) {
+			struct blocknr_pair *ab = bse_get_pair(cur, slot);
+
+			err = actor(atom, &ab->a, &ab->b, data);
+			if (!delete && err != 0)
+				return err;
+		}
+
+		/* unlink and free the entry once all its slots were visited */
+		if (delete) {
+			blocknr_set_list_remove(cur);
+			bse_free(cur);
+		}
+	}
+
+	return 0;
+}
+
+/*
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
Index: linux-2.6.8.1-ck/fs/reiser4/bufmgr/wander.txt
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/bufmgr/wander.txt	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/bufmgr/wander.txt	2004-08-22 19:35:33.610654450 +1000
@@ -0,0 +1,184 @@
+
+Before discussing the format of the commit record occupying the
+journal area, we must revisit the topic of free space bitmap
+management.  At the time an atom is closing and formatting its commit
+record, the question is how to deallocate the blocks deleted by the
+atom.  Those blocks become free once the atom commits, but they cannot
+be re-allocated before that point in time.
+
+Modified bitmaps are always part of the overwrite set, meaning copies
+are written to wandered positions (i.e., part of the log) before later
+being overwritten.
+
+We have defined these terms:
+
+WORKING BITMAPS: the "current" in-memory bitmaps
+
+COMMIT BITMAPS: bitmap copies written to wandered, overwrite positions
+
+DELETE SET: the set of deleted blocks plus the set of former positions
+of relocated blocks.  These block positions are deallocated when the
+atom commits.
+
+WANDERED SET: the set of temporary locations used to store overwrite
+blocks before they are actually overwritten.  These block positions
+are deallocated some time after the atom commits, when it is ensured
+that the atom will no longer replay during crash recovery.
+
+Both the delete set and the wandered set are blocks to be deleted, but
+the details of handling these deletions are necessarily different.
+
+---- Consider first the handling of the DELETE SET.
+
+There are two ways to handle the delete set.  Before reading their
+descriptions, let me offer my opinion.  The first is MORE complicated
+but requires LESS data to be logged in the commit record.  The second
+is LESS complicated but requires MORE data to be logged in the commit
+record.
+
+Strategy #1: MORE COMPLICATED, LESS LOGGED DATA
+
+  At the time an atom closes, it creates a snapshot of all the
+  modified bitmaps.  In other words, it creates commit bitmaps which
+  are copies of the working bitmaps.  The delete set is immediately
+  deallocated in the commit bitmaps, which are written to their
+  wandered positions and later overwritten in their actual positions.
+
+  This way, the commit record does not contain any record of the
+  delete set.
+
+  But there are problems with this approach, too.  First, there is
+  extra memory pressure associated with maintaining extra copies of
+  modified bitmaps.  Second, it is less straightforward than it may
+  appear at first.  Suppose there are two atoms that commit in
+  sequence, such that the first does not complete its commit (i.e.,
+  finish all the required writes) before the second prepares to
+  commit.  Which bitmaps does the second committing atom copy as its
+  commit bitmaps?  It does not just copy the working bitmaps, since
+  those do not yet represent the first atom's deallocations.
+
+  Instead, it looks like we would end up maintaining multiple copies
+  of every bitmap.  Each atom's commit bitmaps are the commit bitmaps
+  of the previous atom plus whatever modifications were made by the
+  atom itself.  This means in addition to maintaining the working
+  bitmaps, we end up maintaining separate commit bitmaps.  It is not
+  just as simple as copying the working bitmaps at the time of commit.
+
+  This solution looks far too complicated to me.  I admit that I have
+  not fully tried to understand the complexity, but I do not think the
+  advantages (smaller commit records) will outweigh the additional
+  complexity, not to mention the additional memory pressure.
+
+Strategy #2: LESS COMPLICATED, MORE LOGGED DATA
+
+  In this solution, the commit bitmaps are the same as the working
+  bitmaps--no copies are made.  We commit the working bitmaps without
+  deallocating the delete set and we include the delete set in the
+  commit record instead.
+
+  Before I describe exactly how deallocation works in this case, let
+  me add that there is another reason why this method is preferred.
+  The wandered set has to be deleted after the atom commits, since it
+  does not become available until the atom will no longer be
+  replayed.  With this approach to freeing the delete set, both kinds
+  of deletion can be handled in the same manner, since they both take
+  place after the atom commits.
+
+  In other words, since we have to address deallocating the wandered
+  set after commit anyway, we might as well use the same mechanism for
+  deallocating the delete set.  It means that additional data is
+  logged, but it reduces complexity in my opinion.
+
+  Here's how it works.  The atom stores a record of its delete set in
+  memory.  When a block is deallocated or relocated, the bit is of
+  course not immediately deallocated in the working bitmaps.
+
+  The delete set is included in the commit record, which is written to
+  the journal area.  The delete set is just a set of block numbers, so
+  there are several possible representations.  The implementation
+  could actually dynamically choose the representation to achieve the
+  best compression: (a) list of blocks, (b) bitmap, and (c) extent
+  compression.  The latter two options are likely to achieve
+  significant compression of the delete set unless fragmentation
+  becomes a problem.
+
+  The atom maintains its in-memory copy of the delete set until the
+  commit record is flushed to the disk.  At this point, those blocks
+  become available for new atoms to re-allocate.  The atom releases
+  these blocks back into the working bitmaps through the process of
+  "repossession".  The repossession process makes a younger atom
+  responsible for committing a deallocation from a previous atom.
+
+  For each block in the committed atom's delete set, a younger atom is
+  selected (or created) to handle the deallocation of that block.  The
+  working bitmap corresponding to the block being deleted is or was
+  already captured by the younger (repossessing) atom.  The block is
+  simply marked as deallocated in the working bitmap block captured.
+
+  The repossessing atom may immediately use this block or not, but in
+  either case the deallocation is committed once the repossessing atom
+  commits.  For recovery purposes (not discussed here), each atom also
+  includes a list of atoms for which it repossesses.
+
+---- The commit record
+
+The commit record includes three lists:
+
+  DELETE SET: The set of blocks deallocated by this atom, represented
+  as either a list, bitmap, or using extents.
+
+  WANDER SET: A list of block-pairs giving the original location and
+  the temporary wandered location.  During replay the temporary
+  location is copied to the original location.  After replay is no
+  longer needed, the temporary locations are deallocated using
+  repossession as previously described.
+
+  REPOSSESSES FOR SET: A list of the previous atoms for which this atom
+  repossesses deallocated blocks.  This is used to know which atoms'
+  deallocations must be replayed during crash recovery.
+
+I propose that all of this information is included in the commit
+record, which is written to the journal area.  There may be multiple
+journal areas (a significant complication) or there may not, but the
+key point is that all of this data is written into a reserved,
+cyclical journal area.  Because the journal area is reserved and
+written in a simple cyclical manner, there are no allocation decisions
+needed to find space for these commit records.
+
+---- The example
+
+Consider a roughly 50G file being modified in a 100G file system.
+Realize that due to maintaining the preserve set, it is not possible
+to transactionally write a file larger than 50G on a 100G file system.
+In the absolute worst case, no extent compression is possible and the
+best representation of the delete set requires a bitmap covering the
+entire file system.
+
+A 100G file system with 4K blocks has 3.27MB of bitmaps, and this is
+the same as the worst-case representation of the delete set, assuming
+just about every other block is deleted.  In reality, we expect the
+delete set to be much smaller because extent-compression would achieve
+significant savings.
+
+The wander set could possibly be compressed, but that is a more
+difficult task.  Suppose we attempt to overwrite the entire 50GB file
+instead of relocating it.  A 50G file has 13 million blocks, therefore
+the wander set requires storing 13 million block address pairs, i.e.
+26 million block addresses.  With 8-byte block addresses that requires
+writing 210MB of wander set data.  Ouch!
+
+We should hope that the size of the wander set does not grow so large.
+After all, its parent the extent record must be modified in this case,
+so these blocks are all candidates for relocation.  It would take a
+dumb allocate/flush plugin to try to overwrite a 50G file instead of
+relocating it.
+
+---- The conclusion
+
+I maintain that it is much simpler to write all of this data inside
+reserved log areas.  It is possible that we could write this data
+outside the log, but then it will complicate the allocation and
+deallocation procedure, since space for the log itself must then be
+allocated using ordinary methods.
+
+Comments?
Index: linux-2.6.8.1-ck/fs/reiser4/carry.c
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/carry.c	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/carry.c	2004-08-22 19:35:33.612654132 +1000
@@ -0,0 +1,1437 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+/* Functions to "carry" tree modification(s) upward. */
+/* Tree is modified one level at a time. As we modify a level we accumulate a
+   set of changes that need to be propagated to the next level.  We manage
+   node locking such that any searches that collide with carrying are
+   restarted, from the root if necessary.
+
+   Insertion of a new item may result in items being moved among nodes and
+   this requires the delimiting key to be updated at the least common parent
+   of the nodes modified to preserve search tree invariants. Also, insertion
+   may require allocation of a new node. A pointer to the new node has to be
+   inserted into some node on the parent level, etc.
+
+   Tree carrying is meant to be analogous to arithmetic carrying.
+
+   A carry operation is always associated with some node (&carry_node).
+
+   Carry process starts with some initial set of operations to be performed
+   and an initial set of already locked nodes.  Operations are performed one
+   by one. Performing each single operation has following possible effects:
+
+    - content of carry node associated with operation is modified
+    - new carry nodes are locked and involved into carry process on this level
+    - new carry operations are posted to the next level
+
+   After all carry operations on this level are done, process is repeated for
+   the accumulated sequence of carry operations for the next level. This
+   starts by trying to lock (in left to right order) all carry nodes
+   associated with carry operations on the parent level. After this, we decide
+   whether more nodes are required on the left of already locked set. If so,
+   all locks taken on the parent level are released, new carry nodes are
+   added, and locking process repeats.
+
+   It may happen that balancing process fails owing to unrecoverable error on
+   some of upper levels of a tree (possible causes are io error, failure to
+   allocate new node, etc.). In this case we should unmount the filesystem,
+   rebooting if it is the root, and possibly advise the use of fsck.
+
+   USAGE:
+
+
+    int some_tree_operation( znode *node, ... )
+    {
+       // Allocate on a stack pool of carry objects: operations and nodes.
+       // Most carry processes will only take objects from here, without
+       // dynamic allocation.
+
+I feel uneasy about this pool.  It adds to code complexity, I understand why it exists, but.... -Hans
+
+       carry_pool  pool;
+       carry_level lowest_level;
+       carry_op   *op;
+
+       init_carry_pool( &pool );
+       init_carry_level( &lowest_level, &pool );
+
+       // operation may be one of:
+       //   COP_INSERT    --- insert new item into node
+       //   COP_CUT       --- remove part of or whole node
+       //   COP_PASTE     --- increase size of item
+       //   COP_DELETE    --- delete pointer from parent node
+       //   COP_UPDATE    --- update delimiting key in least
+       //                     common ancestor of two
+
+       op = post_carry( &lowest_level, operation, node, 0 );
+       if( IS_ERR( op ) || ( op == NULL ) ) {
+           handle error
+       } else {
+           // fill in remaining fields in @op, according to carry.h:carry_op
+           result = carry( &lowest_level, NULL );
+       }
+       done_carry_pool( &pool );
+    }
+
+   When you are implementing node plugin method that participates in carry
+   (shifting, insertion, deletion, etc.), do the following:
+
+   int foo_node_method( znode *node, ..., carry_plugin_info *info )
+   {
+       carry_op   *op;
+
+       ....
+
+       // note, that last argument to node_post_carry() is non-zero
+       // here, because @op is to be applied to the parent of @node, rather
+       // than to the @node itself as in the previous case.
+
+       op = node_post_carry( info, operation, node, 1 );
+       // fill in remaining fields in @op, according to carry.h:carry_op
+
+       ....
+
+   }
+
+   BATCHING:
+
+   One of the main advantages of level-by-level balancing implemented here is
+   ability to batch updates on a parent level and to perform them more
+   efficiently as a result.
+
+   Description To Be Done (TBD).
+
+   DIFFICULTIES AND SUBTLE POINTS:
+
+   1. complex plumbing is required, because:
+
+       a. effective allocation through pools is needed
+
+       b. target of operation is not exactly known when operation is
+       posted. This is worked around through bitfields in &carry_node and
+       logic in lock_carry_node()
+
+       c. of interaction with locking code: node should be added into sibling
+       list when pointer to it is inserted into its parent, which is some time
+       after node was created. Between these moments, node is somewhat in
+       suspended state and is only registered in the carry lists
+
+    2. whole balancing logic is implemented here, in particular, insertion
+    logic is coded in make_space().
+
+    3. special cases like insertion (add_tree_root()) or deletion
+    (kill_tree_root()) of tree root and morphing of paste into insert
+    (insert_paste()) have to be handled.
+
+    4. there is non-trivial interdependency between allocation of new nodes
+    and almost everything else. This is mainly due to the (1.c) above. I shall
+    write about this later.
+
+*/
+
+#include "forward.h"
+#include "debug.h"
+#include "key.h"
+#include "coord.h"
+#include "plugin/item/item.h"
+#include "plugin/item/extent.h"
+#include "plugin/node/node.h"
+#include "jnode.h"
+#include "znode.h"
+#include "tree_mod.h"
+#include "tree_walk.h"
+#include "block_alloc.h"
+#include "pool.h"
+#include "tree.h"
+#include "carry.h"
+#include "carry_ops.h"
+#include "super.h"
+#include "reiser4.h"
+#include "prof.h"
+
+#include <linux/types.h>
+
+/* level locking/unlocking */
+static int lock_carry_level(carry_level * level);
+static void unlock_carry_level(carry_level * level, int failure);
+static void done_carry_level(carry_level * level);
+static void unlock_carry_node(carry_level * level, carry_node * node, int fail);
+
+int lock_carry_node(carry_level * level, carry_node * node);
+int lock_carry_node_tail(carry_node * node);
+
+/* carry processing proper */
+static int carry_on_level(carry_level * doing, carry_level * todo);
+
+/* handlers for carry operations. */
+
+static void fatal_carry_error(carry_level * doing, int ecode);
+static int add_new_root(carry_level * level, carry_node * node, znode * fake);
+
+static int carry_estimate_reserve(carry_level * level);
+
+#if REISER4_DEBUG
+typedef enum {
+	CARRY_TODO,	/* passed by lock_carry_level() before it locks the nodes */
+	CARRY_DOING	/* passed by carry_on_level() once all operations have run */
+} carry_queue_state;
+static int carry_level_invariant(carry_level * level, carry_queue_state state);
+#endif
+
+/* main entry point for tree balancing.
+
+   Tree carry performs operations from @doing and while doing so accumulates
+   information about operations to be performed on the next level ("carried"
+   to the parent level). Carried operations are performed, causing possibly
+   more operations to be carried upward etc. carry() takes care of
+   locking and pinning znodes while operating on them.
+
+   Returns 0 on success, or the error that made page reservation, locking
+   or a carry operation fail. For usage, see comment at the top of this file.
+*/
+reiser4_internal int
+carry(carry_level * doing /* set of carry operations to be performed */ ,
+      carry_level * done	/* set of nodes, already performed at the
+				 * previous level. NULL in most cases */ )
+{
+	int result = 0;
+	carry_level done_area;
+	carry_level todo_area;
+	/* queue of new requests */
+	carry_level *todo;
+	int wasreserved;
+	int reserve;
+	ON_DEBUG(STORE_COUNTERS;)
+
+	assert("nikita-888", doing != NULL);
+
+	trace_stamp(TRACE_CARRY);
+
+	todo = &todo_area;
+	init_carry_level(todo, doing->pool);
+	if (done == NULL) {
+		/* queue of requests performed on the previous level */
+		done = &done_area;
+		init_carry_level(done, doing->pool);
+	}
+	/* reserve per-thread pages; whatever exceeds @wasreserved is released at the end */
+	wasreserved = perthread_pages_count();
+	reserve = carry_estimate_reserve(doing);
+	result = perthread_pages_reserve(reserve, GFP_KERNEL);
+	if (result != 0)
+		return result;
+
+	/* iterate until there is nothing more to do */
+	while (result == 0 && carry_op_num(doing) > 0) {
+		carry_level *tmp;
+
+		ON_STATS(todo->level_no = doing->level_no + 1);
+
+		/* at this point @done is locked. */
+		/* repeat lock/do/unlock while
+
+		   (1) lock_carry_level() fails due to deadlock avoidance, or
+
+		   (2) carry_on_level() decides that more nodes have to
+		   be involved.
+
+		   (3) some unexpected error occurred while balancing on the
+		   upper levels. In this case all changes are rolled back.
+
+		*/
+		while (1) {
+			result = lock_carry_level(doing);
+			if (result == 0) {
+				/* perform operations from @doing and
+				   accumulate new requests in @todo */
+				result = carry_on_level(doing, todo);
+				if (result == 0)
+					break;
+				else if (result != -E_REPEAT ||
+					 !doing->restartable) {
+					warning("nikita-1043",
+						"Fatal error during carry: %i",
+						result);
+					print_level("done", done);
+					print_level("doing", doing);
+					print_level("todo", todo);
+					/* do some rough stuff like aborting
+					   all pending transcrashes and thus
+					   pushing tree back to the consistent
+					   state. Alternatively, just panic.
+					*/
+					fatal_carry_error(doing, result);
+					return result;
+				}
+			} else if (result != -E_REPEAT) {
+				fatal_carry_error(doing, result);
+				return result;
+			}
+			reiser4_stat_level_inc(doing, carry_restart);
+			unlock_carry_level(doing, 1);
+		}
+		/* at this point @done can be safely unlocked */
+		done_carry_level(done);
+		reiser4_stat_level_inc(doing, carry_done);
+		/* cyclically shift queues */
+		tmp = done;
+		done = doing;
+		doing = todo;
+		todo = tmp;
+		init_carry_level(todo, doing->pool);
+
+		/* give other threads chance to run */
+		preempt_point();
+	}
+	done_carry_level(done);
+
+	assert("nikita-3460", perthread_pages_count() - wasreserved >= 0);
+	perthread_pages_release(perthread_pages_count() - wasreserved);
+
+	/* all counters, but x_refs should remain the same. x_refs can change
+	   owing to transaction manager */
+	ON_DEBUG(CHECK_COUNTERS;)
+	return result;
+}
+
+/* perform carry operations on given level.
+
+   Optimizations proposed by pooh:
+
+   (1) don't lock all nodes from queue at the same time. Lock nodes lazily as
+   required;
+
+   (2) unlock node if there are no more operations to be performed upon it and
+   node didn't add any operation to @todo. This can be implemented by
+   attaching to each node two counters: counter of operations working on this
+   node and counter of operations carried upward from this node.
+
+*/
+static int
+carry_on_level(carry_level * doing	/* queue of carry operations to
+					 * do on this level */ ,
+	       carry_level * todo	/* queue where new carry
+					 * operations to be performed on
+					 * the parent level are
+					 * accumulated during @doing
+					 * processing. */ )
+{
+	int rc;
+	int (*handler) (carry_op *, carry_level *, carry_level *);
+	carry_op *op;
+	carry_op *next_op;
+
+	assert("nikita-1034", doing != NULL);
+	assert("nikita-1035", todo != NULL);
+
+	trace_stamp(TRACE_CARRY);
+
+	/* nodes may be transiently inconsistent while operations run */
+	DISABLE_NODE_CHECK;
+
+	/* all nodes of @doing are locked at this point */
+
+	/* Not implemented yet: splitting the work into two phases.
+
+	   An analysis phase would work out exactly which items have to move
+	   between nodes and whether new nodes have to be allocated, and
+	   record that in structures attached to each carry_node of the
+	   @doing queue.
+
+	   A modification phase would then apply the changes. Knowing the
+	   whole plan allows "batch modification": several operations that
+	   act on the same node can be applied together, which is cheaper
+	   than applying them one by one.
+
+	   All of the above is an optimization left for the future.
+	*/
+	/* Another delayed, but important, optimization: merge operations
+	   before executing them. Deleting several neighboring items from
+	   one node, for instance, could become a single ->cut() call.
+
+	   To get there the queue has to be scanned up front and all
+	   "mergeable" operations combined; for now the loop below runs
+	   every queued operation separately.
+	*/
+	rc = 0;
+	for_all_ops(doing, op, next_op) {
+		carry_opcode opcode;
+
+		assert("nikita-1041", op != NULL);
+		opcode = op->op;
+		assert("nikita-1042", op->op < COP_LAST_OP);
+		handler = op_dispatch_table[opcode].handler;
+		rc = handler(op, doing, todo);
+		/* -E_REPEAT can come back when locking fails. Every other
+		   error is fatal; the caller hands it to fatal_carry_error().
+		*/
+		if (rc != 0)
+			break;
+	}
+	if (rc == 0) {
+		carry_plugin_info info;
+		carry_node *cnode;
+		carry_node *next_cnode;
+
+		info.doing = doing;
+		info.todo = todo;
+
+		assert("nikita-3002", carry_level_invariant(doing, CARRY_DOING));
+		for_all_nodes(doing, cnode, next_cnode) {
+			znode *node;
+
+			node = carry_real(cnode);
+			assert("nikita-2547", node != NULL);
+			if (node_is_empty(node)) {
+				rc = node_plugin_by_node(node)->prepare_removal(node, &info);
+				if (rc != 0)
+					break;
+			}
+		}
+	}
+	ENABLE_NODE_CHECK;
+	return rc;
+}
+
+/* post carry operation
+
+   Main entry point for external carry clients (node layout plugins and tree
+   operations) that want a new carry operation performed on some level.
+   Returns the new operation, or the error pointer from add_op()/add_carry().
+
+   The operation is queued in @level; call carry( level, ... ) to actually
+   perform it. @node has to be write locked already (this is asserted).
+   NOTE(review): on add_carry() failure level->ops_num stays incremented.
+
+   Both the operation and its carry node are appended at the end of their
+   queues (POOLO_LAST). Keeping the node queue properly ordered is the
+   caller's job.
+*/
+reiser4_internal carry_op *
+post_carry(carry_level * level	/* queue where new operation is to
+				 * be posted at */ ,
+	   carry_opcode op /* opcode of operation */ ,
+	   znode * node		/* node on which this operation
+				 * will operate */ ,
+	   int apply_to_parent_p	/* whether operation will operate
+					 * directly on @node or on its
+					 * parent. */ )
+{
+	carry_op *posted;
+	carry_node *cnode;
+
+	assert("nikita-1046", level != NULL);
+	assert("nikita-1788", znode_is_write_locked(node));
+
+	posted = add_op(level, POOLO_LAST, NULL);
+	if (IS_ERR(posted))
+		return posted;
+	cnode = add_carry(level, POOLO_LAST, NULL);
+	if (IS_ERR(cnode)) {
+		reiser4_pool_free(&level->pool->op_pool, &posted->header);
+		return (carry_op *) cnode;
+	}
+	cnode->node = node;
+	cnode->parent = apply_to_parent_p;
+	if (ZF_ISSET(node, JNODE_ORPHAN))
+		cnode->left_before = 1;
+	posted->node = cnode;
+	posted->op = op;
+	return posted;
+}
+
+/* number of carry operations in a @level, as counted by add_op() in ->ops_num */
+reiser4_internal int
+carry_op_num(const carry_level * level)
+{
+	return level->ops_num;
+}
+
+/* initialise carry queue: zero @level, start both lists empty, attach @pool */
+reiser4_internal void
+init_carry_level(carry_level * level /* level to initialise */ ,
+		 carry_pool * pool	/* pool @level will allocate objects
+					 * from */ )
+{
+	assert("nikita-1045", level != NULL);
+	assert("nikita-967", pool != NULL);
+
+	xmemset(level, 0, sizeof(*level));
+	pool_level_list_init(&level->ops);
+	pool_level_list_init(&level->nodes);
+
+	level->pool = pool;
+}
+
+/* initialise pools within queue; pool->op and pool->node are presumably the preallocated storage (confirm in pool.c) */
+reiser4_internal void
+init_carry_pool(carry_pool * pool /* pool to initialise */ )
+{
+	assert("nikita-945", pool != NULL);
+
+	reiser4_init_pool(&pool->op_pool, sizeof (carry_op), CARRIES_POOL_SIZE, (char *) pool->op);
+	reiser4_init_pool(&pool->node_pool, sizeof (carry_node), NODES_LOCKED_POOL_SIZE, (char *) pool->node);
+}
+
+/* finish with queue pools. NOTE(review): UNUSED_ARG looks stale, @pool is used below */
+reiser4_internal void
+done_carry_pool(carry_pool * pool UNUSED_ARG /* pool to destroy */ )
+{
+	reiser4_done_pool(&pool->op_pool);
+	reiser4_done_pool(&pool->node_pool);
+}
+
+/* add new carry node to the @level.
+
+   Returns pointer to the new carry node allocated from pool.  It's up to
+   callers to maintain proper order in the @level. Assumption is that if carry
+   nodes on one level are already sorted and modifications are performed from
+   left to right, carry nodes added on the parent level will be ordered
+   automatically. To control ordering use @order and @reference parameters.
+
+*/
+reiser4_internal carry_node *
+add_carry_skip(carry_level * level	/* &carry_level to add node
+					 * to */ ,
+	       pool_ordering order	/* where to insert: at the
+					 * beginning of @level,
+					 * before @reference, after
+					 * @reference, at the end
+					 * of @level */ ,
+	       carry_node * reference	/* reference node for
+					 * insertion */ )
+{
+	ON_DEBUG(carry_node * orig_ref = reference);
+	/* @reference may be moved, but only to a carry node of the same real znode (asserted below) */
+	trace_stamp(TRACE_CARRY);
+	if (order == POOLO_BEFORE) {
+		reference = find_left_carry(reference, level);
+		if (reference == NULL)
+			reference = carry_node_front(level);
+		else
+			reference = carry_node_next(reference);
+	} else if (order == POOLO_AFTER) {
+		reference = find_right_carry(reference, level);
+		if (reference == NULL)
+			reference = carry_node_back(level);
+		else
+			reference = carry_node_prev(reference);
+	}
+	assert("nikita-2209",
+	       ergo(orig_ref != NULL,
+		    carry_real(reference) == carry_real(orig_ref)));
+	return add_carry(level, order, reference);
+}
+
+reiser4_internal carry_node *
+add_carry(carry_level * level	/* &carry_level to add node
+				 * to */ ,
+	  pool_ordering order	/* where to insert: at the
+				 * beginning of @level, before
+				 * @reference, after @reference,
+				 * at the end of @level */ ,
+	  carry_node * reference	/* reference node for
+					 * insertion */ )
+{
+	carry_node *fresh;
+	/* @reference may be NULL, e.g. with POOLO_LAST as in post_carry() */
+	fresh = (carry_node *) add_obj(&level->pool->node_pool, &level->nodes, order, &reference->header);
+	if ((fresh != NULL) && !IS_ERR(fresh))
+		level->nodes_num++;
+	return fresh;
+}
+
+/* add new carry operation to the @level.
+
+   Returns the new carry operation taken from the pool, or whatever error
+   value add_obj() produced. Ordering within @level is the caller's business
+   and is steered through @order and @reference; ->ops_num counts successes.
+
+*/
+reiser4_internal carry_op *
+add_op(carry_level * level /* &carry_level to add operation to */ ,
+       pool_ordering order	/* where to insert: at the beginning of
+				 * @level, before @reference, after
+				 * @reference, at the end of @level */ ,
+       carry_op * reference /* reference operation for insertion */ )
+{
+	carry_op *fresh;
+
+	trace_stamp(TRACE_CARRY);
+	fresh = (carry_op *) add_obj(&level->pool->op_pool, &level->ops, order, &reference->header);
+	if ((fresh != NULL) && !IS_ERR(fresh))
+		level->ops_num++;
+	return fresh;
+}
+
+/* Return the carry node to the right of which @node was created.
+
+   Every new node is created to the right of an existing one (a new root is
+   the exception and is not handled here). @node itself is not inserted into
+   its parent yet and carries JNODE_ORPHAN in its state.
+
+   The search walks left from @node and stops at the first carry node that
+   refers to a different znode which is not an orphan.
+*/
+reiser4_internal carry_node *
+find_begetting_brother(carry_node * node	/* node to start search
+						 * from */ ,
+		       carry_level * kin UNUSED_ARG	/* level to
+							 * scan */ )
+{
+	carry_node *scan;
+
+	assert("nikita-1614", node != NULL);
+	assert("nikita-1615", kin != NULL);
+	assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree));
+	assert("nikita-1619", ergo(carry_real(node) != NULL,
+				   ZF_ISSET(carry_real(node), JNODE_ORPHAN)));
+
+	for (scan = node;; scan = carry_node_prev(scan)) {
+		assert("nikita-1617", !carry_node_end(kin, scan));
+		if ((scan->node != node->node) && !ZF_ISSET(scan->node, JNODE_ORPHAN))
+			break;
+	}
+	/* the brother found is expected to be locked already */
+	assert("nikita-1618", carry_real(scan) != NULL);
+	return scan;
+}
+
+/* Compare positions of @n1 and @n2 within @level: LESS_THAN when @n2 is
+   reached by walking right from @n1, GREATER_THAN when the end is hit first. */
+static cmp_t
+carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2)
+{
+	assert("nikita-2199", n1 != NULL);
+	assert("nikita-2200", n2 != NULL);
+
+	if (n1 == n2)
+		return EQUAL_TO;
+	for (n1 = carry_node_next(n1); !carry_node_end(level, n1); n1 = carry_node_next(n1)) {
+		if (n1 == n2)
+			return LESS_THAN;
+	}
+	/* ran off the right end of @level without meeting @n2 */
+	return GREATER_THAN;
+}
+
+reiser4_internal carry_node *
+find_carry_node(carry_level * level, const znode * node)
+{
+	carry_node *scan;
+	carry_node *tmp_scan;
+
+	assert("nikita-2202", level != NULL);
+	assert("nikita-2203", node != NULL);
+	/* first carry node in @level whose carry_real() znode is @node, else NULL */
+	for_all_nodes(level, scan, tmp_scan) {
+		if (carry_real(scan) == node)
+			return scan;
+	}
+	return NULL;
+}
+
+reiser4_internal znode *
+carry_real(const carry_node * node)
+{
+	assert("nikita-3061", node != NULL);
+	/* the "real" znode of a carry node is the one held by its lock handle */
+	return node->lock_handle.node;
+}
+
+reiser4_internal carry_node *
+insert_carry_node(carry_level * doing, carry_level * todo, const znode * node)
+{
+	carry_node *base;
+	carry_node *scan;
+	carry_node *tmp_scan;
+	carry_node *proj;
+
+	base = find_carry_node(doing, node);
+	assert("nikita-2204", base != NULL);
+	/* find the first @todo node whose counterpart in @doing is not to the left of @base */
+	for_all_nodes(todo, scan, tmp_scan) {
+		proj = find_carry_node(doing, scan->node);
+		assert("nikita-2205", proj != NULL);
+		if (carry_node_cmp(doing, proj, base) != LESS_THAN)
+			break;
+	}
+	return scan;	/* NOTE(review): presumably the list-end position if the loop never broke; confirm */
+}
+
+reiser4_internal carry_node *
+add_carry_atplace(carry_level *doing, carry_level *todo, znode *node)
+{
+	carry_node *reference;
+
+	assert("nikita-2994", doing != NULL);
+	assert("nikita-2995", todo != NULL);
+	assert("nikita-2996", node != NULL);
+	/* new carry node goes into @todo just before the spot insert_carry_node() picks */
+	reference = insert_carry_node(doing, todo, node);
+	assert("nikita-2997", reference != NULL);
+
+	return add_carry(todo, POOLO_BEFORE, reference);
+}
+
+/* like post_carry(), but designed to be called from node plugin methods:
+   the carry node is placed into @info->todo by add_carry_atplace() rather
+   than simply appended. Without @info->doing it is plain post_carry(). */
+reiser4_internal carry_op *
+node_post_carry(carry_plugin_info * info	/* carry parameters
+						 * passed down to node
+						 * plugin */ ,
+		carry_opcode op /* opcode of operation */ ,
+		znode * node	/* node on which this
+				 * operation will operate */ ,
+		int apply_to_parent_p	/* whether operation will
+					 * operate directly on @node
+					 * or on its parent. */ )
+{
+	carry_op *posted;
+	carry_node *cnode;
+
+	assert("nikita-2207", info != NULL);
+	assert("nikita-2208", info->todo != NULL);
+
+	if (info->doing == NULL)
+		return post_carry(info->todo, op, node, apply_to_parent_p);
+
+	posted = add_op(info->todo, POOLO_LAST, NULL);
+	if (IS_ERR(posted))
+		return posted;
+	cnode = add_carry_atplace(info->doing, info->todo, node);
+	if (IS_ERR(cnode)) {
+		reiser4_pool_free(&info->todo->pool->op_pool, &posted->header);
+		return (carry_op *) cnode;
+	}
+	cnode->node = node;
+	cnode->parent = apply_to_parent_p;
+	if (ZF_ISSET(node, JNODE_ORPHAN))
+		cnode->left_before = 1;
+	posted->node = cnode;
+	posted->op = op;
+	return posted;
+}
+
+/* lock all carry nodes in @level */
+static int
+lock_carry_level(carry_level * level /* level to lock */ )
+{
+	int result;
+	carry_node *node;
+	carry_node *tmp_node;
+
+	assert("nikita-881", level != NULL);
+	assert("nikita-2229", carry_level_invariant(level, CARRY_TODO));
+
+	trace_stamp(TRACE_CARRY);
+
+	/* lock nodes from left to right */
+	result = 0;
+	for_all_nodes(level, node, tmp_node) {
+		result = lock_carry_node(level, node);
+		if (result != 0)
+			break;
+	}
+	return result;
+}
+
+/* Synchronize delimiting keys between @spot and its left neighbor(s).
+
+   To reduce contention on dk key and simplify carry code, we synchronize
+   delimiting keys only when carry ultimately leaves tree level (carrying
+   changes upward) and unlocks nodes at this level.
+
+   The pivot key is the least key in @spot, or right delimiting key of @spot
+   if it is empty. It becomes left delimiting key of @spot and right
+   delimiting key of its left neighbor (see loop below for details).
+*/
+static void
+sync_dkeys(znode *spot /* node to update */)
+{
+	reiser4_key pivot;
+	reiser4_tree *tree;
+
+	assert("nikita-1610", spot != NULL);
+	assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk));
+
+	tree = znode_get_tree(spot);
+	WLOCK_DK(tree);
+
+	assert("nikita-2192", znode_is_loaded(spot));
+
+	/* sync left delimiting key of @spot with key in its leftmost item */
+	if (node_is_empty(spot))
+		pivot = *znode_get_rd_key(spot);	/* no items: reuse right dk */
+	else
+		leftmost_key_in_node(spot, &pivot);
+
+	znode_set_ld_key(spot, &pivot);
+
+	RLOCK_TREE(tree);	/* held while ->left pointers are followed */
+	/* there can be sequence of empty nodes pending removal on the left of
+	   @spot. Scan them and update their left and right delimiting keys to
+	   match left delimiting key of @spot. Also, update right delimiting
+	   key of first non-empty left neighbor.
+	*/
+	while (1) {
+		if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED))
+			break;
+
+		spot = spot->left;
+		if (spot == NULL)
+			break;
+
+#if 0
+		/* on the leaf level we can only increase right delimiting key
+		 * of a node on which we don't hold a long term lock. */
+		assert("nikita-2930",
+		       ergo(!znode_is_write_locked(spot) &&
+			    znode_get_level(spot) == LEAF_LEVEL,
+			    keyge(&pivot, znode_get_rd_key(spot))));
+#endif
+
+		znode_set_rd_key(spot, &pivot);
+		/* don't sink into the domain of another balancing */
+		if (!znode_is_write_locked(spot))
+			break;
+		if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE))
+			znode_set_ld_key(spot, &pivot);	/* node pending removal */
+		else
+			break;
+	}
+
+	RUNLOCK_TREE(tree);
+	WUNLOCK_DK(tree);
+}
+
+void
+check_dkeys(const znode *node);
+
+/* release every carry node of @level, syncing delimiting keys on success */
+static void
+unlock_carry_level(carry_level * level /* level to unlock */ ,
+		   int failure	/* true if unlocking owing to
+				 * failure */ )
+{
+	carry_node *cur;
+	carry_node *next;
+
+	assert("nikita-889", level != NULL);
+
+	trace_stamp(TRACE_CARRY);
+
+	if (!failure) {
+		znode *synced = NULL;	/* last znode given to sync_dkeys() */
+
+		/* update delimiting keys, once per run of equal znodes */
+		for_all_nodes(level, cur, next) {
+			znode *real = carry_real(cur);
+			if (real == synced)
+				continue;
+			synced = real;
+			sync_dkeys(synced);
+		}
+	}
+
+	/* Unlock order does not matter for correctness, but going in the
+	   reverse of locking order is preferable in a preemptible
+	   environment.
+	*/
+	for_all_nodes_back(level, cur, next) {
+		/* by now every newly allocated node must have been attached
+		   to its parent. */
+		assert("nikita-1631", ergo(!failure, !ZF_ISSET(carry_real(cur),
+							       JNODE_ORPHAN)));
+		if (!failure)
+			node_check(carry_real(cur), REISER4_NODE_DKEYS);
+		ON_DEBUG(check_dkeys(carry_real(cur)));
+		unlock_carry_node(level, cur, failure);
+	}
+	level->new_root = NULL;
+}
+
+/* tear down @level
+
+   Unlocks its nodes, then gives all carry nodes and ops back to the pool */
+static void
+done_carry_level(carry_level * level /* level to finish */ )
+{
+	carry_pool *pool;
+	carry_node *cn, *cn_next;
+	carry_op *cop, *cop_next;
+
+	assert("nikita-1076", level != NULL);
+
+	trace_stamp(TRACE_CARRY);
+
+	unlock_carry_level(level, 0);
+	pool = level->pool;
+	for_all_nodes(level, cn, cn_next) {
+		assert("nikita-2113", locks_list_is_clean(&cn->lock_handle));
+		assert("nikita-2114", owners_list_is_clean(&cn->lock_handle));
+		reiser4_pool_free(&pool->node_pool, &cn->header);
+	}
+	for_all_ops(level, cop, cop_next)
+		reiser4_pool_free(&pool->op_pool, &cop->header);
+}
+
+/* common final step of locking a carry node
+
+   A carry node may get locked along several paths: normally via
+   lock_carry_node(), but also from find_{left|right}_neighbor(). All of them
+   finish here. On entry @node -> lock_handle must already hold the lock just
+   taken, so that carry_real(@node) is not NULL. The node is flagged for
+   unlocking and its content is loaded.
+
+   Returns result of zload().
+*/
+reiser4_internal int
+lock_carry_node_tail(carry_node * node /* node to complete locking of */ )
+{
+	int loaded;
+
+	assert("nikita-1052", node != NULL);
+	assert("nikita-1187", carry_real(node) != NULL);
+	assert("nikita-1188", !node->unlock);
+
+	node->unlock = 1;
+	/* Bring node content into memory and install node plugin by
+	   looking at the node header. Usually cheap, because the node
+	   tends to be in memory already.
+	   The matching zrelse() is done by unlock_carry_node().
+	*/
+	loaded = zload(carry_real(node));
+	return loaded;
+}
+
+/* lock carry node
+
+   "Resolve" node to real znode, lock it and mark as locked.
+   This requires recursive locking of znodes.
+
+   When operation is posted to the parent level, node it will be applied to is
+   not yet known. For example, when shifting data between two nodes,
+   delimiting key has to be updated in parent or parents of nodes involved.
+   But their parents are not yet locked and, moreover, said nodes can be
+   reparented by concurrent balancing.
+
+   To work around this, carry operation is applied to special "carry node"
+   rather than to the znode itself. Carry node consists of some "base" or
+   "reference" znode and flags indicating how to get to the target of carry
+   operation (the znode reported by carry_real()) from base.
+   Returns 0 on success, error code otherwise.
+*/
+reiser4_internal int
+lock_carry_node(carry_level * level /* level @node is in */ ,
+		carry_node * node /* node to lock */ )
+{
+	int result;
+	znode *reference_point;
+	lock_handle lh;
+	lock_handle tmp_lh;
+
+	assert("nikita-887", level != NULL);
+	assert("nikita-882", node != NULL);
+
+	trace_stamp(TRACE_CARRY);
+
+	result = 0;
+	reference_point = node->node;
+	init_lh(&lh);
+	init_lh(&tmp_lh);
+	if (node->left_before) {
+		/* handling of new nodes, allocated on the previous level:
+
+		   some carry ops were probably posted from the new node, but
+		   this node neither has parent pointer set, nor is
+		   connected. This will be done in ->create_hook() for
+		   internal item.
+
+		   Nonetheless, parent of new node has to be locked. To do
+		   this, first go to the "left" in the carry order. This
+		   depends on the decision to always allocate new node on the
+		   right of existing one.
+
+		   Loop handles case when multiple nodes, all orphans, were
+		   inserted.
+
+		   Strictly speaking, taking tree lock is not necessary here,
+		   because all nodes scanned by loop in
+		   find_begetting_brother() are write-locked by this thread,
+		   and thus, their sibling linkage cannot change.
+
+		*/
+		reference_point = UNDER_RW
+		    (tree, znode_get_tree(reference_point), read,
+		     find_begetting_brother(node, level)->node);
+		assert("nikita-1186", reference_point != NULL);
+	}
+	if (node->parent && (result == 0)) {
+		result = reiser4_get_parent(&tmp_lh, reference_point, ZNODE_WRITE_LOCK, 0);
+		if (result != 0) {
+			;	/* nothing */
+		} else if (znode_get_level(tmp_lh.node) == 0) {
+			assert("nikita-1347", znode_above_root(tmp_lh.node));
+			result = add_new_root(level, node, tmp_lh.node);
+			if (result == 0) {
+				reference_point = level->new_root;
+				move_lh(&lh, &node->lock_handle);
+			}
+		} else if ((level->new_root != NULL) && (level->new_root != znode_parent_nolock(reference_point))) {
+			/* parent of node exists, but this level already
+			   created different new root, so */
+			warning("nikita-1109",
+				/* it should be "radicis", but tradition is
+				   tradition.  do banshees read latin? */
+				"hodie natus est radici frater");
+			result = -EIO;
+		} else {
+			move_lh(&lh, &tmp_lh);
+			reference_point = lh.node;
+		}
+	}
+	if (node->left && (result == 0)) {
+		assert("nikita-1183", node->parent);
+		assert("nikita-883", reference_point != NULL);
+		result = reiser4_get_left_neighbor(
+			&tmp_lh, reference_point, ZNODE_WRITE_LOCK, GN_CAN_USE_UPPER_LEVELS);
+		if (result == 0) {
+			done_lh(&lh);
+			move_lh(&lh, &tmp_lh);
+			reference_point = lh.node;
+		}
+	}
+	if (!node->parent && !node->left && !node->left_before) {
+		result = longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
+	}
+	if (result == 0) {
+		move_lh(&node->lock_handle, &lh);
+		result = lock_carry_node_tail(node);
+	}
+	done_lh(&tmp_lh);
+	done_lh(&lh);
+	return result;
+}
+
+/* release a lock on &carry_node.
+
+   Release if necessary lock on @node. This operation is pair of
+   lock_carry_node() and is idempotent: you can call it more than once on the
+   same node.
+   NOTE(review): zrelse() below runs unconditionally; confirm idempotency.
+*/
+static void
+unlock_carry_node(carry_level * level,
+		  carry_node * node /* node to be released */ ,
+		  int failure	/* non-zero if node is unlocked due
+				 * to some error */ )
+{
+	znode *real_node;
+
+	assert("nikita-884", node != NULL);
+
+	trace_stamp(TRACE_CARRY);
+
+	real_node = carry_real(node);
+	/* pair to zload() in lock_carry_node_tail(); done before NULL check */
+	zrelse(real_node);
+	if (node->unlock && (real_node != NULL)) {
+		assert("nikita-899", real_node == node->lock_handle.node);
+		longterm_unlock_znode(&node->lock_handle);
+	}
+	if (failure) {
+		if (node->deallocate && (real_node != NULL)) {
+			/* free node in bitmap
+
+			   Prepare node for removal. Last zput() will finish
+			   with it.
+			*/
+			ZF_SET(real_node, JNODE_HEARD_BANSHEE);
+		}
+		if (node->free) {
+			assert("nikita-2177", locks_list_is_clean(&node->lock_handle));
+			assert("nikita-2112", owners_list_is_clean(&node->lock_handle));
+			reiser4_pool_free(&level->pool->node_pool, &node->header);
+		}
+	}
+}
+
+/* fatal_carry_error() - all-catching error handling function
+
+   It is possible that carry faces unrecoverable error, like inability to
+   insert pointer at the internal level. Our simple solution is just panic in
+   this situation. More sophisticated things like attempt to remount
+   file-system as read-only can be implemented without much difficulty.
+
+   It is believed, that:
+
+   1. instead of panicking, all current transactions can be aborted rolling
+   system back to the consistent state.
+
+Umm, if you simply panic without doing anything more at all, then all current
+transactions are aborted and the system is rolled back to a consistent state,
+by virtue of the design of the transactional mechanism. Well, wait, let's be
+precise.  If an internal node is corrupted on disk due to hardware failure,
+then there may be no consistent state that can be rolled back to, so instead
+we should say that it will rollback the transactions, which barring other
+factors means rolling back to a consistent state.
+
+# Nikita: there is a subtle difference between panic and aborting
+# transactions: machine doesn't reboot. Processes aren't killed. Processes
+# not using reiser4 (not that we care about such processes), or using other
+# reiser4 mounts (about them we do care) will simply continue to run. With
+# some luck, even application using aborted file system can survive: it will
+# get some error, like EBADF, from each file descriptor on failed file system,
+# but applications that do care about tolerance will cope with this (squid
+# will).
+
+It would be a nice feature though to support rollback without rebooting
+followed by remount, but this can wait for later versions.
+
+
+   2. once isolated transactions will be implemented it will be possible to
+   roll back offending transaction.
+
+2. is additional code complexity of inconsistent value (it implies that a broken tree should be kept in operation), so we must think about
+it more before deciding if it should be done.  -Hans
+
+*/
+static void
+fatal_carry_error(carry_level * doing UNUSED_ARG	/* carry level
+							 * where
+							 * unrecoverable
+							 * error
+							 * occurred */ ,
+		  int ecode /* error code */ )
+{
+	assert("nikita-1230", doing != NULL);
+	assert("nikita-1231", ecode < 0);
+
+	reiser4_panic("nikita-1232", "Carry failed: %i", ecode);
+}
+
+/* add new root to the tree
+
+   This function itself only manages changes in carry structures and delegates
+   all hard work (allocation of znode for new root, changes of parent and
+   sibling pointers) to the add_tree_root().
+
+   Locking: old tree root is locked by carry at this point. Fake znode is also
+   locked.
+   Returns 0 on success, error code otherwise.
+*/
+static int
+add_new_root(carry_level * level	/* carry level in context of which
+					 * operation is performed */ ,
+	     carry_node * node /* carry node for existing root */ ,
+	     znode * fake	/* "fake" znode already locked by
+				 * us */ )
+{
+	int result;
+
+	assert("nikita-1104", level != NULL);
+	assert("nikita-1105", node != NULL);
+
+	assert("nikita-1403", znode_is_write_locked(node->node));
+	assert("nikita-1404", znode_is_write_locked(fake));
+
+	/* trying to create new root. */
+	/* @node is root and it's already locked by us. This
+	   means that nobody else can be trying to add/remove
+	   tree root right now.
+	*/
+	if (level->new_root == NULL)
+		level->new_root = add_tree_root(node->node, fake);
+	if (!IS_ERR(level->new_root)) {
+		assert("nikita-1210", znode_is_root(level->new_root));
+		node->deallocate = 1;
+		result = longterm_lock_znode(&node->lock_handle, level->new_root, ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
+		if (result == 0)
+			zput(level->new_root);
+	} else {
+		result = PTR_ERR(level->new_root);
+		level->new_root = NULL;	/* do not leave an error pointer behind */
+	}
+	return result;
+}
+
+/* allocate new znode and add the operation that inserts the
+   pointer to it into the parent node into the todo level
+
+   Allocate new znode, add it into carry queue and post into @todo queue
+   request to add pointer to new node into its parent.
+
+   This is carry related routine that calls new_node() to allocate new
+   node. Returns carry node of the new znode, or an error pointer.
+*/
+reiser4_internal carry_node *
+add_new_znode(znode * brother	/* existing left neighbor of new
+				 * node */ ,
+	      carry_node * ref	/* carry node after which new
+				 * carry node is to be inserted
+				 * into queue. This affects
+				 * locking. */ ,
+	      carry_level * doing	/* carry queue where new node is
+					 * to be added */ ,
+	      carry_level * todo	/* carry queue where COP_INSERT
+					 * operation to add pointer to
+					 * new node will be added */ )
+{
+	carry_node *fresh;
+	znode *new_znode;
+	carry_op *add_pointer;
+	carry_plugin_info info;
+
+	assert("nikita-1048", brother != NULL);
+	assert("nikita-1049", todo != NULL);
+
+	/* There is a lot of possible variations here: to what parent
+	   new node will be attached and where. For simplicity, always
+	   do the following:
+
+	   (1) new node and @brother will have the same parent.
+
+	   (2) new node is added on the right of @brother
+
+	*/
+
+	fresh = add_carry_skip(doing, ref ? POOLO_AFTER : POOLO_LAST, ref);
+	if (IS_ERR(fresh))
+		return fresh;
+
+	fresh->deallocate = 1;
+	fresh->free = 1;
+
+	new_znode = new_node(brother, znode_get_level(brother));
+	if (IS_ERR(new_znode))
+		/* @fresh will be deallocated automatically by error
+		   handling code in the caller. */
+		return (carry_node *) new_znode;
+
+	/* new_node() returned znode with x_count 1. Caller has to decrease
+	   it. make_space() does. */
+
+	ZF_SET(new_znode, JNODE_ORPHAN);
+	fresh->node = new_znode;
+	/* NOTE(review): @ref is NULL-tested above but used unchecked below */
+	while (ZF_ISSET(carry_real(ref), JNODE_ORPHAN)) {
+		ref = carry_node_prev(ref);
+		assert("nikita-1606", !carry_node_end(doing, ref));
+	}
+
+	info.todo = todo;
+	info.doing = doing;
+	add_pointer = node_post_carry(&info, COP_INSERT, carry_real(ref), 1);
+	if (IS_ERR(add_pointer)) {
+		/* no need to deallocate @new_znode here: it will be
+		   deallocated during carry error handling. */
+		return (carry_node *) add_pointer;
+	}
+
+	add_pointer->u.insert.type = COPT_CHILD;
+	add_pointer->u.insert.child = fresh;
+	add_pointer->u.insert.brother = brother;
+	/* initially new node spans empty key range */
+	WLOCK_DK(znode_get_tree(brother));
+	znode_set_ld_key(new_znode,
+			 znode_set_rd_key(new_znode, znode_get_rd_key(brother)));
+	WUNLOCK_DK(znode_get_tree(brother));
+	return fresh;
+}
+
+/*
+ * Sum, over all operations queued in @level, the per-operation ->estimate()
+ * of pages of memory that have to be reserved to execute them.
+ */
+static int carry_estimate_reserve(carry_level * level)
+{
+	int total = 0;
+	carry_op *cur;
+	carry_op *next;
+
+	for_all_ops(level, cur, next) {
+		total += op_dispatch_table[cur->op].estimate(cur, level);
+	}
+	return total;
+}
+
+/* DEBUGGING FUNCTIONS.
+
+   Probably we also should leave them on even when
+   debugging is turned off to print dumps at errors.
+*/
+#if REISER4_DEBUG
+/* debugging check of @level: returns 1 if it looks sane, 0 otherwise */
+static int
+carry_level_invariant(carry_level * level, carry_queue_state state)
+{
+	carry_node *cur;
+	carry_node *next;
+
+	if (level == NULL)
+		return 0;
+
+	if (!(level->track_type == 0 ||
+	      level->track_type == CARRY_TRACK_NODE ||
+	      level->track_type == CARRY_TRACK_CHANGE))
+		return 0;
+
+	/* check that nodes are in ascending order */
+	for_all_nodes(level, cur, next) {
+		carry_node *prev;
+		znode *lhs;
+		znode *rhs;
+		reiser4_key lhs_key;
+		reiser4_key rhs_key;
+
+		/* front node has no predecessor to be compared with */
+		if (cur == carry_node_front(level))
+			continue;
+		prev = carry_node_prev(cur);
+		/* CARRY_TODO: compare base nodes; otherwise real nodes */
+		rhs = (state == CARRY_TODO) ? cur->node : carry_real(cur);
+		lhs = (state == CARRY_TODO) ? prev->node : carry_real(prev);
+		if (rhs == NULL || lhs == NULL)
+			continue;
+		if (node_is_empty(rhs) || node_is_empty(lhs))
+			continue;
+		if (keyle(leftmost_key_in_node(lhs, &lhs_key),
+			  leftmost_key_in_node(rhs, &rhs_key)))
+			continue;
+		/* order is violated: dump both nodes and fail the check */
+		print_znode("left", lhs);
+		print_node_content("left", lhs, ~0);
+		print_znode("right", rhs);
+		print_node_content("right", rhs, ~0);
+		return 0;
+	}
+	return 1;
+}
+#endif
+
+#if REISER4_DEBUG_OUTPUT
+/* one-letter rendering of a truth value for debugging output */
+static const char *tf(int boolean /* truth value */ )
+{
+	static const char *const letter[2] = { "f", "t" };
+	return letter[boolean != 0];
+}
+
+/* symbolic name for carry operation; unknown opcodes are formatted in hex */
+static const char *
+carry_op_name(carry_opcode op /* carry opcode */ )
+{
+	switch (op) {
+	case COP_INSERT:
+		return "COP_INSERT";
+	case COP_DELETE:
+		return "COP_DELETE";
+	case COP_CUT:
+		return "COP_CUT";
+	case COP_PASTE:
+		return "COP_PASTE";
+	case COP_UPDATE:
+		return "COP_UPDATE";
+	case COP_EXTENT:
+		return "COP_EXTENT";
+	case COP_INSERT_FLOW:
+		return "COP_INSERT_FLOW";
+	default:{
+			/* not mt safe (shared static buffer), but who cares? */
+			static char buf[sizeof "unknown op: " + 8];
+			/* prefix plus up to 8 hex digits and NUL: 21 bytes */
+			snprintf(buf, sizeof buf, "unknown op: %x", (unsigned) op);
+			return buf;
+		}
+	}
+}
+
+/* debugging dump of @node: its flags, base znode and real znode */
+reiser4_internal void
+print_carry(const char *prefix /* prefix to print */ , carry_node * node /* node to print */ )
+{
+	if (node != NULL) {
+		printk("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n",
+		       prefix, node, tf(node->parent), tf(node->left),
+		       tf(node->unlock), tf(node->free), tf(node->deallocate));
+		print_znode("\tnode", node->node);
+		print_znode("\treal_node", carry_real(node));
+	} else {
+		printk("%s: null\n", prefix);
+	}
+}
+
+/* dump information about carry operation: opcode, node, op-specific data */
+reiser4_internal void
+print_op(const char *prefix /* prefix to print */ ,
+	 carry_op * op /* operation to print */ )
+{
+	if (op == NULL) {
+		printk("%s: null\n", prefix);
+		return;
+	}
+	printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op));
+	print_carry("\tnode", op->node);
+	switch (op->op) {
+	case COP_INSERT:
+	case COP_PASTE:
+		print_coord("\tcoord", op->u.insert.d ? op->u.insert.d->coord : NULL, 0);
+		print_key("\tkey", op->u.insert.d ? op->u.insert.d->key : NULL);
+		print_carry("\tchild", op->u.insert.child);
+		break;
+	case COP_DELETE:
+		print_carry("\tchild", op->u.delete.child);
+		break;
+	case COP_CUT:
+		if (op->u.cut_or_kill.is_cut) {	/* ->u.cut is the live member */
+			print_coord("\tfrom", op->u.cut_or_kill.u.cut->params.from, 0);
+			print_coord("\tto", op->u.cut_or_kill.u.cut->params.to, 0);
+		} else {	/* kill: ->u.kill is the live member */
+			print_coord("\tfrom", op->u.cut_or_kill.u.kill->params.from, 0);
+			print_coord("\tto", op->u.cut_or_kill.u.kill->params.to, 0);
+		}
+		break;
+	case COP_UPDATE:
+		print_carry("\tleft", op->u.update.left);
+		break;
+	default:
+		/* do nothing */
+		break;
+	}
+}
+
+/* debugging dump of @level: header line, then each carry node and each op */
+reiser4_internal void
+print_level(const char *prefix /* prefix to print */ ,
+	    carry_level * level /* level to print */ )
+{
+	carry_node *cn, *cn_next;
+	carry_op *cop, *cop_next;
+
+	if (level == NULL) {
+		printk("%s: null\n", prefix);
+		return;
+	}
+
+	printk("%s: %p, restartable: %s\n", prefix, level, tf(level->restartable));
+
+	for_all_nodes(level, cn, cn_next) {
+		print_carry("\tcarry node", cn);
+	}
+	for_all_ops(level, cop, cop_next) {
+		print_op("\tcarry op", cop);
+	}
+}
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
Index: linux-2.6.8.1-ck/fs/reiser4/carry.h
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/carry.h	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/carry.h	2004-08-22 19:35:33.613653972 +1000
@@ -0,0 +1,439 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Functions and data types to "carry" tree modification(s) upward.
+   See fs/reiser4/carry.c for details. */
+
+#if !defined( __FS_REISER4_CARRY_H__ )
+#define __FS_REISER4_CARRY_H__
+
+#include "forward.h"
+#include "debug.h"
+#include "pool.h"
+#include "znode.h"
+
+#include <linux/types.h>
+
+/* &carry_node - "location" of carry node.
+
+   "location" of node that is involved or going to be involved into
+   carry process. Node where operation will be carried to on the
+   parent level cannot be recorded explicitly. Operation will be carried
+   usually to the parent of some node (where changes are performed at
+   the current level) or, to the left neighbor of its parent. But while
+   modifications are performed at the current level, parent may
+   change. So, we have to allow some indirection (or, positively,
+   flexibility) in locating carry nodes.
+
+*/
+typedef struct carry_node {
+	/* pool linkage */
+	reiser4_pool_header header;
+
+	/* base node from which the real node is calculated. See
+	    fs/reiser4/carry.c:lock_carry_node(). */
+	znode *node;
+
+	/* flags telling lock_carry_node() how to reach the real node */
+	/* to get the real node obtain parent of ->node */
+	__u32 parent:1;
+	/* to get the real node obtain left neighbor of parent of
+	    ->node */
+	__u32 left:1;
+	__u32 left_before:1;	/* start from find_begetting_brother() result */
+
+	/* locking */
+
+	/* this node was locked by carry process and should be
+	    unlocked when carry leaves a level */
+	__u32 unlock:1;
+
+	/* disk block for this node was allocated by carry process and
+	    should be deallocated when carry leaves a level */
+	__u32 deallocate:1;
+	/* this carry node was allocated by carry process and should be
+	    freed when carry leaves a level */
+	__u32 free:1;
+
+	/* handle of the long term lock carry takes on the real node */
+	lock_handle lock_handle;
+} carry_node;
+
+/* &carry_opcode - elementary operations that can be carried upward
+
+   Operations that carry() can handle. This list is supposed to be
+   expanded.
+
+   Each carry operation (cop) is handled by appropriate function defined
+   in fs/reiser4/carry.c. For example COP_INSERT is handled by
+   fs/reiser4/carry.c:carry_insert() etc. These functions in turn
+   call plugins of nodes affected by operation to modify nodes' content
+   and to gather operations to be performed on the next level.
+   The values index op_dispatch_table[] (see carry_estimate_reserve()).
+*/
+typedef enum {
+	/* insert new item into node. */
+	COP_INSERT,
+	/* delete pointer from parent node */
+	COP_DELETE,
+	/* remove part of or whole node. */
+	COP_CUT,
+	/* increase size of item. */
+	COP_PASTE,
+	/* insert extent (that is sequence of unformatted nodes). */
+	COP_EXTENT,
+	/* update delimiting key in least common ancestor of two
+	   nodes. This is performed when items are moved between two
+	   nodes.
+	*/
+	COP_UPDATE,
+	/* insert flow */
+	COP_INSERT_FLOW,
+	COP_LAST_OP,	/* presumably the number of opcodes, not an op; confirm */
+} carry_opcode;
+
+#define CARRY_FLOW_NEW_NODES_LIMIT 20	/* cap on new nodes per flow insertion; see carry_op ->u.insert_flow.new_nodes */
+
+/* mode (or subtype) of COP_{INSERT|PASTE} operation. Specifies how target
+   item is determined. Kept in the ->type field of carry_op ->u.insert. */
+typedef enum {
+	/* target item is one containing pointer to the ->child node */
+	COPT_CHILD,
+	/* target item is given explicitly by coord */
+	COPT_ITEM_DATA,
+	/* target item is given by key */
+	COPT_KEY,
+	/* see insert_paste_common() for more comments on this. */
+	COPT_PASTE_RESTARTED,
+} cop_insert_pos_type;
+
+/* flags to cut and delete (see carry_op ->u.delete.flags, carry_kill_data) */
+typedef enum {
+	/* don't kill node even if it became completely empty as a result of
+	 * cut. This is needed for eottl handling. See carry_extent() for
+	 * details. */
+	DELETE_RETAIN_EMPTY = (1 << 0)
+} cop_delete_flag;
+
+/*
+ * carry() implements "lock handle tracking" feature.
+ *
+ * Callers supply carry with node where to perform initial operation and lock
+ * handle on this node. Trying to optimize node utilization carry may actually
+ * move insertion point to different node. Callers expect that lock handle
+ * will be transferred to the new node also.
+ *
+ */
+typedef enum {
+	/* transfer lock handle along with insertion point */
+	CARRY_TRACK_CHANGE = 1,
+	/* acquire new lock handle to the node where insertion point is. This
+	 * is used when carry() client doesn't initially possess lock handle
+	 * on the insertion point node, for example, by extent insertion
+	 * code. See carry_extent(). */
+	CARRY_TRACK_NODE   = 2
+} carry_track_type;
+
+/* data supplied to COP_{INSERT|PASTE} by callers (carry_op ->u.insert.d) */
+typedef struct carry_insert_data {
+	/* position where new item is to be inserted */
+	coord_t *coord;
+	/* new item description */
+	reiser4_item_data *data;
+	/* key of new item */
+	const reiser4_key *key;
+} carry_insert_data;
+
+/* cut and kill are similar, so carry_cut_data and carry_kill_data share the below structure of parameters */
+struct cut_kill_params {
+	/* coord where cut starts (inclusive) */
+	coord_t *from;
+	/* coord where cut stops (inclusive, this item/unit will also be
+	 * cut) */
+	coord_t *to;
+	/* starting key. This is necessary when item and unit pos don't
+	 * uniquely identify what portion of tree to remove. For example, this
+	 * indicates what portion of extent unit will be affected. */
+	const reiser4_key *from_key;
+	/* exclusive stop key */
+	const reiser4_key *to_key;
+	/* if this is not NULL, smallest actually removed key is stored
+	 * here. */
+	reiser4_key *smallest_removed;
+};
+
+struct carry_cut_data {	/* data for cut: only the shared parameters */
+	struct cut_kill_params params;
+};
+
+struct carry_kill_data {	/* data for kill: shared parameters plus extras */
+	struct cut_kill_params params;
+	/* parameter to be passed to the ->kill_hook() method of item
+	 * plugin */
+	/*void *iplug_params;*/ /* FIXME: unused currently */
+	/* if not NULL---inode whose items are being removed. This is needed
+	 * for ->kill_hook() of extent item to update VM structures when
+	 * removing pages. */
+	struct inode *inode;
+	/* sibling list maintenance is complicated by existence of eottl. When
+	 * eottl whose left and right neighbors are formatted leaves is
+	 * removed, one has to connect said leaves in the sibling list. This
+	 * cannot be done when extent removal is just started as locking rules
+	 * require sibling list update to happen atomically with removal of
+	 * extent item. Therefore: 1. pointers to left and right neighbors
+	 * have to be passed down to the ->kill_hook() of extent item, and
+	 * 2. said neighbors have to be locked. */
+	lock_handle *left;
+	lock_handle *right;
+	/* flags modifying behavior of kill. Currently, it may have DELETE_RETAIN_EMPTY set. */
+	unsigned flags;
+};
+
+/* &carry_op - operation to "carry" upward.
+
+   Description of an operation we want to "carry" to the upper level of
+   a tree: e.g, when we insert something and there is not enough space
+   we allocate a new node and "carry" the operation of inserting a
+   pointer to the new node to the upper level, on removal of empty node,
+   we carry up operation of removing appropriate entry from parent.
+
+   There are two types of carry ops: when adding or deleting a node, the
+   node at the parent level where appropriate modification has to be
+   performed is known in advance. When shifting items between nodes
+   (split, merge), delimiting key should be changed in the least common
+   parent of the nodes involved that is not known in advance.
+
+   For the operations of the first type we store in &carry_op pointer to
+   the &carry_node at the parent level. For the operation of the second
+   type we store &carry_node of parents of the left and right nodes
+   modified and keep track of them upward until they coincide.
+
+*/
+typedef struct carry_op {
+	/* pool linkage */
+	reiser4_pool_header header;
+	carry_opcode op;
+	/* node on which operation is to be performed:
+
+	   for insert, paste: node where new item is to be inserted
+
+	   for delete: node where pointer is to be deleted
+
+	   for cut: node to cut from
+
+	   for update: node where delimiting key is to be modified
+
+	   for modify: parent of modified node
+
+	*/
+	carry_node *node;
+	union {
+		struct {
+			/* (sub-)type of insertion/paste. Taken from
+			   cop_insert_pos_type. */
+			__u8 type;
+			/* various operation flags. Taken from
+			   cop_insert_flag. */
+			__u8 flags;
+			carry_insert_data *d;
+			carry_node *child;
+			znode *brother;
+		} insert, paste, extent;
+
+		struct {
+			int is_cut;	/* selects ->u.cut over ->u.kill */
+			union {
+				carry_kill_data *kill;
+				carry_cut_data *cut;
+			} u;
+		} cut_or_kill;
+
+		struct {
+			carry_node *left;
+		} update;
+		struct {
+			/* changed child */
+			carry_node *child;
+			/* bitmask of changes. See &cop_modify_flag */
+			__u32 flag;
+		} modify;
+		struct {
+			/* flags to deletion operation. Are taken from
+			   cop_delete_flag */
+			__u32 flags;
+			/* child to delete from parent. If this is
+			   NULL, delete op->node.  */
+			carry_node *child;
+		} delete;
+		struct {
+			/* various operation flags. Taken from
+			   cop_insert_flag. */
+			__u32 flags;
+			flow_t *flow;
+			coord_t *insert_point;
+			reiser4_item_data *data;
+			/* flow insertion is limited by number of new blocks
+			   added in that operation which do not get any data
+			   but part of flow. This limit is set by macro
+			   CARRY_FLOW_NEW_NODES_LIMIT. This field stores number
+			   of nodes added already during one carry_flow */
+			int new_nodes;
+		} insert_flow;
+	} u;
+} carry_op;
+
+/* &carry_pool - preallocated pool of carry operations, and nodes */
+typedef struct carry_pool {
+	carry_op op[CARRIES_POOL_SIZE];	/* storage for carry operations */
+	reiser4_pool op_pool;	/* pool that ops are freed back to */
+	carry_node node[NODES_LOCKED_POOL_SIZE];	/* storage for carry nodes */
+	reiser4_pool node_pool;	/* pool that carry nodes are freed back to */
+} carry_pool;
+
+/* &carry_level - carry process on given level
+
+   Description of balancing process on the given level.
+
+   No need for locking here, as carry_level is essentially per
+   thread thing (for now).
+
+*/
+struct carry_level {
+	/* this level may be restarted */
+	__u32 restartable:1;
+	/* list of carry nodes on this level, ordered by key order */
+	pool_level_list_head nodes;
+	pool_level_list_head ops;	/* list of carry operations on this level */
+	/* pool where new objects are allocated from */
+	carry_pool *pool;
+	int ops_num;	/* presumably length of ->ops; confirm */
+	int nodes_num;	/* presumably length of ->nodes; confirm */
+	/* new root created on this level, if any */
+	znode *new_root;
+	/* This is set by caller (insert_by_key(), resize_item(), etc.) when
+	   they want ->tracked to automagically wander to the node where
+	   insertion point moved after insert or paste.
+	*/
+	carry_track_type track_type;
+	/* lock handle supplied by user that we are tracking. See
+	   above. */
+	lock_handle *tracked;
+#if REISER4_STATS
+	tree_level level_no;
+#endif
+};
+
+/* information carry passes to plugin methods that may add new operations to
+   the @todo queue  */
+struct carry_plugin_info {
+	carry_level *doing;	/* NOTE(review): presumably the level being processed right now; confirm */
+	carry_level *todo;	/* queue that plugin methods may add new operations to */
+};
+
+int carry(carry_level * doing, carry_level * done);
+/* NOTE(review): judging by add_carry_skip() callers, these queue a new object in @level relative to @reference */
+carry_node *add_carry(carry_level * level, pool_ordering order, carry_node * reference);
+carry_node *add_carry_skip(carry_level * level, pool_ordering order, carry_node * reference);
+carry_op *add_op(carry_level * level, pool_ordering order, carry_op * reference);
+
+extern carry_node *insert_carry_node(carry_level * doing,
+				     carry_level * todo, const znode * node);
+
+extern carry_node *add_carry_atplace(carry_level *doing,
+				     carry_level *todo, znode *node);
+
+extern carry_node *find_begetting_brother(carry_node * node, carry_level * kin);
+
+extern void init_carry_pool(carry_pool * pool);
+extern void done_carry_pool(carry_pool * pool);
+
+extern void init_carry_level(carry_level * level, carry_pool * pool);
+
+extern carry_op *post_carry(carry_level * level, carry_opcode op, znode * node, int apply_to_parent);
+extern carry_op *node_post_carry(carry_plugin_info * info, carry_opcode op, znode * node, int apply_to_parent_p);
+
+extern int carry_op_num(const carry_level * level);
+/* allocate new node to the right of @brother; result has to be checked with IS_ERR() */
+carry_node *add_new_znode(znode * brother, carry_node * reference, carry_level * doing, carry_level * todo);
+/* carry node for @node in @level, NULL when @node is not queued there; @node must not be NULL */
+carry_node *find_carry_node(carry_level * level, const znode * node);
+/* NOTE(review): presumably the znode that @node stands for; definition is not visible here */
+extern znode *carry_real(const carry_node * node);
+
+/* helper macros to iterate over carry queues */
+/* carry node that follows @node in its queue */
+#define carry_node_next( node ) 					\
+	( ( carry_node * ) pool_level_list_next( &( node ) -> header ) )
+/* carry node that precedes @node in its queue */
+#define carry_node_prev( node ) 					\
+	( ( carry_node * ) pool_level_list_prev( &( node ) -> header ) )
+/* carry node at the front of @level's node list */
+#define carry_node_front( level )					\
+	( ( carry_node * ) pool_level_list_front( &( level ) -> nodes ) )
+/* carry node at the back of @level's node list */
+#define carry_node_back( level )					\
+	( ( carry_node * ) pool_level_list_back( &( level ) -> nodes ) )
+/* non-zero once @node has run off @level's node list (loop terminator of for_all_nodes()) */
+#define carry_node_end( level, node ) 					\
+	( pool_level_list_end( &( level ) -> nodes, &( node ) -> header ) )
+
+/* iterate over all operations in a @level; every argument is evaluated more than once */
+#define for_all_ops( level /* carry level (of type carry_level *) */, 		\
+		     op    /* pointer to carry operation, modified by loop (of	\
+			    * type carry_op *) */, 				\
+		     tmp   /* pointer to carry operation (of type carry_op *),	\
+			    * used to make iterator stable in the face of	\
+			    * deletions from the level */ )			\
+for( op = ( carry_op * ) pool_level_list_front( &( level ) -> ops ),		\
+     tmp = ( carry_op * ) pool_level_list_next( &( op ) -> header ) ;		\
+     ! pool_level_list_end( &( level ) -> ops, &( op ) -> header ) ;		\
+     op = tmp, tmp = ( carry_op * ) pool_level_list_next( &( op ) -> header ) )
+
+/* macro to iterate over all nodes in a @level, front to back */
+#define for_all_nodes( level /* carry level (of type carry_level *) */,		\
+		       node  /* pointer to carry node, modified by loop (of	\
+			      * type carry_node *) */,				\
+		       tmp   /* pointer to carry node (of type carry_node *),	\
+			      * used to make iterator stable in the face of	\
+			      * deletions from the level */ )			\
+for( node = carry_node_front( level ),						\
+     tmp = carry_node_next( node ) ; ! carry_node_end( level, node ) ;		\
+     node = tmp, tmp = carry_node_next( node ) )
+
+/* macro to iterate over all nodes in a @level in reverse order, back to front
+
+   This is used because nodes are unlocked in the reverse order of locking */
+#define for_all_nodes_back( level /* carry level (of type carry_level *) */,	\
+		            node  /* pointer to carry node, modified by loop	\
+				   * (of type carry_node *) */,			\
+		            tmp   /* pointer to carry node (of type carry_node	\
+				   * *), used to make iterator stable in the	\
+				   * face of deletions from the level */ )	\
+for( node = carry_node_back( level ),		\
+     tmp = carry_node_prev( node ) ; ! carry_node_end( level, node ) ;		\
+     node = tmp, tmp = carry_node_prev( node ) )
+
+/* debugging functions: declared under REISER4_DEBUG_OUTPUT, defined to noop otherwise */
+
+#if REISER4_DEBUG_OUTPUT
+extern void print_carry(const char *prefix, carry_node * node);
+extern void print_op(const char *prefix, carry_op * op);
+extern void print_level(const char *prefix, carry_level * level);
+#else
+#define print_carry( p, n ) noop
+#define print_op( p, o ) noop
+#define print_level( p, l ) noop
+#endif
+
+/* __FS_REISER4_CARRY_H__ */
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
Index: linux-2.6.8.1-ck/fs/reiser4/carry_ops.c
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/carry_ops.c	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/carry_ops.c	2004-08-22 19:35:33.616653494 +1000
@@ -0,0 +1,2171 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* implementation of carry operations */
+
+#include "forward.h"
+#include "debug.h"
+#include "key.h"
+#include "coord.h"
+#include "plugin/item/item.h"
+#include "plugin/node/node.h"
+#include "jnode.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree_walk.h"
+#include "pool.h"
+#include "tree_mod.h"
+#include "carry.h"
+#include "carry_ops.h"
+#include "tree.h"
+#include "super.h"
+#include "reiser4.h"
+
+#include <linux/types.h>
+#include <linux/err.h>
+
+static int carry_shift_data(sideof side, coord_t * insert_coord, znode * node,
+			    carry_level * doing, carry_level * todo, unsigned int including_insert_coord_p);
+
+extern int lock_carry_node(carry_level * level, carry_node * node);
+extern int lock_carry_node_tail(carry_node * node);
+
+/* find left neighbor of a carry node
+
+   Look for left neighbor of @op->node and add it to the @doing queue. Returns
+   the neighbor's carry node, NULL when there is no neighbor to use, or
+   ERR_PTR(): -E_REPEAT asks for restart of the level. See comments in the body.
+*/
+static carry_node *
+find_left_neighbor(carry_op * op	/* operation whose ->node left
+					 * neighbor is looked for */ ,
+		   carry_level * doing /* level to scan */ )
+{
+	int result;
+	carry_node *node;
+	carry_node *left;
+	int flags;
+	reiser4_tree *tree;
+
+	node = op->node;
+
+	tree = current_tree;
+	RLOCK_TREE(tree);
+	/* first, check whether left neighbor is already in a @doing queue */
+	if (carry_real(node)->left != NULL) {
+		/* NOTE: there is locking subtlety here. Look into
+		 * find_right_neighbor() for more info */
+		if (find_carry_node(doing, carry_real(node)->left) != NULL) {
+			RUNLOCK_TREE(tree);
+			left = node;
+			do {
+				left = carry_node_prev(left);
+				assert("nikita-3408", !carry_node_end(doing,
+								      left));
+			} while (carry_real(left) == carry_real(node));
+			reiser4_stat_level_inc(doing, carry_left_in_carry);
+			return left;
+		}
+	}
+	RUNLOCK_TREE(tree);
+
+	left = add_carry_skip(doing, POOLO_BEFORE, node);
+	if (IS_ERR(left))
+		return left;
+
+	left->node = node->node;
+	left->free = 1;
+	/* add GN_NO_ALLOC unless COPI_LOAD_LEFT is set; the mask test needs its own parentheses */
+	flags = GN_TRY_LOCK;
+	if (!(op->u.insert.flags & COPI_LOAD_LEFT))
+		flags |= GN_NO_ALLOC;
+
+	/* then, feeling lucky, peek left neighbor in the cache. */
+	result = reiser4_get_left_neighbor(&left->lock_handle, carry_real(node),
+					   ZNODE_WRITE_LOCK, flags);
+	if (result == 0) {
+		/* ok, node found and locked. */
+		result = lock_carry_node_tail(left);
+		if (result != 0)
+			left = ERR_PTR(result);
+		reiser4_stat_level_inc(doing, carry_left_in_cache);
+	} else if (result == -E_NO_NEIGHBOR || result == -ENOENT) {
+		/* node is leftmost node in a tree, or neighbor wasn't in
+		   cache, or there is an extent on the left. */
+		if (REISER4_STATS && (result == -ENOENT))
+			reiser4_stat_level_inc(doing, carry_left_missed);
+		if (REISER4_STATS && (result == -E_NO_NEIGHBOR))
+			reiser4_stat_level_inc(doing, carry_left_not_avail);
+		reiser4_pool_free(&doing->pool->node_pool, &left->header);
+		left = NULL;
+	} else if (doing->restartable) {
+		/* if left neighbor is locked, and level is restartable, add
+		   new node to @doing and restart. */
+		assert("nikita-913", node->parent != 0);
+		assert("nikita-914", node->node != NULL);
+		left->left = 1;
+		left->free = 0;
+		left = ERR_PTR(-E_REPEAT);
+	} else {
+		/* left neighbor is locked, level cannot be restarted. Just
+		   ignore left neighbor. */
+		reiser4_pool_free(&doing->pool->node_pool, &left->header);
+		left = NULL;
+		reiser4_stat_level_inc(doing, carry_left_refuse);
+	}
+	return left;
+}
+
+/* find right neighbor of a carry node
+
+   Look for right neighbor of @op->node and add it to the @doing queue. Returns
+   the neighbor's carry node, NULL when there is no neighbor to use, or
+   ERR_PTR() on any other failure. See comments in the body.
+*/
+static carry_node *
+find_right_neighbor(carry_op * op	/* operation whose ->node right
+					 * neighbor is looked for */ ,
+		    carry_level * doing /* level to scan */ )
+{
+	int result;
+	carry_node *node;
+	carry_node *right;
+	lock_handle lh;
+	int flags;
+	reiser4_tree *tree;
+
+	init_lh(&lh);
+
+	node = op->node;
+
+	tree = current_tree;
+	RLOCK_TREE(tree);
+	/* first, check whether right neighbor is already in a @doing queue */
+	if (carry_real(node)->right != NULL) {
+		/*
+		 * Tree lock is taken here anyway, because, even if _outcome_
+		 * of (find_carry_node() != NULL) doesn't depend on
+		 * concurrent updates to ->right, find_carry_node() cannot
+		 * work with second argument NULL. Hence, following comment is
+		 * of historic importance only.
+		 *
+		 * Subtle:
+		 *
+		 * Q: why don't we need tree lock here, looking for the right
+		 * neighbor?
+		 *
+		 * A: even if value of node->real_node->right were changed
+		 * during find_carry_node() execution, outcome of execution
+		 * wouldn't change, because (in short) other thread cannot add
+		 * elements to the @doing, and if node->real_node->right
+		 * already was in @doing, value of node->real_node->right
+		 * couldn't change, because node cannot be inserted between
+		 * locked neighbors.
+		 */
+		if (find_carry_node(doing, carry_real(node)->right) != NULL) {
+			RUNLOCK_TREE(tree);
+			/*
+			 * What we are doing here (this is also applicable to
+			 * the find_left_neighbor()).
+			 *
+			 * tree_walk.c code requires that insertion of a
+			 * pointer to a child, modification of parent pointer
+			 * in the child, and insertion of the child into
+			 * sibling list are atomic (see
+			 * plugin/item/internal.c:create_hook_internal()).
+			 *
+			 * carry allocates new node long before pointer to it
+			 * is inserted into parent and, actually, long before
+			 * parent is even known. Such allocated-but-orphaned
+			 * nodes are only trackable through carry level lists.
+			 *
+			 * Situation that is handled here is following: @node
+			 * has valid ->right pointer, but there is
+			 * allocated-but-orphaned node in the carry queue that
+			 * is logically between @node and @node->right. Here
+			 * we are searching for it. Critical point is that
+			 * this is only possible if @node->right is also in
+			 * the carry queue (this is checked above), because
+			 * this is the only way new orphaned node could be
+			 * inserted between them (before inserting new node,
+			 * make_space() first tries to shift to the right, so,
+			 * right neighbor will be locked and queued).
+			 *
+			 */
+			right = node;
+			do {
+				right = carry_node_next(right);
+				assert("nikita-3408", !carry_node_end(doing,
+								      right));
+			} while (carry_real(right) == carry_real(node));
+			reiser4_stat_level_inc(doing, carry_right_in_carry);
+			return right;
+		}
+	}
+	RUNLOCK_TREE(tree);
+	/* use GN_NO_ALLOC alone unless COPI_LOAD_RIGHT is set; the mask test needs its own parentheses */
+	flags = GN_CAN_USE_UPPER_LEVELS;
+	if (!(op->u.insert.flags & COPI_LOAD_RIGHT))
+		flags = GN_NO_ALLOC;
+
+	/* then, try to lock right neighbor */
+	init_lh(&lh);
+	result = reiser4_get_right_neighbor(&lh, carry_real(node),
+					    ZNODE_WRITE_LOCK, flags);
+	if (result == 0) {
+		/* ok, node found and locked. */
+		reiser4_stat_level_inc(doing, carry_right_in_cache);
+		right = add_carry_skip(doing, POOLO_AFTER, node);
+		if (!IS_ERR(right)) {
+			right->node = lh.node;
+			move_lh(&right->lock_handle, &lh);
+			right->free = 1;
+			result = lock_carry_node_tail(right);
+			if (result != 0)
+				right = ERR_PTR(result);
+		}
+	} else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) {
+		/* node is rightmost node in a tree, or neighbor wasn't in
+		   cache, or there is an extent on the right. */
+		right = NULL;
+		if (REISER4_STATS && (result == -ENOENT))
+			reiser4_stat_level_inc(doing, carry_right_missed);
+		if (REISER4_STATS && (result == -E_NO_NEIGHBOR))
+			reiser4_stat_level_inc(doing, carry_right_not_avail);
+	} else
+		right = ERR_PTR(result);
+	done_lh(&lh);
+	return right;
+}
+
+/* free space that completing @op in @node requires
+
+   @op is expected to be an insert or a paste. Any other opcode is reported
+   through impossible() and then sized as if it were an insert.
+*/
+static unsigned int
+space_needed_for_op(znode * node /* znode data are inserted or pasted in */ ,
+		    carry_op * op /* carry operation */ )
+{
+	carry_insert_data *d;
+
+	assert("nikita-919", op != NULL);
+
+	d = op->u.insert.d;
+	if (op->op == COP_PASTE)
+		/* paste: coord of the operation is passed along */
+		return space_needed(node, d->coord, d->data, 0);
+	if (op->op != COP_INSERT) {
+		impossible("nikita-1701", "Wrong opcode");
+	}
+	/* insert, which is also the fallback for an unexpected opcode */
+	return space_needed(node, NULL, d->data, 1);
+}
+
+/* space in @node needed to insert or paste @data at @coord
+
+   The item plugin's ->b.estimate() is asked when it exists; @data->length is
+   used otherwise. When inserting, the node plugin's ->item_overhead(), if it
+   is provided, is added on top.
+*/
+reiser4_internal unsigned int
+space_needed(const znode * node /* node data are inserted or pasted in */ ,
+	     const coord_t * coord /* paste position; NULL when inserting */ ,
+	     const reiser4_item_data * data /* data to insert or paste */ ,
+	     int insertion /* non-0 is inserting, 0---paste */ )
+{
+	item_plugin *iplug;
+	node_plugin *nplug;
+	int result;
+
+	assert("nikita-917", node != NULL);
+	assert("nikita-918", node_plugin_by_node(node) != NULL);
+	assert("vs-230", !insertion || (coord == NULL));
+
+	iplug = data->iplug;
+	if (iplug->b.estimate == NULL)
+		/* reasonable default */
+		result = data->length;
+	else
+		/* ask item plugin how much space is needed to insert this
+		   item */
+		result = iplug->b.estimate(insertion ? NULL : coord, data);
+
+	if (!insertion)
+		/* paste: no node overhead is added */
+		return result;
+
+	/* insertion: add node overhead */
+	nplug = node->nplug;
+	if (nplug->item_overhead != NULL)
+		result += nplug->item_overhead(node, 0);
+
+	return result;
+}
+
+/* compute coord in parent at which the pointer to the new child belongs. */
+static int
+find_new_child_coord(carry_op * op /* COP_INSERT carry operation to insert
+				    * pointer to new child */ )
+{
+	carry_insert_data *d;
+	znode *node;
+	znode *child;
+	int result;
+
+	assert("nikita-941", op != NULL);
+	assert("nikita-942", op->op == COP_INSERT);
+
+	trace_stamp(TRACE_CARRY);
+
+	node = carry_real(op->node);
+	assert("nikita-943", node != NULL);
+	assert("nikita-944", node_plugin_by_node(node) != NULL);
+	d = op->u.insert.d;
+	child = carry_real(op->u.insert.child);
+	result = find_new_child_ptr(node, child, op->u.insert.brother, d->coord);
+	/* item data for the pointer are built whatever the lookup returned */
+	build_child_ptr_data(child, d->data);
+	return result;
+}
+
+/* additional amount of free space in @node required to complete @op; callers
+   treat a positive value as "not enough". COP_EXTENT yields only +1 or -1. */
+static int
+free_space_shortage(znode * node /* node to check */ ,
+		    carry_op * op /* operation being performed */ )
+{
+	assert("nikita-1061", node != NULL);
+	assert("nikita-1062", op != NULL);
+
+	if (op->op == COP_EXTENT) {
+		/* when inserting extent shift data around until insertion
+		   point is utmost in the node. */
+		if (coord_wrt(op->u.insert.d->coord) != COORD_INSIDE)
+			return -1;
+		return +1;
+	}
+
+	if (op->op != COP_INSERT && op->op != COP_PASTE) {
+		/* unexpected opcode: complain, then treat like insert/paste */
+		impossible("nikita-1702", "Wrong opcode");
+	}
+	return space_needed_for_op(node, op) - znode_free_space(node);
+}
+
+/* helper function: after a shift that may have carried the insertion point
+   into @target, make @op->node agree with the insertion coord again. */
+static znode *
+sync_op(carry_op * op, carry_node * target)
+{
+	/* reget node from coord: shift might move insertion coord to
+	   the neighbor */
+	znode *insertion_node = op->u.insert.d->coord->node;
+
+	if (insertion_node != carry_real(op->node)) {
+		/* insertion point did move: it is expected to be in @target
+		   now, so retarget the operation at it. */
+		op->node = target;
+		assert("nikita-2540", carry_real(target) == insertion_node);
+	}
+	/* either way operation node and insertion coord must match */
+	assert("nikita-2541",
+	       carry_real(op->node) == op->u.insert.d->coord->node);
+	return insertion_node;
+}
+
+/*
+ * finish make_space(): move the caller's tracked lock handle onto the node
+ * that now holds the insertion point, when tracking asks for that. See
+ * comments for fs/reiser4/carry.h:carry_track_type
+ */
+static int
+make_space_tail(carry_op * op, carry_level * doing, znode * orig_node)
+{
+	int result;
+	carry_track_type tracking;
+	znode *node;
+
+	tracking = doing->track_type;
+	node = op->u.insert.d->coord->node;
+	/* nothing to do unless node is tracked always, or on change and it changed */
+	if (tracking != CARRY_TRACK_NODE &&
+	    (tracking != CARRY_TRACK_CHANGE || node == orig_node))
+		return 0;
+	/* inserting or pasting into node different from original (or node is
+	   always tracked). Update lock handle supplied by caller. */
+	assert("nikita-1417", doing->tracked != NULL);
+	done_lh(doing->tracked);
+	init_lh(doing->tracked);
+	result = longterm_lock_znode(doing->tracked, node,
+				     ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
+	reiser4_stat_level_inc(doing, track_lh);
+	ON_TRACE(TRACE_CARRY, "tracking: %i: %p -> %p\n",
+		 tracking, orig_node, node);
+	return result;
+}
+
+/* This is insertion policy function. It shifts data to the left and right
+   neighbors of insertion coord and allocates new nodes until there is enough
+   free space to complete @op.
+
+   Returns 0 on success, -E_REPEAT when the level has to be restarted to lock
+   the left neighbor, -E_NODE_FULL when space could not be found, or an error
+   from shifting, allocating or locking.
+
+   Assumes that the node format favors insertions at the right end of the node
+   as node40 does. See carry_flow() for details about flow insertion.
+*/
+static int
+make_space(carry_op * op /* carry operation, insert or paste */ ,
+	   carry_level * doing /* current carry queue */ ,
+	   carry_level * todo /* carry queue on the parent level */ )
+{
+	znode *node;
+	int result;
+	int not_enough_space;
+	int blk_alloc;
+	znode *orig_node;
+	__u32 flags;
+
+	coord_t *coord;
+
+	assert("nikita-890", op != NULL);
+	assert("nikita-891", todo != NULL);
+	assert("nikita-892",
+	       op->op == COP_INSERT ||
+	       op->op == COP_PASTE || op->op == COP_EXTENT);
+	assert("nikita-1607",
+	       carry_real(op->node) == op->u.insert.d->coord->node);
+
+	trace_stamp(TRACE_CARRY);
+
+	flags = op->u.insert.flags;
+
+	/* NOTE check that new node can only be allocated after checking left
+	 * and right neighbors. This is necessary for proper work of
+	 * find_{left,right}_neighbor(). */
+	assert("nikita-3410", ergo(flags & COPI_DONT_ALLOCATE,
+				   flags & COPI_DONT_SHIFT_LEFT));
+	assert("nikita-3411", ergo(flags & COPI_DONT_ALLOCATE,
+				   flags & COPI_DONT_SHIFT_RIGHT));
+
+	coord = op->u.insert.d->coord;
+	orig_node = node = coord->node;
+
+	assert("nikita-908", node != NULL);
+	assert("nikita-909", node_plugin_by_node(node) != NULL);
+
+	result = 0;
+	/* If there is not enough space in a node, try to shift something to
+	   the left neighbor. This is a bit tricky, as locking to the left is
+	   low priority. This is handled by restart logic in carry().
+	*/
+	not_enough_space = free_space_shortage(node, op);
+	if (not_enough_space <= 0)
+		/* it is possible that carry was called when there actually
+		   was enough space in the node. For example, when inserting
+		   leftmost item so that delimiting keys have to be updated.
+		*/
+		return make_space_tail(op, doing, orig_node);
+	if (!(flags & COPI_DONT_SHIFT_LEFT)) {
+		carry_node *left;
+		/* make note in statistics of an attempt to move
+		   something into the left neighbor */
+		reiser4_stat_level_inc(doing, insert_looking_left);
+		left = find_left_neighbor(op, doing);
+		if (unlikely(IS_ERR(left))) {
+			if (PTR_ERR(left) == -E_REPEAT)
+				return -E_REPEAT;
+			else {
+				/* some error other than restart request
+				   occurred. This shouldn't happen. Issue a
+				   warning and continue as if left neighbor
+				   didn't exist.
+				*/
+				warning("nikita-924",
+					"Error accessing left neighbor: %li",
+					PTR_ERR(left));
+				print_znode("node", node);
+			}
+		} else if (left != NULL) {
+
+			/* shift everything possible on the left of and
+			   including insertion coord into the left neighbor */
+			result = carry_shift_data(LEFT_SIDE, coord,
+						  carry_real(left), doing, todo,
+						  flags & COPI_GO_LEFT);
+
+			/* reget node from coord: shift_left() might move
+			   insertion coord to the left neighbor */
+			node = sync_op(op, left);
+
+			not_enough_space = free_space_shortage(node, op);
+			/* There is not enough free space in @node, but
+			   maybe there is enough free space in
+			   @left. Various balancing decisions are valid here.
+			   The same goes for the shifting to the right.
+			*/
+		}
+	}
+	/* If there still is not enough space, shift to the right */
+	if (not_enough_space > 0 && !(flags & COPI_DONT_SHIFT_RIGHT)) {
+		carry_node *right;
+
+		reiser4_stat_level_inc(doing, insert_looking_right);
+		right = find_right_neighbor(op, doing);
+		if (IS_ERR(right)) {
+			warning("nikita-1065",
+				"Error accessing right neighbor: %li",
+				PTR_ERR(right));
+			print_znode("node", node);
+		} else if (right != NULL) {
+			/* node containing insertion point, and its right
+			   neighbor node are write locked by now.
+
+			   shift everything possible on the right of but
+			   excluding insertion coord into the right neighbor
+			*/
+			result = carry_shift_data(RIGHT_SIDE, coord,
+						  carry_real(right),
+						  doing, todo,
+						  flags & COPI_GO_RIGHT);
+			/* reget node from coord: shift_right() might move
+			   insertion coord to the right neighbor */
+			node = sync_op(op, right);
+			not_enough_space = free_space_shortage(node, op);
+		}
+	}
+	/* If there is still not enough space, allocate new node(s).
+
+	   We try to allocate new blocks if COPI_DONT_ALLOCATE is not set in
+	   the carry operation flags (currently this is needed during flush
+	   only). At most two new nodes are allocated per call.
+	*/
+	for (blk_alloc = 0;
+	     not_enough_space > 0 && result == 0 && blk_alloc < 2 &&
+		     !(flags & COPI_DONT_ALLOCATE); ++blk_alloc) {
+		carry_node *fresh;	/* new node we are allocating */
+		coord_t coord_shadow;	/* remembered insertion point before
+					 * shifting data into new node */
+		carry_node *node_shadow;	/* remembered insertion node before
+						 * shifting */
+		unsigned int gointo;	/* whether insertion point should move
+					 * into newly allocated node */
+
+		reiser4_stat_level_inc(doing, insert_alloc_new);
+		if (blk_alloc > 0)
+			reiser4_stat_level_inc(doing, insert_alloc_many);
+
+		/* allocate new node on the right of @node. Znode and disk
+		   fake block number for new node are allocated.
+
+		   add_new_znode() posts carry operation COP_INSERT with
+		   COPT_CHILD option to the parent level to add
+		   pointer to newly created node to its parent.
+
+		   Subtle point: if several new nodes are required to complete
+		   insertion operation at this level, they will be inserted
+		   into their parents in the order of creation, which means
+		   that @node will be valid "cookie" at the time of insertion.
+
+		*/
+		fresh = add_new_znode(node, op->node, doing, todo);
+		if (IS_ERR(fresh))
+			return PTR_ERR(fresh);
+
+		/* Try to shift into new node. */
+		result = lock_carry_node(doing, fresh);
+		zput(carry_real(fresh));
+		if (result != 0) {
+			warning("nikita-947",
+				"Cannot lock new node: %i", result);
+			print_znode("new", carry_real(fresh));
+			print_znode("node", node);
+			return result;
+		}
+
+		/* both nodes are write locked by now.
+
+		   shift everything possible on the right of (and, when
+		   @gointo is set, including) insertion coord into the new node.
+		*/
+		coord_dup(&coord_shadow, op->u.insert.d->coord);
+		node_shadow = op->node;
+		/* move insertion point into newly created node if:
+
+		    . insertion point is rightmost in the source node, or
+		    . this is not the first node we are allocating in a row.
+		*/
+		gointo =
+			(blk_alloc > 0) ||
+			coord_is_after_rightmost(op->u.insert.d->coord);
+
+		result = carry_shift_data(RIGHT_SIDE, coord, carry_real(fresh),
+					  doing, todo, gointo);
+		/* if insertion point was actually moved into new node,
+		   update carry node pointer in operation. */
+		node = sync_op(op, fresh);
+		not_enough_space = free_space_shortage(node, op);
+		if ((not_enough_space > 0) && (node != coord_shadow.node)) {
+			/* there is not enough free space in new node. Move
+			   insertion point back to @node_shadow so that
+			   next new node would be inserted between
+			   @node_shadow and @fresh.
+			*/
+			coord_normalize(&coord_shadow);
+			coord_dup(coord, &coord_shadow);
+			node = coord->node;
+			op->node = node_shadow;
+			if (1 || (flags & COPI_STEP_BACK)) {
+				/* still not enough space?! Maybe there is
+				   enough space in the source node (i.e., node
+				   data are moved from) now. The "1 ||" makes
+				   this recheck run whatever COPI_STEP_BACK is. */
+				not_enough_space = free_space_shortage(node, op);
+			}
+		}
+	}
+	if (not_enough_space > 0) {
+		if (!(flags & COPI_DONT_ALLOCATE))
+			warning("nikita-948", "Cannot insert new item");
+		result = -E_NODE_FULL;
+	}
+	assert("nikita-1622", ergo(result == 0,
+				   carry_real(op->node) == coord->node));
+	assert("nikita-2616", coord == op->u.insert.d->coord);
+	if (result == 0)
+		result = make_space_tail(op, doing, orig_node);
+	return result;
+}
+
+/* insert_paste_common() - common part of insert and paste operations
+
+   This function performs common part of COP_INSERT and COP_PASTE.
+
+   There are three ways in which insertion/paste can be requested:
+
+    . by directly supplying reiser4_item_data. In this case, op ->
+    u.insert.type is set to COPT_ITEM_DATA.
+
+    . by supplying child, pointer to which is to be inserted into parent. In
+    this case op -> u.insert.type == COPT_CHILD.
+
+    . by supplying key of new item/unit (op -> u.insert.type == COPT_KEY).
+    This is currently only used during extent insertion.
+
+   This is required, because when new node is allocated we don't know at what
+   position pointer to it is to be stored in the parent. Actually, we don't
+   even know what its parent will be, because parent can be re-balanced
+   concurrently and new node re-parented, and because parent can be full and
+   pointer to the new node will go into some other node.
+
+   insert_paste_common() resolves pointer to child node into position in the
+   parent by calling find_new_child_coord(), that fills
+   reiser4_item_data. After this, insertion/paste proceeds uniformly.
+
+   Another complication is with finding free space during pasting. It may
+   happen that while shifting items to the neighbors and newly allocated
+   nodes, insertion coord can no longer be in the item we wanted to paste
+   into. At this point, paste becomes (morphs) into insert. Moreover free
+   space analysis has to be repeated, because amount of space required for
+   insertion is different from that of paste (item header overhead, etc).
+
+   This function "unifies" different insertion modes (by resolving child
+   pointer or key into insertion coord), and then calls make_space() to free
+   enough space in the node by shifting data to the left and right and by
+   allocating new nodes if necessary. Carry operation knows amount of space
+   required for its completion. After enough free space is obtained, caller of
+   this function (carry_{insert,paste,etc.}) performs actual insertion/paste
+   by calling item plugin method.
+
+*/
+static int
+insert_paste_common(carry_op * op	/* carry operation being
+					 * performed */ ,
+		    carry_level * doing /* current carry level */ ,
+		    carry_level * todo /* next carry level */ ,
+		    carry_insert_data * cdata	/* pointer to
+						 * cdata */ ,
+		    coord_t * coord /* insertion/paste coord */ ,
+		    reiser4_item_data * data	/* data to be
+						 * inserted/pasted */ )
+{
+	assert("nikita-981", op != NULL);
+	assert("nikita-980", todo != NULL);
+	assert("nikita-979", (op->op == COP_INSERT) || (op->op == COP_PASTE) || (op->op == COP_EXTENT));
+
+	trace_stamp(TRACE_CARRY);
+
+	if (op->u.insert.type == COPT_PASTE_RESTARTED) {
+		/* nothing to do. Fall through to make_space(). */
+		;
+	} else if (op->u.insert.type == COPT_KEY) {
+		node_search_result intra_node;
+		znode *node;
+		/* Problem with doing batching at the lowest level, is that
+		   operations here are given by coords where modification is
+		   to be performed, and one modification can invalidate coords
+		   of all following operations.
+
+		   So, we are implementing yet another type for operation that
+		   will use (the only) "locator" stable across shifting of
+		   data between nodes, etc.: key (COPT_KEY).
+
+		   This clause resolves key to the coord in the node.
+
+		   But node can change also. Probably some pieces have to be
+		   added to the lock_carry_node(), to lock node by its key.
+
+		*/
+		/* NOTE-NIKITA Lookup bias is fixed to FIND_EXACT. Complain
+		   if you need something else. */
+		op->u.insert.d->coord = coord;
+		node = carry_real(op->node);
+		intra_node = node_plugin_by_node(node)->lookup
+		    (node, op->u.insert.d->key, FIND_EXACT, op->u.insert.d->coord);
+		if ((intra_node != NS_FOUND) && (intra_node != NS_NOT_FOUND)) {
+			warning("nikita-1715", "Intra node lookup failure: %i", intra_node);
+			print_znode("node", node);
+			return intra_node;
+		}
+	} else if (op->u.insert.type == COPT_CHILD) {
+		/* if we are asked to insert pointer to the child into
+		   internal node, first convert pointer to the child into
+		   coord within parent node.
+		*/
+		znode *child;
+		int result;
+
+		op->u.insert.d = cdata;
+		op->u.insert.d->coord = coord;
+		op->u.insert.d->data = data;
+		op->u.insert.d->coord->node = carry_real(op->node);
+		result = find_new_child_coord(op);
+		child = carry_real(op->u.insert.child);
+		if (result != NS_NOT_FOUND) {
+			warning("nikita-993", "Cannot find a place for child pointer: %i", result);
+			print_znode("child", child);
+			print_znode("parent", carry_real(op->node));
+			return result;
+		}
+		/* This only happens when we did multiple insertions at
+		   the previous level, trying to insert single item and
+		   it so happened, that insertion of pointers to all new
+		   nodes before this one already caused parent node to
+		   split (may be several times).
+
+		   I am going to come up with better solution.
+
+		   You are not expected to understand this.
+		          -- v6root/usr/sys/ken/slp.c
+
+		   Basically, what happens here is the following: carry came
+		   to the parent level and is about to insert internal item
+		   pointing to the child node that it just inserted in the
+		   level below. Position where internal item is to be inserted
+		   was found by find_new_child_coord() above, but node of the
+		   current carry operation (that is, parent node of child
+		   inserted on the previous level), was determined earlier in
+		   the lock_carry_level/lock_carry_node. It could so happen
+		   that other carry operations performed on the parent
+		   level already split parent node, so that insertion point
+		   moved into another node. Handle this by creating new carry
+		   node for insertion point if necessary.
+		*/
+		if (carry_real(op->node) != op->u.insert.d->coord->node) {
+			pool_ordering direction;
+			znode *z1;
+			znode *z2;
+			reiser4_key k1;
+			reiser4_key k2;
+
+			/*
+			 * determine in what direction insertion point
+			 * moved. Do this by comparing delimiting keys.
+			 */
+			z1 = op->u.insert.d->coord->node;
+			z2 = carry_real(op->node);
+			if (keyle(leftmost_key_in_node(z1, &k1),
+				  leftmost_key_in_node(z2, &k2)))
+				/* insertion point moved to the left */
+				direction = POOLO_BEFORE;
+			else
+				/* insertion point moved to the right */
+				direction = POOLO_AFTER;
+
+			op->node = add_carry_skip(doing, direction, op->node);
+			if (IS_ERR(op->node))
+				return PTR_ERR(op->node);
+			op->node->node = op->u.insert.d->coord->node;
+			op->node->free = 1;
+			result = lock_carry_node(doing, op->node);
+			if (result != 0)
+				return result;
+		}
+
+		/*
+		 * set up key of an item being inserted: we are inserting
+		 * internal item and its key (by the very definition of
+		 * search tree) is leftmost key in the child node.
+		 */
+		op->u.insert.d->key = UNDER_RW(dk, znode_get_tree(child), read,
+					       leftmost_key_in_node(child, znode_get_ld_key(child)));
+		op->u.insert.d->data->arg = op->u.insert.brother;
+	} else {
+		assert("vs-243", op->u.insert.d->coord != NULL);
+		op->u.insert.d->coord->node = carry_real(op->node);
+	}
+
+	/* find free space. */
+	return make_space(op, doing, todo);
+}
+
+/* Handle carry COP_INSERT operation.
+
+   Insert new item into node. New item can be given in one of two ways:
+
+   - by passing &tree_coord and &reiser4_item_data as part of @op. This is
+   only applicable at the leaf/twig level.
+
+   - by passing a child node, pointer to which is to be inserted by this
+   operation.
+   Returns non-zero result of insert_paste_common(), else result of ->create_item().
+*/
+static int
+carry_insert(carry_op * op /* operation to perform */ ,
+	     carry_level * doing	/* queue of operations @op
+					 * is part of */ ,
+	     carry_level * todo	/* queue where new operations
+				 * are accumulated */ )
+{
+	znode *node;
+	carry_insert_data cdata;
+	coord_t coord;
+	reiser4_item_data data;
+	carry_plugin_info info;
+	int result;
+
+	assert("nikita-1036", op != NULL);
+	assert("nikita-1037", todo != NULL);
+	assert("nikita-1038", op->op == COP_INSERT);
+
+	trace_stamp(TRACE_CARRY);
+	reiser4_stat_level_inc(doing, insert);
+
+	coord_init_zero(&coord);
+
+	/* perform common functionality of insert and paste; it ends with make_space(). */
+	result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
+	if (result != 0)
+		return result;
+
+	node = op->u.insert.d->coord->node;
+	assert("nikita-1039", node != NULL);
+	assert("nikita-1040", node_plugin_by_node(node) != NULL);
+
+	assert("nikita-949", space_needed_for_op(node, op) <= znode_free_space(node));
+
+	/* ask node layout to create new item. */
+	info.doing = doing;
+	info.todo = todo;
+	result = node_plugin_by_node(node)->create_item
+	    (op->u.insert.d->coord, op->u.insert.d->key, op->u.insert.d->data, &info);
+	doing->restartable = 0;	/* cleared and node dirtied whatever ->create_item() returned */
+	znode_make_dirty(node);
+
+	return result;
+}
+
+/*
+ * Flow insertion code. COP_INSERT_FLOW is a special tree operation that is
+ * supplied with a "flow" (that is, a stream of data) and inserts it into the
+ * tree by slicing it into multiple items.
+ */
+/* shorthands for the members of @op->u.insert_flow used by the code below */
+#define flow_insert_point(op) ( ( op ) -> u.insert_flow.insert_point )
+#define flow_insert_flow(op) ( ( op ) -> u.insert_flow.flow )
+#define flow_insert_data(op) ( ( op ) -> u.insert_flow.data )
+
+static size_t	/* what ->b.estimate() of item plugin adds on top of flow data length */
+item_data_overhead(carry_op * op)	/* flow insertion operation */
+{
+	if (flow_insert_data(op)->iplug->b.estimate == NULL)
+		return 0;	/* item plugin has no ->b.estimate(): no overhead is accounted */
+	return (flow_insert_data(op)->iplug->b.estimate(NULL /* estimate insertion */, flow_insert_data(op)) -
+		flow_insert_data(op)->length);	/* estimated size less data length */
+}
+
+/* Overhead of inserting at flow insert point: 0 if node plugin has no
+   ->item_overhead or can_paste() allows pasting, else item overhead of the
+   node plus item_data_overhead(). FIXME-VS: this is called several times
+   during one make_space_for_flow_insertion and always returns the same
+   result; it could be computed once and passed around (less flexible). */
+static int can_paste(coord_t *, const reiser4_key *, const reiser4_item_data *);	/* defined below */
+static size_t
+flow_insertion_overhead(carry_op * op)
+{
+	znode *node;
+	size_t insertion_overhead;
+
+	node = flow_insert_point(op)->node;
+	insertion_overhead = 0;
+	if (node->nplug->item_overhead &&	/* NB: can_paste() may adjust the insert point coord */
+	    !can_paste(flow_insert_point(op), &flow_insert_flow(op)->key, flow_insert_data(op)))
+		insertion_overhead = node->nplug->item_overhead(node, 0) + item_data_overhead(op);
+	return insertion_overhead;
+}
+
+/* how many bytes of flow fit into insert point node: 0, or min(free space less insertion overhead, flow length) */
+static int
+what_can_fit_into_node(carry_op * op)
+{
+	size_t free, overhead;
+
+	overhead = flow_insertion_overhead(op);
+	free = znode_free_space(flow_insert_point(op)->node);
+	if (free <= overhead)
+		return 0;	/* not even the overhead fits */
+	free -= overhead;
+	/* FIXME: flow->length is loff_t only to not get overflowed in case of expanding truncate */
+	if (free < op->u.insert_flow.flow->length)
+		return free;	/* only part of the flow fits */
+	return (int)op->u.insert_flow.flow->length;
+}
+
+/* make_space_for_flow_insertion needs to check whether either the whole flow
+   or a minimal fraction of it fits into a node; this checks for the whole */
+static int
+enough_space_for_whole_flow(carry_op * op)
+{
+	return (unsigned) what_can_fit_into_node(op) == op->u.insert_flow.flow->length;
+}
+
+#define MIN_FLOW_FRACTION 1	/* compared with what_can_fit_into_node(), i.e., counted in bytes of flow */
+static int
+enough_space_for_min_flow_fraction(carry_op * op)	/* true if at least MIN_FLOW_FRACTION fits */
+{
+	assert("vs-902", coord_is_after_rightmost(flow_insert_point(op)));
+
+	return what_can_fit_into_node(op) >= MIN_FLOW_FRACTION;
+}
+
+/* Returns 0 if left neighbor was obtained, everything up to and including
+   insertion point was shifted into it and it still has free space for minimal
+   fraction of flow. Returns 1 otherwise, also on error getting the neighbor */
+static int
+make_space_by_shift_left(carry_op * op, carry_level * doing, carry_level * todo)
+{
+	carry_node *left;
+	znode *orig;
+
+	left = find_left_neighbor(op, doing);
+	if (unlikely(IS_ERR(left))) {
+		warning("vs-899", "make_space_by_shift_left: " "error accessing left neighbor: %li", PTR_ERR(left));
+		return 1;
+	}
+	if (left == NULL)
+		/* left neighbor either does not exist or is unformatted
+		   node */
+		return 1;
+
+	orig = flow_insert_point(op)->node;
+	/* try to shift content of node @orig from its head up to and including
+	   insertion point into the left neighbor; result of shift is not checked */
+	carry_shift_data(LEFT_SIDE, flow_insert_point(op),
+			 carry_real(left), doing, todo, 1 /* including insert
+							   * point */);
+	if (carry_real(left) != flow_insert_point(op)->node) {
+		/* insertion point did not move */
+		return 1;
+	}
+
+	/* insertion point is set after last item in the node */
+	assert("vs-900", coord_is_after_rightmost(flow_insert_point(op)));
+
+	if (!enough_space_for_min_flow_fraction(op)) {
+		/* insertion point node does not have enough free space to put
+		   even minimal portion of flow into it, therefore, move
+		   insertion point back to orig node (before first item) */
+		coord_init_before_first_item(flow_insert_point(op), orig);
+		return 1;
+	}
+
+	/* part of flow is to be written to the end of node */
+	op->node = left;	/* operation continues in the left neighbor */
+	return 0;
+}
+
+/* Returns 0 if insertion point ended up after the rightmost item of its node
+   (everything to the right of it, if anything, was shifted to the right
+   neighbor) and node can take minimal fraction of flow. Returns 1 otherwise */
+static int
+make_space_by_shift_right(carry_op * op, carry_level * doing, carry_level * todo)
+{
+	carry_node *right;
+
+	right = find_right_neighbor(op, doing);
+	if (unlikely(IS_ERR(right))) {
+		warning("nikita-1065", "make_space_by_shift_right: "
+			"error accessing right neighbor: %li", PTR_ERR(right));
+		return 1;
+	}
+	if (right) {
+		/* shift everything possible on the right of but excluding
+		   insertion coord into the right neighbor */
+		carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
+				 carry_real(right), doing, todo, 0 /* not
+								    * including
+								    * insert
+								    * point */);
+	} else {
+		/* right neighbor either does not exist or is unformatted
+		   node */
+		;
+	}
+	if (coord_is_after_rightmost(flow_insert_point(op))) {
+		if (enough_space_for_min_flow_fraction(op)) {
+			/* part of flow is to be written to the end of node */
+			return 0;
+		}
+	}
+
+	/* new node is to be added if insert point node did not get enough
+	   space for whole flow */
+	return 1;
+}
+
+/* Returns 0 when insert coord is at the end of its node and fraction of flow
+   fits there, or when insert coord was moved to a newly added node; else error */
+static int
+make_space_by_new_nodes(carry_op * op, carry_level * doing, carry_level * todo)
+{
+	int result;
+	znode *node;
+	carry_node *new;
+
+	node = flow_insert_point(op)->node;
+
+	if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
+		return RETERR(-E_NODE_FULL);	/* limit of new nodes per operation is reached */
+	/* add new node after insert point node */
+	new = add_new_znode(node, op->node, doing, todo);
+	if (unlikely(IS_ERR(new))) {
+		return PTR_ERR(new);
+	}
+	result = lock_carry_node(doing, new);
+	zput(carry_real(new));	/* done whether or not locking succeeded */
+	if (unlikely(result)) {
+		return result;
+	}
+	op->u.insert_flow.new_nodes++;
+	if (!coord_is_after_rightmost(flow_insert_point(op))) {
+		carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
+				 carry_real(new), doing, todo, 0 /* not
+								  * including
+								  * insert
+								  * point */);
+
+		assert("vs-901", coord_is_after_rightmost(flow_insert_point(op)));
+
+		if (enough_space_for_min_flow_fraction(op)) {
+			return 0;	/* flow continues at the end of the original node */
+		}
+		if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
+			return RETERR(-E_NODE_FULL);
+
+		/* add one more new node */
+		new = add_new_znode(node, op->node, doing, todo);
+		if (unlikely(IS_ERR(new))) {
+			return PTR_ERR(new);
+		}
+		result = lock_carry_node(doing, new);
+		zput(carry_real(new));
+		if (unlikely(result)) {
+			return result;
+		}
+		op->u.insert_flow.new_nodes++;
+	}
+
+	/* move insertion point to new node */
+	coord_init_before_first_item(flow_insert_point(op), carry_real(new));
+	op->node = new;
+	return 0;
+}
+
+static int
+make_space_for_flow_insertion(carry_op * op, carry_level * doing, carry_level * todo)
+{
+	__u32 flags = op->u.insert_flow.flags;
+	/* Try in turn: nothing if whole flow fits, shift left, shift right, new nodes. Returns 0 or error of the last. */
+	if (enough_space_for_whole_flow(op)) {
+		/* whole flow fits into insert point node */
+		return 0;
+	}
+
+	if (!(flags & COPI_DONT_SHIFT_LEFT) && (make_space_by_shift_left(op, doing, todo) == 0)) {
+		/* insert point is shifted to left neighbor of original insert
+		   point node and is set after last unit in that node. It has
+		   enough space to fit at least minimal fraction of flow. */
+		return 0;
+	}
+
+	if (enough_space_for_whole_flow(op)) {
+		/* whole flow fits into insert point node */
+		return 0;
+	}
+
+	if (!(flags & COPI_DONT_SHIFT_RIGHT) && (make_space_by_shift_right(op, doing, todo) == 0)) {
+		/* insert point is still set to the same node, but there is
+		   nothing to the right of insert point. */
+		return 0;
+	}
+
+	if (enough_space_for_whole_flow(op)) {
+		/* whole flow fits into insert point node */
+		return 0;
+	}
+
+	return make_space_by_new_nodes(op, doing, todo);	/* last resort */
+}
+
+/* implements COP_INSERT_FLOW operation: puts flow into tree piece by piece until it is empty or make_space fails */
+static int
+carry_insert_flow(carry_op * op, carry_level * doing, carry_level * todo)
+{
+	int result;
+	flow_t *f;
+	coord_t *insert_point;
+	node_plugin *nplug;
+	int something_written;
+	carry_plugin_info info;
+	znode *orig_node;
+	lock_handle *orig_lh;
+
+	f = op->u.insert_flow.flow;
+	result = 0;
+
+	/* this flag is used to distinguish a need to have carry to propagate
+	   leaf level modifications up in the tree when make_space fails not in
+	   first iteration of the loop below. NOTE(review): it is never read here */
+	something_written = 0;
+
+	/* carry system needs this to work */
+	info.doing = doing;
+	info.todo = todo;
+
+	orig_node = flow_insert_point(op)->node;
+	orig_lh = doing->tracked;
+
+	while (f->length) {
+		result = make_space_for_flow_insertion(op, doing, todo);
+		if (result)
+			break;
+
+		insert_point = flow_insert_point(op);
+		nplug = node_plugin_by_node(insert_point->node);
+
+		/* compose item data for insertion/pasting */
+		flow_insert_data(op)->data = f->data;
+		flow_insert_data(op)->length = what_can_fit_into_node(op);
+
+		if (can_paste(insert_point, &f->key, flow_insert_data(op))) {
+			/* insert point is set to item of file we are writing to and we have to append to it */
+			assert("vs-903", insert_point->between == AFTER_UNIT);
+			nplug->change_item_size(insert_point, flow_insert_data(op)->length);
+			flow_insert_data(op)->iplug->b.paste(insert_point, flow_insert_data(op), &info);
+		} else {
+			/* new item must be inserted; from here ->length includes item_data_overhead() */
+			pos_in_node_t new_pos;
+			flow_insert_data(op)->length += item_data_overhead(op);
+
+			/* FIXME-VS: this is because node40_create_item changes
+			   insert_point for obscure reasons */
+			switch (insert_point->between) {
+			case AFTER_ITEM:
+				new_pos = insert_point->item_pos + 1;
+				break;
+			case EMPTY_NODE:
+				new_pos = 0;
+				break;
+			case BEFORE_ITEM:
+				assert("vs-905", insert_point->item_pos == 0);
+				new_pos = 0;
+				break;
+			default:
+				impossible("vs-906", "carry_insert_flow: invalid coord");
+				new_pos = 0;
+				break;
+			}
+
+			nplug->create_item(insert_point, &f->key, flow_insert_data(op), &info);
+			coord_set_item_pos(insert_point, new_pos);
+		}
+		coord_init_after_item_end(insert_point);
+		doing->restartable = 0;
+		znode_make_dirty(insert_point->node);
+
+		move_flow_forward(f, (unsigned) flow_insert_data(op)->length);
+		something_written = 1;
+	}
+
+	if (orig_node != flow_insert_point(op)->node) {
+		/* move lock to new insert point. NOTE(review): overwrites @result of a failed make_space -- confirm intended */
+		done_lh(orig_lh);
+		init_lh(orig_lh);
+		result = longterm_lock_znode(orig_lh, flow_insert_point(op)->node, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
+	}
+
+	return result;
+}
+
+/* implements COP_DELETE operation
+
+   Remove pointer to @op -> u.delete.child from its parent.
+
+   This function also handles killing of a tree root when, after removal, only
+   one pointer is left in it. This is complicated by our handling of "twig"
+   level: root on twig level is never killed.
+   Returns 0 on success, result of find_child_ptr() if pointer to child is not
+   found, or negative result of ->cut_and_kill() or kill_tree_root(). */
+static int
+carry_delete(carry_op * op /* operation to be performed */ ,
+	     carry_level * doing UNUSED_ARG	/* current carry
+						 * level */ ,
+	     carry_level * todo /* next carry level */ )
+{
+	int result;
+	coord_t coord;
+	coord_t coord2;
+	znode *parent;
+	znode *child;
+	carry_plugin_info info;
+	reiser4_tree *tree;
+
+	/*
+	 * This operation is called to delete internal item pointing to the
+	 * child node that was removed by carry from the tree on the previous
+	 * tree level.
+	 */
+
+	assert("nikita-893", op != NULL);
+	assert("nikita-894", todo != NULL);
+	assert("nikita-895", op->op == COP_DELETE);
+	trace_stamp(TRACE_CARRY);
+	reiser4_stat_level_inc(doing, delete);
+
+	coord_init_zero(&coord);
+	coord_init_zero(&coord2);
+
+	parent = carry_real(op->node);
+	child = op->u.delete.child ?
+		carry_real(op->u.delete.child) : op->node->node;
+	tree = znode_get_tree(child);
+	RLOCK_TREE(tree);
+
+	/*
+	 * @parent was determined when carry entered parent level
+	 * (lock_carry_level/lock_carry_node). Since then, actual parent of
+	 * @child node could change due to other carry operations performed on
+	 * the parent level. Check for this.
+	 */
+
+	if (znode_parent(child) != parent) {
+		/* NOTE-NIKITA add stat counter for this. */
+		parent = znode_parent(child);
+		assert("nikita-2581", find_carry_node(doing, parent));
+	}
+	RUNLOCK_TREE(tree);
+
+	assert("nikita-1213", znode_get_level(parent) > LEAF_LEVEL);
+
+	/* Twig level horrors: tree should be of height at least 2. So, last
+	   pointer from the root at twig level is preserved even if child is
+	   empty. This is ugly, but so it was architectured.
+	*/
+
+	if (znode_is_root(parent) &&
+	    znode_get_level(parent) <= REISER4_MIN_TREE_HEIGHT &&
+	    node_num_items(parent) == 1) {
+		/* Delimiting key manipulations. */
+		WLOCK_DK(tree);
+		znode_set_ld_key(child, znode_set_ld_key(parent, min_key()));
+		znode_set_rd_key(child, znode_set_rd_key(parent, max_key()));
+		WUNLOCK_DK(tree);
+
+		/* @child escaped imminent death! */
+		ZF_CLR(child, JNODE_HEARD_BANSHEE);
+		return 0;
+	}
+
+	/* convert child pointer to the coord_t */
+	result = find_child_ptr(parent, child, &coord);
+	if (result != NS_FOUND) {
+		warning("nikita-994", "Cannot find child pointer: %i", result);
+		print_znode("child", child);
+		print_znode("parent", parent);
+		print_coord_content("coord", &coord);
+		return result;
+	}
+
+	coord_dup(&coord2, &coord);
+	info.doing = doing;
+	info.todo = todo;
+	{
+		/*
+		 * Actually kill internal item: prepare structure with
+		 * arguments for ->cut_and_kill() method...
+		 */
+
+		struct carry_kill_data kdata;
+		kdata.params.from = &coord;
+		kdata.params.to = &coord2;
+		kdata.params.from_key = NULL;
+		kdata.params.to_key = NULL;
+		kdata.params.smallest_removed = NULL;
+		kdata.flags = op->u.delete.flags;
+		kdata.inode = 0;
+		kdata.left = 0;
+		kdata.right = 0;
+		/* ... and call it. */
+		result = node_plugin_by_node(parent)->cut_and_kill(&kdata,
+								   &info);
+	}
+	doing->restartable = 0;
+
+	/* check whether root should be killed violently */
+	if (znode_is_root(parent) &&
+	    /* don't kill roots at and lower than twig level */
+	    znode_get_level(parent) > REISER4_MIN_TREE_HEIGHT &&
+	    node_num_items(parent) == 1) {
+		result = kill_tree_root(coord.node);
+	}
+
+	return result < 0 ? result : 0;	/* "result < 0 ? : 0" would yield 1, losing the error code */
+}
+
+/* implements COP_CUT operation
+
+   Cuts part or whole content of node, using ->cut() or ->cut_and_kill() of
+   node plugin. Returns 0 on success, negative error code otherwise.
+*/
+static int
+carry_cut(carry_op * op /* operation to be performed */ ,
+	  carry_level * doing	/* current carry level */ ,
+	  carry_level * todo /* next carry level */ )
+{
+	int result;
+	carry_plugin_info info;
+	node_plugin *nplug;
+
+	assert("nikita-896", op != NULL);
+	assert("nikita-897", todo != NULL);
+	assert("nikita-898", op->op == COP_CUT);
+	trace_stamp(TRACE_CARRY);
+	reiser4_stat_level_inc(doing, cut);
+
+	info.doing = doing;
+	info.todo = todo;
+
+	nplug = node_plugin_by_node(carry_real(op->node));
+	if (op->u.cut_or_kill.is_cut)
+		result = nplug->cut(op->u.cut_or_kill.u.cut, &info);
+	else
+		result = nplug->cut_and_kill(op->u.cut_or_kill.u.kill, &info);
+
+	doing->restartable = 0;
+	return result < 0 ? result : 0;	/* "result < 0 ? : 0" would yield 1, losing the error code */
+}
+
+/* helper function for carry_paste(): returns true if data can be pasted into
+   existing item at or next to @icoord. Note: @icoord may be adjusted here */
+static int
+can_paste(coord_t * icoord, const reiser4_key * key, const reiser4_item_data * data)
+{
+	coord_t circa;
+	item_plugin *new_iplug;
+	item_plugin *old_iplug;
+	int result = 0;		/* to keep gcc shut */
+
+	assert("", icoord->between != AT_UNIT);
+
+	/* obviously, one cannot paste when node is empty---there is nothing
+	   to paste into. */
+	if (node_is_empty(icoord->node))
+		return 0;
+	/* if insertion point is at the middle of the item, then paste */
+	if (!coord_is_between_items(icoord))
+		return 1;
+	coord_dup(&circa, icoord);
+	circa.between = AT_UNIT;
+
+	old_iplug = item_plugin_by_coord(&circa);
+	new_iplug = data->iplug;
+
+	/* check whether we can paste to the item @icoord is "at" when we
+	   ignore ->between field */
+	if (old_iplug == new_iplug && item_can_contain_key(&circa, key, data)) {
+		result = 1;
+	} else if (icoord->between == BEFORE_UNIT || icoord->between == BEFORE_ITEM) {
+		/* otherwise, try to glue to the item at the left, if any. NOTE(review): key is checked against @icoord, not @circa -- confirm */
+		coord_dup(&circa, icoord);
+		if (coord_set_to_left(&circa)) {
+			result = 0;
+			coord_init_before_item(icoord);
+		} else {
+			old_iplug = item_plugin_by_coord(&circa);
+			result = (old_iplug == new_iplug) && item_can_contain_key(icoord, key, data);
+			if (result) {
+				coord_dup(icoord, &circa);
+				icoord->between = AFTER_UNIT;
+			}
+		}
+	} else if (icoord->between == AFTER_UNIT || icoord->between == AFTER_ITEM) {
+		coord_dup(&circa, icoord);
+		/* otherwise, try to glue to the item at the right, if any */
+		if (coord_set_to_right(&circa)) {
+			result = 0;
+			coord_init_after_item(icoord);
+		} else {
+			int (*cck) (const coord_t *, const reiser4_key *, const reiser4_item_data *);
+
+			old_iplug = item_plugin_by_coord(&circa);
+
+			cck = old_iplug->b.can_contain_key;
+			if (cck == NULL)
+				/* item doesn't define ->can_contain_key
+				   method? So it is not expandable. */
+				result = 0;
+			else {
+				result = (old_iplug == new_iplug) && cck(&circa /*icoord */ , key, data);
+				if (result) {
+					coord_dup(icoord, &circa);
+					icoord->between = BEFORE_UNIT;
+				}
+			}
+		}
+	} else
+		impossible("nikita-2513", "Nothing works");
+	if (result) {	/* paste is possible: turn item-relative position into unit-relative one */
+		if (icoord->between == BEFORE_ITEM) {
+			assert("vs-912", icoord->unit_pos == 0);
+			icoord->between = BEFORE_UNIT;
+		} else if (icoord->between == AFTER_ITEM) {
+			coord_init_after_item_end(icoord);
+		}
+	}
+	return result;
+}
+
+/* implements COP_PASTE operation
+
+   Paste data into existing item. This is complicated by the fact that after
+   we shifted something to the left or right neighbors trying to free some
+   space, item we were supposed to paste into can be in different node than
+   insertion coord. If so, we are no longer doing paste, but insert. See
+   comments in insert_paste_common().
+   Returns non-zero result of insert_paste_common(), result of insert if restarted as such, else result of ->b.paste().
+*/
+static int
+carry_paste(carry_op * op /* operation to be performed */ ,
+	    carry_level * doing UNUSED_ARG	/* current carry
+						 * level */ ,
+	    carry_level * todo /* next carry level */ )
+{
+	znode *node;
+	carry_insert_data cdata;
+	coord_t dcoord;
+	reiser4_item_data data;
+	int result;
+	int real_size;
+	item_plugin *iplug;
+	carry_plugin_info info;
+	coord_t *coord;
+
+	assert("nikita-982", op != NULL);
+	assert("nikita-983", todo != NULL);
+	assert("nikita-984", op->op == COP_PASTE);
+
+	trace_stamp(TRACE_CARRY);
+	reiser4_stat_level_inc(doing, paste);
+
+	coord_init_zero(&dcoord);
+
+	result = insert_paste_common(op, doing, todo, &cdata, &dcoord, &data);
+	if (result != 0)
+		return result;
+
+	coord = op->u.insert.d->coord;
+
+	/* handle case when op -> u.insert.coord doesn't point to the item
+	   of required type. restart as insert. */
+	if (!can_paste(coord, op->u.insert.d->key, op->u.insert.d->data)) {
+		op->op = COP_INSERT;
+		op->u.insert.type = COPT_PASTE_RESTARTED;
+		reiser4_stat_level_inc(doing, paste_restarted);
+		result = op_dispatch_table[COP_INSERT].handler(op, doing, todo);
+
+		return result;
+	}
+
+	node = coord->node;
+	iplug = item_plugin_by_coord(coord);
+	assert("nikita-992", iplug != NULL);
+
+	assert("nikita-985", node != NULL);
+	assert("nikita-986", node_plugin_by_node(node) != NULL);
+
+	assert("nikita-987", space_needed_for_op(node, op) <= znode_free_space(node));
+
+	assert("nikita-1286", coord_is_existing_item(coord));
+
+	/*
+	 * if item is expanded as a result of this operation, we should first
+	 * change item size, then call ->b.paste item method. If item is
+	 * shrunk, it should be done other way around: first call ->b.paste
+	 * method, then reduce item size.
+	 */
+
+	real_size = space_needed_for_op(node, op);
+	if (real_size > 0)
+		node->nplug->change_item_size(coord, real_size);
+
+	doing->restartable = 0;
+	info.doing = doing;
+	info.todo = todo;
+
+	result = iplug->b.paste(coord, op->u.insert.d->data, &info);
+
+	if (real_size < 0)
+		node->nplug->change_item_size(coord, real_size);
+
+	/* if we pasted at the beginning of the item, update item's key. */
+	if (coord->unit_pos == 0 && coord->between != AFTER_UNIT)
+		node->nplug->update_item_key(coord, op->u.insert.d->key, &info);
+
+	znode_make_dirty(node);
+	return result;
+}
+
+/* handle carry COP_EXTENT operation. */
+static int
+carry_extent(carry_op * op /* operation to perform */ ,
+	     carry_level * doing	/* queue of operations @op
+					 * is part of */ ,
+	     carry_level * todo	/* queue where new operations
+				 * are accumulated */ )
+{
+	znode *node;
+	carry_insert_data cdata;
+	coord_t coord;
+	reiser4_item_data data;
+	carry_op *delete_dummy;
+	carry_op *insert_extent;
+	int result;
+	carry_plugin_info info;
+
+	assert("nikita-1751", op != NULL);
+	assert("nikita-1752", todo != NULL);
+	assert("nikita-1753", op->op == COP_EXTENT);
+
+	trace_stamp(TRACE_CARRY);
+	reiser4_stat_level_inc(doing, extent);
+
+	/* extent insertion overview:
+
+	   extents live on the TWIG LEVEL, which is level one above the leaf
+	   one. This complicates extent insertion logic somewhat: it may
+	   happen (and going to happen all the time) that in logical key
+	   ordering extent has to be placed between items I1 and I2, located
+	   at the leaf level, but I1 and I2 are in the same formatted leaf
+	   node N1. To insert extent one has to
+
+	    (1) reach node N1 and shift data between N1, its neighbors and
+	    possibly newly allocated nodes until I1 and I2 fall into different
+	    nodes. Since I1 and I2 are still neighboring items in logical key
+	    order, they will necessarily be the utmost items in their
+	    respective nodes.
+
+	    (2) After this new extent item is inserted into node on the twig
+	    level.
+
+	   Fortunately this process can reuse almost all code from standard
+	   insertion procedure (viz. make_space() and insert_paste_common()),
+	   due to the following observation: make_space() only shifts data up
+	   to and excluding or including insertion point. It never
+	   "over-moves" through insertion point. Thus, one can use
+	   make_space() to perform step (1). All required for this is just to
+	   instruct free_space_shortage() to keep make_space() shifting data
+	   until insertion point is at the node border.
+	   Returns 0, or error of insert_paste_common() or node_post_carry().
+	*/
+
+	/* perform common functionality of insert and paste. */
+	result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
+	if (result != 0)
+		return result;
+
+	node = op->u.extent.d->coord->node;
+	assert("nikita-1754", node != NULL);
+	assert("nikita-1755", node_plugin_by_node(node) != NULL);
+	assert("nikita-1700", coord_wrt(op->u.extent.d->coord) != COORD_INSIDE);
+
+	/* NOTE-NIKITA add some checks here. Not assertions, -EIO. Check that
+	   extent fits between items. */
+
+	info.doing = doing;
+	info.todo = todo;
+
+	/* there is another complication due to placement of extents on the
+	   twig level: extents are "rigid" in the sense that key-range
+	   occupied by extent cannot grow indefinitely to the right as it is
+	   for the formatted leaf nodes. Because of this when search finds two
+	   adjacent extents on the twig level, it has to "drill" to the leaf
+	   level, creating new node. Here we are removing this node.
+	*/
+	if (node_is_empty(node)) {
+		delete_dummy = node_post_carry(&info, COP_DELETE, node, 1);
+		if (IS_ERR(delete_dummy))
+			return PTR_ERR(delete_dummy);
+		delete_dummy->u.delete.child = NULL;
+		delete_dummy->u.delete.flags = DELETE_RETAIN_EMPTY;
+		ZF_SET(node, JNODE_HEARD_BANSHEE);
+	}
+
+	/* proceed with inserting extent item into parent. We are definitely
+	   inserting rather than pasting if we get that far. */
+	insert_extent = node_post_carry(&info, COP_INSERT, node, 1);
+	if (IS_ERR(insert_extent))
+		/* @delete_dummy will be automatically destroyed on the level
+		   exiting  */
+		return PTR_ERR(insert_extent);
+	/* NOTE-NIKITA insertion by key is simplest option here. Another
+	   possibility is to insert on the left or right of already existing
+	   item.
+	*/
+	insert_extent->u.insert.type = COPT_KEY;
+	insert_extent->u.insert.d = op->u.extent.d;
+	assert("nikita-1719", op->u.extent.d->key != NULL);
+	insert_extent->u.insert.d->data->arg = op->u.extent.d->coord;
+	insert_extent->u.insert.flags = znode_get_tree(node)->carry.new_extent_flags;
+
+	/*
+	 * if carry was asked to track lock handle we should actually track
+	 * lock handle on the twig node rather than on the leaf where
+	 * operation was started from. Transfer tracked lock handle.
+	 */
+	if (doing->track_type) {
+		assert("nikita-3242", doing->tracked != NULL);
+		assert("nikita-3244", todo->tracked == NULL);
+		todo->tracked = doing->tracked;
+		todo->track_type = CARRY_TRACK_NODE;
+		doing->tracked = NULL;
+		doing->track_type = 0;
+	}
+
+	return 0;
+}
+
+/* update key in @parent between pointers to @left and @right.
+
+   Find coords of @left and @right and update delimiting key between them.
+   This is helper function called by carry_update(). Finds position of
+   internal item involved. Updates item key. Updates delimiting keys of child
+   nodes involved. Returns 0 on success; on failure returns non-zero and
+   stores description in *@error_msg (which is set to NULL otherwise). */
+static int
+update_delimiting_key(znode * parent	/* node key is updated
+					 * in */ ,
+		      znode * left /* child of @parent */ ,
+		      znode * right /* child of @parent */ ,
+		      carry_level * doing	/* current carry
+						 * level */ ,
+		      carry_level * todo	/* parent carry
+						 * level */ ,
+		      const char **error_msg	/* place to
+						 * store error
+						 * message */ )
+{
+	coord_t left_pos;
+	coord_t right_pos;
+	int result;
+	reiser4_key ldkey;
+	carry_plugin_info info;
+
+	assert("nikita-1177", right != NULL);
+	/* find position of the right child in a parent */
+	result = find_child_ptr(parent, right, &right_pos);
+	if (result != NS_FOUND) {
+		*error_msg = "Cannot find position of right child";
+		return result;
+	}
+
+	if ((left != NULL) && !coord_is_leftmost_unit(&right_pos)) {
+		/* find position of the left child in a parent */
+		result = find_child_ptr(parent, left, &left_pos);
+		if (result != NS_FOUND) {
+			*error_msg = "Cannot find position of left child";
+			return result;
+		}
+		assert("nikita-1355", left_pos.node != NULL);
+	} else
+		left_pos.node = NULL;	/* marks @left_pos as unused for the checks below */
+
+	/* check that they are separated by exactly one key and are basically
+	   sane */
+	if (REISER4_DEBUG) {
+		if ((left_pos.node != NULL)
+		    && !coord_is_existing_unit(&left_pos)) {
+			*error_msg = "Left child is bastard";
+			return RETERR(-EIO);
+		}
+		if (!coord_is_existing_unit(&right_pos)) {
+			*error_msg = "Right child is bastard";
+			return RETERR(-EIO);
+		}
+		if (left_pos.node != NULL &&
+		    !coord_are_neighbors(&left_pos, &right_pos)) {
+			*error_msg = "Children are not direct siblings";
+			return RETERR(-EIO);
+		}
+	}
+	*error_msg = NULL;
+
+	info.doing = doing;
+	info.todo = todo;
+
+	/*
+	 * If child node is not empty, new key of internal item is a key of
+	 * leftmost item in the child node. If the child is empty, take its
+	 * right delimiting key as a new key of the internal item. Precise key
+	 * in the latter case is not important per se, because the child (and
+	 * the internal item) are going to be killed shortly anyway, but we
+	 * have to preserve correct order of keys in the parent node.
+	 */
+
+	if (!ZF_ISSET(right, JNODE_HEARD_BANSHEE))
+		leftmost_key_in_node(right, &ldkey);
+	else
+		UNDER_RW_VOID(dk, znode_get_tree(parent), read,
+			      ldkey = *znode_get_rd_key(right));
+	node_plugin_by_node(parent)->update_item_key(&right_pos, &ldkey, &info);
+	doing->restartable = 0;
+	znode_make_dirty(parent);
+	return 0;
+}
+
+/* implements COP_UPDATE operation
+
+   Update delimiting keys. Returns result of update_delimiting_key(), or -EIO
+   (through RETERR()) if parent of the right child cannot be found.
+*/
+static int
+carry_update(carry_op * op /* operation to be performed */ ,
+	     carry_level * doing /* current carry level */ ,
+	     carry_level * todo /* next carry level */ )
+{
+	int result;
+	carry_node *missing UNUSED_ARG;	/* not used in this function */
+	znode *left;
+	znode *right;
+	carry_node *lchild;
+	carry_node *rchild;
+	const char *error_msg;
+	reiser4_tree *tree;
+
+	/*
+	 * This operation is called to update key of internal item. This is
+	 * necessary when carry shifted or cut data on the child
+	 * level. Arguments of this operation are:
+	 *
+	 *     @right --- child node. Operation should update key of internal
+	 *     item pointing to @right.
+	 *
+	 *     @left --- left neighbor of @right. This parameter is optional.
+	 */
+
+	assert("nikita-902", op != NULL);
+	assert("nikita-903", todo != NULL);
+	assert("nikita-904", op->op == COP_UPDATE);
+	trace_stamp(TRACE_CARRY);
+	reiser4_stat_level_inc(doing, update);
+
+	lchild = op->u.update.left;
+	rchild = op->node;
+
+	if (lchild != NULL) {
+		assert("nikita-1001", lchild->parent);
+		assert("nikita-1003", !lchild->left);
+		left = carry_real(lchild);
+	} else
+		left = NULL;
+
+	tree = znode_get_tree(rchild->node);
+	RLOCK_TREE(tree);
+	right = znode_parent(rchild->node);	/* from here on @right is the node whose item key is updated */
+	if (REISER4_STATS) {
+		znode *old_right;
+		if (rchild != NULL) {	/* NOTE(review): @rchild was already dereferenced above */
+			assert("nikita-1000", rchild->parent);
+			assert("nikita-1002", !rchild->left);
+			old_right = carry_real(rchild);
+		} else
+			old_right = NULL;
+		if (znode_parent(rchild->node) != old_right)
+			/* parent node was split, and pointer to @rchild was
+			   inserted/moved into new node. Wonders of balkancing
+			   (sic.).
+			*/
+			reiser4_stat_level_inc(doing, half_split_race);
+	}
+	RUNLOCK_TREE(tree);
+
+	if (right != NULL) {
+		result = update_delimiting_key(right,
+					       lchild ? lchild->node : NULL,
+					       rchild->node,
+					       doing, todo, &error_msg);
+	} else {
+		error_msg = "Cannot find node to update key in";
+		result = RETERR(-EIO);
+	}
+	/* operation will be reposted to the next level by the
+	   ->update_item_key() method of node plugin, if necessary. */
+
+	if (result != 0) {
+		warning("nikita-999", "Error updating delimiting key: %s (%i)", error_msg ? : "", result);
+		print_znode("left", left);
+		print_znode("right", right);
+		print_znode("lchild", lchild ? lchild->node : NULL);
+		print_znode("rchild", rchild->node);
+	}
+	return result;
+}
+
+/* move items during carry; always returns 0. NOTE(review): data seem to move from insert_coord->node into @node */
+static int
+carry_shift_data(sideof side /* in what direction to move data */ ,
+		 coord_t * insert_coord	/* coord where new item
+					   * is to be inserted */ ,
+		 znode * node /* node which data are moved from */ ,
+		 carry_level * doing /* active carry queue */ ,
+		 carry_level * todo	/* carry queue where new
+					 * operations are to be put
+					 * in */ ,
+		 unsigned int including_insert_coord_p	/* true if
+							 * @insertion_coord
+							 * can be moved */ )
+{
+	int result;
+	znode *source;
+	carry_plugin_info info;
+	node_plugin *nplug;
+
+	source = insert_coord->node;
+
+	info.doing = doing;
+	info.todo = todo;
+
+	nplug = node_plugin_by_node(node);
+	result = nplug->shift(insert_coord, node,
+			      (side == LEFT_SIDE) ? SHIFT_LEFT : SHIFT_RIGHT, 0,
+			      (int) including_insert_coord_p, &info);
+	/* the only error ->shift() method of node plugin can return is -ENOMEM due to carry node/operation
+	   allocation. NOTE(review): such an error is not propagated---0 is returned below; confirm intended. */
+	assert("nikita-915", result >= 0 || result == -ENOMEM);
+	if (result > 0) {
+		/*
+		 * if some number of bytes was actually shifted, mark nodes
+		 * dirty, and carry level as non-restartable.
+		 */
+		doing->restartable = 0;
+		znode_make_dirty(source);
+		znode_make_dirty(node);
+	}
+
+	assert("nikita-2077", coord_check(insert_coord));
+	return 0;
+}
+
+typedef carry_node *(*carry_iterator) (carry_node * node);	/* step from @node to an adjacent carry node */
+static carry_node *find_dir_carry(carry_node * node, carry_level * level, carry_iterator iterator);
+
+/* look for the left neighbor of given carry node in a carry queue.
+
+   This is used by find_left_neighbor(), but I am not sure that this
+   really gives any advantage. More statistics required.
+   Returns NULL when find_dir_carry() reaches the end of @level.
+*/
+reiser4_internal carry_node *
+find_left_carry(carry_node * node	/* node to find left neighbor
+					 * of */ ,
+		carry_level * level /* level to scan */ )
+{
+	return find_dir_carry(node, level, (carry_iterator) pool_level_list_prev);
+}
+
+/* look for the right neighbor of given carry node in a
+   carry queue.
+
+   This is used by find_right_neighbor(), but I am not sure that this
+   really gives any advantage. More statistics required.
+   Returns NULL when find_dir_carry() reaches the end of @level.
+*/
+reiser4_internal carry_node *
+find_right_carry(carry_node * node	/* node to find right neighbor
+					   * of */ ,
+		 carry_level * level /* level to scan */ )
+{
+	return find_dir_carry(node, level, (carry_iterator) pool_level_list_next);
+}
+
+/* Walk the carry queue @level away from @node, one @iterator step at a time, and
+   return the first carry node whose carry_real() znode differs from that of @node.
+
+   Returns NULL when the end of the queue is reached first. Shared
+   implementation of find_{left|right}_carry(). */
+static carry_node *
+find_dir_carry(carry_node * node	/* node to start scanning
+					 * from */ ,
+	       carry_level * level /* level to scan */ ,
+	       carry_iterator iterator	/* operation to
+					 * move to the next
+					 * node */ )
+{
+	carry_node *scan;
+
+	assert("nikita-1059", node != NULL);
+	assert("nikita-1060", level != NULL);
+
+	/* carry nodes referencing the same znode as @node are skipped */
+	for (scan = iterator(node);
+	     !pool_level_list_end(&level->nodes, &scan->header);
+	     scan = iterator(scan)) {
+		if (carry_real(scan) != carry_real(node))
+			return scan;
+	}
+
+	/* ran off the end of the queue: no neighbor */
+	return NULL;
+}
+
+/*
+ * Memory reservation estimation.
+ *
+ * Carry process proceeds through tree levels upwards. Carry assumes that it
+ * takes tree in consistent state (e.g., that search tree invariants hold),
+ * and leaves tree consistent after it finishes. This means that when some
+ * error occurs carry cannot simply return if there are pending carry
+ * operations. Generic solution for this problem is carry-undo either as
+ * transaction manager feature (requiring checkpoints and isolation), or
+ * through some carry specific mechanism.
+ *
+ * Our current approach is to panic if carry hits an error while tree is
+ * inconsistent. Unfortunately -ENOMEM can easily be triggered. To work around
+ * this "memory reservation" mechanism was added.
+ *
+ * Memory reservation is implemented by perthread-pages.diff patch from
+ * core-patches. Its API is defined in <linux/gfp.h>
+ *
+ *     int  perthread_pages_reserve(int nrpages, int gfp);
+ *     void perthread_pages_release(int nrpages);
+ *     int  perthread_pages_count(void);
+ *
+ * carry estimates its worst case memory requirements at the entry, reserves
+ * enough memory, and releases unused pages before returning.
+ *
+ * Code below estimates worst case memory requirements for a given carry
+ * queue. This is done by summing worst case memory requirements for each
+ * operation in the queue.
+ *
+ */
+
+/*
+ * Memory requirements of many operations depend on the tree
+ * height. For example, item insertion requires new node to be inserted at
+ * each tree level in the worst case. What tree height should be used for
+ * estimation? Current tree height is wrong, because tree height can change
+ * between the time when estimation was done and the time when operation is
+ * actually performed. Maximal possible tree height (REISER4_MAX_ZTREE_HEIGHT)
+ * is also not desirable, because it would lead to the huge over-estimation
+ * all the time. Plausible solution is "capped tree height": if current tree
+ * height is less than some TREE_HEIGHT_CAP constant, capped tree height is
+ * TREE_HEIGHT_CAP, otherwise it's current tree height. Idea behind this is
+ * that if tree height is TREE_HEIGHT_CAP or larger, it's extremely unlikely
+ * to be increased even more during short interval of time.
+ */
+#define TREE_HEIGHT_CAP (5)
+
+/* return capped tree height for the @tree: tree->height, but never less than TREE_HEIGHT_CAP. See comment above. */
+static int
+cap_tree_height(reiser4_tree * tree)
+{
+	return max_t(int, tree->height, TREE_HEIGHT_CAP);
+}
+
+/* return capped tree height (see cap_tree_height()) for the current tree, i.e., for current_tree. */
+static int capped_height(void)
+{
+	return cap_tree_height(current_tree);
+}
+
+/* return number of PAGE_CACHE_SIZE pages required to store given number of bytes, rounding up */
+static int bytes_to_pages(int bytes)
+{
+	return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+}
+
+/* how many pages are required to allocate znodes during item insertion. */
+static int
+carry_estimate_znodes(void)
+{
+	/*
+	 * Note that we have a problem here: there is no way to
+	 * reserve pages specifically for the given slab. This means that
+	 * these pages can be hijacked for some other end.
+	 */
+
+	/* in the worst case we need 3 new znodes on each tree level */
+	return bytes_to_pages(capped_height() * sizeof(znode) * 3);
+}
+
+/*
+ * how many pages are required to load bitmaps. One bitmap per level.
+ */
+static int
+carry_estimate_bitmaps(void)
+{
+	int bytes;
+
+	/* bitmaps were pre-loaded during mount */
+	if (!reiser4_is_set(reiser4_get_current_sb(), REISER4_DONT_LOAD_BITMAP))
+		return 0;
+
+	bytes = capped_height() *
+		(0 +   /* bnode should be added, but it is private to
+			* bitmap.c, skip for now. */
+		 2 * sizeof(jnode));      /* working and commit jnodes */
+	return bytes_to_pages(bytes) + 2; /* and their contents */
+}
+
+/* worst case item insertion memory requirements, in pages */
+static int
+carry_estimate_insert(carry_op * op, carry_level * level)
+{
+	int pages;
+
+	pages = carry_estimate_bitmaps() + carry_estimate_znodes();
+	pages += 1;			/* new atom */
+	pages += capped_height();	/* new block on each level */
+	pages += 1;			/* and possibly extra new block at the leaf level */
+	return pages + 3;		/* loading of leaves into memory */
+}
+
+/* worst case item deletion memory requirements, in pages */
+static int
+carry_estimate_delete(carry_op * op, carry_level * level)
+{
+	int pages;
+
+	pages = carry_estimate_bitmaps() + carry_estimate_znodes();
+	pages += 1;		/* new atom */
+	return pages + 3;	/* loading of leaves into memory */
+}
+
+/* worst case tree cut memory requirements, in pages */
+static int
+carry_estimate_cut(carry_op * op, carry_level * level)
+{
+	int pages;
+
+	pages = carry_estimate_bitmaps() + carry_estimate_znodes();
+	pages += 1;		/* new atom */
+	return pages + 3;	/* loading of leaves into memory */
+}
+
+/* worst case memory requirements of pasting into item, in pages */
+static int
+carry_estimate_paste(carry_op * op, carry_level * level)
+{
+	int pages;
+
+	pages = carry_estimate_bitmaps() + carry_estimate_znodes();
+	pages += 1;			/* new atom */
+	pages += capped_height();	/* new block on each level */
+	pages += 1;			/* and possibly extra new block at the leaf level */
+	return pages + 3;		/* loading of leaves into memory */
+}
+
+/* worst case memory requirements of extent insertion: one insertion plus one deletion */
+static int
+carry_estimate_extent(carry_op * op, carry_level * level)
+{
+	int insert_pages = carry_estimate_insert(op, level);	/* insert extent */
+	int delete_pages = carry_estimate_delete(op, level);	/* kill leaf */
+	return insert_pages + delete_pages;
+}
+
+/* worst case memory requirements of key update: estimated as zero pages, @op and @level are not looked at */
+static int
+carry_estimate_update(carry_op * op, carry_level * level)
+{
+	return 0;
+}
+
+/* worst case memory requirements of flow insertion, in pages */
+static int
+carry_estimate_insert_flow(carry_op * op, carry_level * level)
+{
+	int flow_pages;
+	int nr_insertions;
+
+	flow_pages = bytes_to_pages(op->u.insert_flow.flow->length);
+	/* page count of the flow, capped at CARRY_FLOW_NEW_NODES_LIMIT */
+	nr_insertions = min(flow_pages, CARRY_FLOW_NEW_NODES_LIMIT);
+	/* roughly estimate insert_flow as a sequence of insertions. */
+	return nr_insertions * carry_estimate_insert(op, level);
+}
+
+/* This is the dispatch table for carry operations, indexed by operation code (COP_*): a handler and a
+   worst-case memory estimator for each. It can be trivially abstracted into useful plugin: tunable
+   balancing policy is a good thing. */
+reiser4_internal carry_op_handler op_dispatch_table[COP_LAST_OP] = {
+	[COP_INSERT] = {
+		.handler = carry_insert,
+		.estimate = carry_estimate_insert
+	},
+	[COP_DELETE] = {
+		.handler = carry_delete,
+		.estimate = carry_estimate_delete
+	},
+	[COP_CUT] = {
+		.handler = carry_cut,
+		.estimate = carry_estimate_cut
+	},
+	[COP_PASTE] = {
+		.handler = carry_paste,
+		.estimate = carry_estimate_paste
+	},
+	[COP_EXTENT] = {
+		.handler = carry_extent,
+		.estimate = carry_estimate_extent
+	},
+	[COP_UPDATE] = {
+		.handler = carry_update,
+		.estimate = carry_estimate_update
+	},
+	[COP_INSERT_FLOW] = {
+		.handler = carry_insert_flow,
+		.estimate = carry_estimate_insert_flow
+	}
+};
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
Index: linux-2.6.8.1-ck/fs/reiser4/carry_ops.h
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/carry_ops.h	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/carry_ops.h	2004-08-22 19:35:33.616653494 +1000
@@ -0,0 +1,41 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* implementation of carry operations. See carry_ops.c for details. */
+
+#if !defined( __CARRY_OPS_H__ )
+#define __CARRY_OPS_H__
+
+#include "forward.h"
+#include "znode.h"
+#include "carry.h"
+
+/* carry operation handlers: the element type of op_dispatch_table[] */
+typedef struct carry_op_handler {
+	/* perform operation @op; @doing is the current carry level, @todo the next one */
+	int (*handler) (carry_op * op, carry_level * doing, carry_level * todo);
+	/* estimate worst case memory requirements for @op */
+	int (*estimate) (carry_op * op, carry_level * level);
+} carry_op_handler;
+
+/* This is dispatch table for carry operations. It can be trivially
+   abstracted into useful plugin: tunable balancing policy is a good
+   thing. */
+extern carry_op_handler op_dispatch_table[COP_LAST_OP];
+
+unsigned int space_needed(const znode * node, const coord_t * coord, const reiser4_item_data * data, int inserting);
+extern carry_node *find_left_carry(carry_node * node, carry_level * level);
+extern carry_node *find_right_carry(carry_node * node, carry_level * level);
+
+/* __CARRY_OPS_H__ */
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
Index: linux-2.6.8.1-ck/fs/reiser4/cluster.c
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/cluster.c	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/cluster.c	2004-08-22 19:35:33.617653335 +1000
@@ -0,0 +1,71 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Contains cluster operations for cryptcompress object plugin (see
+   http://www.namesys.com/cryptcompress_design.txt for details). */
+
+/*         Concepts of clustering. Definition of cluster size.
+	   Data clusters, page clusters, disk clusters.
+
+
+   In order to compress plain text we first should split it into chunks.
+   Then we process each chunk independently by the following function:
+
+   void alg(char *input_ptr, int input_length, char *output_ptr, int *output_length);
+
+   where:
+   input_ptr is a pointer to the first byte of input chunk (that contains plain text),
+   input_length is the length of input chunk,
+   output_ptr is a pointer to the first byte of output chunk (that contains processed text),
+   *output_length is the length of output chunk.
+
+   The length of output chunk depends both on input_length and on the content of
+   input chunk.  input_length (which can be assigned an arbitrary value) affects the
+   compression quality (the larger input_length, the better the compression quality).
+   For each cryptcompress file we assign special attribute - cluster size:
+
+   Cluster size is a file attribute, which determines the maximal size
+   of input chunk that we use for compression.
+
+   So if we wanna compress a 10K-file with a cluster size of 4K, we split this file
+   into three chunks (first and second - 4K, third - 2K). Those chunks are
+   clusters in the space of file offsets (data clusters).
+
+   Cluster sizes are represented as (PAGE_CACHE_SIZE << shift), where
+   shift = 0, 1, 2, ...  You'll note that this representation
+   affects the allowed values for cluster size.  This is stored in
+   disk stat-data (CLUSTER_STAT; layout is in reiser4_cluster_stat, see
+   plugin/item/static_stat.h for details).
+   Note that working with
+   cluster_size > PAGE_SIZE (when cluster_shift > 0, and cluster contains more
+   than one page) is suboptimal because before compression we should assemble
+   all cluster pages into one flow (this means superfluous memcpy during
+   read/write). So the better way to increase cluster size (and therefore
+   compression quality) is making PAGE_SIZE larger (for instance by page
+   clustering stuff of William Lee). But if you need PAGE_SIZE < cluster_size,
+   then use the page clustering offered by reiser4.
+
+   The inode mapping of a cryptcompress file contains pages filled by plain text.
+   Cluster size also defines clustering in address space. For example,
+   101K-file with cluster size 16K (cluster shift = 2), which can be mapped
+   into 26 pages, has 7 "page clusters": the first six clusters contain 4 pages
+   each and the last cluster contains 2 pages (for the file tail).
+
+   We split each output (compressed) chunk into special items to provide
+   tight packing of data on disk (currently only ctails hold compressed data).
+   This set of items we call a "disk cluster".
+
+   Each cluster is defined (like pages are) by its index (e.g. offset,
+   but the unit is cluster size instead of PAGE_SIZE). Key offset of
+   the first unit of the first item of each disk cluster (we call this a
+   "key of disk cluster") is a multiple of the cluster index.
+
+   All read/write/truncate operations are performed upon clusters.
+   For example, if we wanna read 40K of a cryptcompress file with cluster size 16K
+   from offset = 20K, we first need to read three clusters (of indexes 1, 2, 3). This
+   means that all main methods of cryptcompress object plugin call appropriate
+   cluster operation.
+
+   For the same index we use one structure (type reiser4_cluster_t) to
+   represent all data/page/disk clusters.  (EDWARD-FIXME-HANS: are you
+   sure that is good style? and where is the code that goes with this comment....;-) )
+*/
Index: linux-2.6.8.1-ck/fs/reiser4/cluster.h
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/cluster.h	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/cluster.h	2004-08-22 19:35:33.617653335 +1000
@@ -0,0 +1,182 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* This file contains page/cluster index translators and offset modulators
+   See http://www.namesys.com/cryptcompress_design.html for details */
+
+#if !defined( __FS_REISER4_CLUSTER_H__ )
+#define __FS_REISER4_CLUSTER_H__
+
+static inline loff_t min_count(loff_t a, loff_t b)	/* the smaller of @a and @b */
+{
+	return (b > a) ? a : b;
+}
+
+static inline __u8 inode_cluster_shift (struct inode * inode)	/* log2 of the number of pages per cluster */
+{
+	assert("edward-92", inode != NULL);
+	assert("edward-93", reiser4_inode_data(inode) != NULL);
+	assert("edward-94", inode_get_flag(inode, REISER4_CLUSTER_KNOWN));
+
+	return reiser4_inode_data(inode)->cluster_shift;
+}
+
+/* returns number of pages in the cluster of @inode, that is, 1 << inode_cluster_shift(inode) */
+static inline int inode_cluster_pages (struct inode * inode)
+{
+	return (1 << inode_cluster_shift(inode));
+}
+
+static inline size_t inode_cluster_size (struct inode * inode)	/* cluster size of @inode, in bytes */
+{
+	assert("edward-96", inode != NULL);
+
+	return (PAGE_CACHE_SIZE << inode_cluster_shift(inode));
+}
+
+/* index of the cluster of @inode that contains the page with index @idx */
+static inline unsigned long pg_to_clust(unsigned long idx, struct inode * inode)
+{
+	return (idx >> inode_cluster_shift(inode));
+}
+
+/* index of the first page of the cluster with index @idx */
+static inline unsigned long clust_to_pg(unsigned long idx, struct inode * inode)
+{
+	return (idx << inode_cluster_shift(inode));
+}
+
+/* index of the first page of the cluster that contains page @idx */
+static inline unsigned long pg_to_clust_to_pg(unsigned long idx, struct inode * inode)
+{
+	return clust_to_pg(pg_to_clust(idx, inode), inode);
+}
+
+/* index of the page that contains file offset @off */
+static inline unsigned long off_to_pg(loff_t off)
+{
+	return off >> PAGE_CACHE_SHIFT;
+}
+
+/* file offset of the first byte of the page with index @idx */
+static inline loff_t pg_to_off(unsigned long idx)
+{
+	return (loff_t) idx << PAGE_CACHE_SHIFT;
+}
+
+/* index of the cluster of @inode that contains file offset @off */
+static inline unsigned long off_to_clust(loff_t off, struct inode * inode)
+{
+	return pg_to_clust(off_to_pg(off), inode);
+}
+
+/* file offset of the first byte of the cluster with index @idx */
+static inline loff_t clust_to_off(unsigned long idx, struct inode * inode)
+{
+	return pg_to_off(clust_to_pg(idx, inode));
+}
+
+/* file offset of the first byte of the cluster that contains offset @off */
+static inline loff_t off_to_clust_to_off(loff_t off, struct inode * inode)
+{
+	return clust_to_off(off_to_clust(off, inode), inode);
+}
+
+/* index of the first page of the cluster that contains offset @off */
+static inline unsigned long off_to_clust_to_pg(loff_t off, struct inode * inode)
+{
+	return clust_to_pg(off_to_clust(off, inode), inode);
+}
+
+/* position of file offset @off within the page that contains it */
+static inline unsigned off_to_pgoff(loff_t off)
+{
+	return (off & (PAGE_CACHE_SIZE - 1));
+}
+
+/* position of file offset @off within the cluster that contains it */
+static inline unsigned off_to_cloff(loff_t off, struct inode * inode)
+{
+	return (off & ((loff_t)(inode_cluster_size(inode)) - 1));
+}
+
+/* position of the first byte of page @idx within the cluster that contains it */
+static inline unsigned pg_to_off_to_cloff(unsigned long idx, struct inode * inode)
+{
+	return off_to_cloff(pg_to_off(idx), inode);
+}
+
+/* index of the page which contains the last byte of a file of @size bytes; 0 when @size is 0 */
+static inline pgoff_t size_to_pg(loff_t size)
+{
+	if (size == 0)
+		return 0;
+	return off_to_pg(size - 1);
+}
+
+/* minimal index of the page which doesn't contain file data, for a file of @size bytes */
+static inline pgoff_t size_to_next_pg(loff_t size)
+{
+	if (size == 0)
+		return 0;
+	return off_to_pg(size - 1) + 1;
+}
+
+/* number of bytes of the page with index @idx that lie below file offset @off */
+static inline unsigned off_to_pgcount(loff_t off, unsigned long idx)
+{
+	unsigned long last = off_to_pg(off);	/* page containing @off */
+
+	if (idx != last)
+		return (idx > last) ? 0 : PAGE_CACHE_SIZE;
+	return off_to_pgoff(off);
+}
+
+/* number of bytes of the cluster with index @idx that lie below file offset @off */
+static inline unsigned off_to_count(loff_t off, unsigned long idx, struct inode * inode)
+{
+	unsigned long last = off_to_clust(off, inode);	/* cluster containing @off */
+
+	if (idx != last)
+		return (idx > last) ? 0 : inode_cluster_size(inode);
+	return off_to_cloff(off, inode);
+}
+
+/* number of bytes of cluster @clust that lie below the current size (->i_size) of @inode */
+static inline unsigned fsize_to_count(reiser4_cluster_t * clust, struct inode * inode)
+{
+	assert("edward-288", clust != NULL);
+	assert("edward-289", inode != NULL);
+
+	return off_to_count(inode->i_size, clust->index, inode);
+}
+
+/* allocate clust->pages: one array element per page of a cluster of @inode. Returns 0 or -ENOMEM */
+static inline int alloc_clust_pages(reiser4_cluster_t * clust, struct inode * inode )
+{
+	size_t bytes;
+	assert("edward-791", clust != NULL);
+	assert("edward-792", inode != NULL);
+	bytes = sizeof(*clust->pages) << inode_cluster_shift(inode);
+	clust->pages = reiser4_kmalloc(bytes, GFP_KERNEL);
+	return (clust->pages == NULL) ? -ENOMEM : 0;
+}
+
+/* release the array obtained by alloc_clust_pages(); clust->pages itself is left untouched */
+static inline void free_clust_pages(reiser4_cluster_t * clust)
+{
+	reiser4_kfree(clust->pages);
+}
+
+#endif /* __FS_REISER4_CLUSTER_H__ */
+
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
Index: linux-2.6.8.1-ck/fs/reiser4/context.c
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/context.c	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/context.c	2004-08-22 19:35:33.618653175 +1000
@@ -0,0 +1,375 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Manipulation of reiser4_context */
+
+/*
+ * global context used during system call. Variable of this type is allocated
+ * on the stack at the beginning of the reiser4 part of the system call and
+ * pointer to it is stored in the current->fs_context. This allows us to avoid
+ * passing pointer to current transaction and current lockstack (both in
+ * one-to-one mapping with threads) all over the call chain.
+ *
+ * It's kind of like those global variables the prof used to tell you not to
+ * use in CS1, except thread specific.;-) Nikita, this was a good idea.
+ *
+ * In some situations it is desirable to have ability to enter reiser4_context
+ * more than once for the same thread (nested contexts). For example, there
+ * are some functions that can be called either directly from VFS/VM or from
+ * already active reiser4 context (->writepage, for example).
+ *
+ * In such situations "child" context acts like dummy: all activity is
+ * actually performed in the top level context, and get_current_context()
+ * always returns top level context. Of course, init_context()/done_context()
+ * have to be properly nested any way.
+ *
+ * Note that there is an important difference between reiser4 uses
+ * ->fs_context and the way other file systems use it. Other file systems
+ * (ext3 and reiserfs) use ->fs_context only for the duration of _transaction_
+ * (this is why ->fs_context was initially called ->journal_info). This means,
+ * that when ext3 or reiserfs finds that ->fs_context is not NULL on the entry
+ * to the file system, they assume that some transaction is already underway,
+ * and usually bail out, because starting nested transaction would most likely
+ * lead to the deadlock. This gives false positives with reiser4, because we
+ * set ->fs_context before starting transaction.
+ */
+
+#include "debug.h"
+#include "super.h"
+#include "context.h"
+
+#include <linux/writeback.h> /* balance_dirty_pages() */
+
+#if REISER4_DEBUG_CONTEXTS
+/* List of all currently active contexts, used for debugging purposes.  */
+context_list_head active_contexts;
+/* lock protecting access to active_contexts. */
+spinlock_t active_contexts_lock;
+
+void
+check_contexts(void)	/* debugging: assert that every context on active_contexts carries context_magic */
+{
+	reiser4_context *ctx;
+
+	spin_lock(&active_contexts_lock);	/* the list is only walked under its lock */
+	for_all_type_safe_list(context, &active_contexts, ctx) {
+		assert("vs-$BIGNUM", ctx->magic == context_magic);
+	}
+	spin_unlock(&active_contexts_lock);
+}
+/* REISER4_DEBUG_CONTEXTS */
+#endif
+
+struct {	/* snapshot filled in by get_context_ok() */
+	void *task;	/* value of current at the time of the call */
+	void *context;	/* context passed to the call */
+	void *path[16];	/* return addresses of the calling frames, innermost first */
+} context_ok;
+
+
+
+reiser4_internal void get_context_ok(reiser4_context *ctx)	/* debugging: record task, @ctx and call path */
+{
+	int i;
+	void *addr = NULL, *frame = NULL;
+
+#define CTX_FRAME(nr)						\
+	case (nr):						\
+		addr  = __builtin_return_address((nr));	 	\
+                frame = __builtin_frame_address(nr);		\
+		break
+
+	memset(&context_ok, 0, sizeof(context_ok));
+
+	context_ok.task = current;
+	context_ok.context = ctx;
+	for (i = 0; i < 16; i ++) {
+		switch(i) {	/* the builtins need constant arguments, hence one case per level */
+			CTX_FRAME(0);
+			CTX_FRAME(1);
+			CTX_FRAME(2);
+			CTX_FRAME(3);
+			CTX_FRAME(4);
+			CTX_FRAME(5);
+			CTX_FRAME(6);
+			CTX_FRAME(7);
+			CTX_FRAME(8);
+			CTX_FRAME(9);
+			CTX_FRAME(10);
+			CTX_FRAME(11);
+			CTX_FRAME(12);
+			CTX_FRAME(13);
+			CTX_FRAME(14);
+			CTX_FRAME(15);
+		default:
+			impossible("", "");
+		}
+		if (frame > (void *)ctx)	/* presumably: frames beyond the one holding @ctx are of no interest */
+			break;
+		context_ok.path[i] = addr;
+	}
+#undef CTX_FRAME
+}
+
+
+/* initialise context and bind it to the current thread
+
+   This function should be called at the beginning of reiser4 part of
+   syscall. If the thread is already in a reiser4 context for the same @super,
+   @context only becomes a nested context pointing to it. Always returns 0. */
+reiser4_internal int
+init_context(reiser4_context * context	/* pointer to the reiser4 context
+					 * being initialised */ ,
+	     struct super_block *super	/* super block we are going to
+					 * work with */)
+{
+	assert("nikita-2662", !in_interrupt() && !in_irq());
+	assert("nikita-3356", context != NULL);
+	assert("nikita-3357", super != NULL);
+	assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
+
+	xmemset(context, 0, sizeof *context);
+
+	if (is_in_reiser4_context()) {
+		reiser4_context *parent;
+
+		parent = (reiser4_context *) current->journal_info;
+		/* NOTE-NIKITA this is dubious */
+		if (parent->super == super) {
+			context->parent = parent;	/* nested context: nothing else is set up */
+#if (REISER4_DEBUG)
+			++context->parent->nr_children;
+#endif
+			return 0;
+		}
+	}
+
+	context->super = super;
+	context->magic = context_magic;
+	context->outer = current->journal_info;	/* put back by done_context() */
+	current->journal_info = (void *) context;
+
+	init_lock_stack(&context->stack);
+
+	txn_begin(context);
+
+	context->parent = context;	/* top-level context is its own parent */
+	tap_list_init(&context->taps);
+#if REISER4_DEBUG
+#if REISER4_DEBUG_CONTEXTS
+	context_list_clean(context);	/* to satisfy assertion */
+	spin_lock(&active_contexts_lock);
+	context_list_check(&active_contexts);
+	context_list_push_front(&active_contexts, context);
+	/*check_contexts();*/
+	spin_unlock(&active_contexts_lock);
+#endif
+	context->task = current;
+#endif
+	grab_space_enable();
+	return 0;
+}
+
+/* cast lock stack embedded into reiser4 context up to its container; @owner must be a ->stack of some context */
+reiser4_internal reiser4_context *
+get_context_by_lock_stack(lock_stack * owner)
+{
+	return container_of(owner, reiser4_context, stack);
+}
+
+/* non-zero iff current->journal_info points to something carrying the reiser4 context magic */
+reiser4_internal int
+is_in_reiser4_context(void)
+{
+	reiser4_context *candidate = current->journal_info;
+
+	/* this field is used by other file systems too, hence the magic check */
+	if (candidate == NULL)
+		return 0;
+	return ((unsigned long) candidate->magic) == context_magic;
+}
+
+/*
+ * call balance dirty pages for the current context.
+ *
+ * File system is expected to call balance_dirty_pages_ratelimited() whenever
+ * it dirties a page. reiser4 does this for unformatted nodes (that is, during
+ * write---this covers vast majority of all dirty traffic), but we cannot do
+ * this immediately when formatted node is dirtied, because long term lock is
+ * usually held at that time. To work around this, dirtying of formatted node
+ * simply increases ->nr_marked_dirty counter in the current reiser4
+ * context. When we are about to leave this context,
+ * balance_dirty_pages_ratelimited() is called, if necessary.
+ *
+ * This introduces another problem: sometimes we do not want to run
+ * balance_dirty_pages_ratelimited() when leaving a context, for example
+ * because some important lock (like ->i_sem on the parent directory) is
+ * held. To achieve this, ->nobalance flag can be set in the current context (see reiser4_exit_context()).
+ */
+static void
+balance_dirty_pages_at(reiser4_context * context)	/* context that is being left */
+{
+	reiser4_super_info_data * sbinfo = get_super_private(context->super);
+
+	/*
+	 * call balance_dirty_pages_ratelimited() to process formatted nodes
+	 * dirtied during this system call. All four conditions must hold.
+	 */
+	if (context->nr_marked_dirty != 0 &&   /* were any nodes dirtied? */
+	    /* aren't we called early during mount? */
+	    sbinfo->fake &&
+	    /* don't call balance dirty pages from ->writepage(): it's
+	     * deadlock prone */
+	    !(current->flags & PF_MEMALLOC) &&
+	    /* and don't stall pdflush */
+	    !current_is_pdflush())
+		balance_dirty_pages_ratelimited(sbinfo->fake->i_mapping);
+}
+
+/*
+ * exit reiser4 context. Call balance_dirty_pages_at() if necessary. Close
+ * transaction. Call done_context() to do context related book-keeping.
+ */
+reiser4_internal void reiser4_exit_context(reiser4_context * context)
+{
+	assert("nikita-3021", schedulable());
+
+	if (context == context->parent) {	/* top-level context only; nested ones skip to done_context() */
+		if (!context->nobalance) {
+			txn_restart(context);
+			balance_dirty_pages_at(context);
+		}
+		txn_end(context);
+	}
+	done_context(context);
+}
+
+/* release resources associated with context.
+
+   This function should be called at the end of "session" with reiser4,
+   typically just before leaving reiser4 driver back to VFS.
+
+   This is good place to put some debugging consistency checks, like that
+   thread released all locks and closed transcrash etc.
+
+*/
+reiser4_internal void
+done_context(reiser4_context * context /* context being released */)
+{
+	reiser4_context *parent;
+	assert("nikita-860", context != NULL);
+
+	parent = context->parent;
+	assert("nikita-2174", parent != NULL);
+	assert("nikita-2093", parent == parent->parent);
+	assert("nikita-859", parent->magic == context_magic);
+	assert("vs-646", (reiser4_context *) current->journal_info == parent);
+	assert("zam-686", !in_interrupt() && !in_irq());
+
+	/* only do anything when leaving top-level reiser4 context. All nested
+	 * contexts are just dummies. */
+	if (parent == context) {
+		assert("jmacd-673", parent->trans == NULL);
+		assert("jmacd-1002", lock_stack_isclean(&parent->stack));
+		assert("nikita-1936", no_counters_are_held());
+		assert("nikita-3403", !delayed_inode_updates(context->dirty));
+		assert("nikita-2626", tap_list_empty(taps_list()));
+		assert("zam-1004", get_super_private(context->super)->delete_sema_owner != current);
+
+		/* release all grabbed but as yet unused blocks */
+		if (context->grabbed_blocks != 0)
+			all_grabbed2free();
+
+		/*
+		 * synchronize against longterm_unlock_znode():
+		 * wake_up_requestor() wakes up requestors without holding
+		 * zlock (otherwise they will immediately bump into that lock
+		 * after wake up on another CPU). To work around (rare)
+		 * situation where requestor has been woken up asynchronously
+		 * and managed to run until completion (and destroy its
+		 * context and lock stack) before wake_up_requestor() called
+		 * wake_up() on it, wake_up_requestor() synchronize on lock
+		 * stack spin lock. It has actually been observed that spin
+		 * lock _was_ locked at this point, because
+		 * wake_up_requestor() took interrupt.
+		 */
+		spin_lock_stack(&context->stack);
+		spin_unlock_stack(&context->stack);
+
+#if REISER4_DEBUG_CONTEXTS
+		/* remove from active contexts */
+		spin_lock(&active_contexts_lock);
+		/*check_contexts();*/
+		context_list_remove(parent);
+		spin_unlock(&active_contexts_lock);
+#endif
+		assert("zam-684", context->nr_children == 0);
+		/* restore original ->fs_context value (saved in ->outer by init_context()) */
+		current->journal_info = context->outer;
+	} else {
+#if REISER4_DEBUG
+		parent->nr_children--;
+		assert("zam-685", parent->nr_children >= 0);
+#endif
+	}
+}
+
+/* Initialize list of all contexts and its lock (only if REISER4_DEBUG_CONTEXTS); always returns 0 */
+reiser4_internal int
+init_context_mgr(void)
+{
+#if REISER4_DEBUG_CONTEXTS
+	spin_lock_init(&active_contexts_lock);
+	context_list_init(&active_contexts);
+#endif
+	return 0;
+}
+
+#if REISER4_DEBUG_OUTPUT
+/* debugging function: output reiser4 context contents in the human readable
+ * form; each line is tagged with @prefix. A NULL @context is reported and skipped. */
+reiser4_internal void
+print_context(const char *prefix, reiser4_context * context)
+{
+	if (context == NULL) {
+		printk("%s: null context\n", prefix);
+		return;
+	}
+#if REISER4_TRACE
+	printk("%s: trace_flags: %x\n", prefix, context->trace_flags);
+#endif
+	print_lock_counters("\tlocks", &context->locks);
+#if REISER4_DEBUG
+	printk("pid: %i, comm: %s\n", context->task->pid, context->task->comm);
+#endif
+	print_lock_stack("\tlock stack", &context->stack);
+	info_atom("\tatom", context->trans_in_ctx.atom);
+}
+
+#if REISER4_DEBUG_CONTEXTS
+/* debugging: dump contents of all active contexts, holding active_contexts_lock throughout */
+void
+print_contexts(void)
+{
+	reiser4_context *context;
+
+	spin_lock(&active_contexts_lock);
+
+	for_all_type_safe_list(context, &active_contexts, context) {
+		print_context("context", context);
+	}
+
+	spin_unlock(&active_contexts_lock);
+}
+#endif
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
Index: linux-2.6.8.1-ck/fs/reiser4/context.h
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/context.h	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/context.h	2004-08-22 19:35:33.619653016 +1000
@@ -0,0 +1,315 @@
+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Reiser4 context. See context.c for details. */
+
+#if !defined( __REISER4_CONTEXT_H__ )
+#define __REISER4_CONTEXT_H__
+
+#include "forward.h"
+#include "debug.h"
+#include "spin_macros.h"
+#include "dformat.h"
+#include "type_safe_list.h"
+#include "tap.h"
+#include "lock.h"
+
+#include <linux/types.h>	/* for __u??  */
+#include <linux/fs.h>		/* for struct super_block  */
+#include <linux/spinlock.h>
+#include <linux/sched.h>	/* for struct task_struct */
+
+/* list of active contexts */
+#if REISER4_DEBUG_CONTEXTS
+TYPE_SAFE_LIST_DECLARE(context);
+#endif
+
+ON_DEBUG(TYPE_SAFE_LIST_DECLARE(flushers);)
+
+#if REISER4_DEBUG
+
+/*
+ * Stat-data update tracking.
+ *
+ * Some reiser4 functions (reiser4_{del,add}_nlink()) take an additional
+ * parameter indicating whether stat-data update should be performed. This is
+ * because sometimes fields of the same inode are modified several times during
+ * a single system call, and updating stat-data (which implies tree lookup and,
+ * sometimes, tree balancing) on each inode modification is too expensive. To
+ * avoid unnecessary stat-data updates, we pass a flag to not update it during
+ * inode field updates, and update it manually at the end of the system call.
+ *
+ * This introduces a possibility of "missed stat data update" when final
+ * stat-data update is not performed in some code path. To detect and track
+ * down such situations following code was developed.
+ *
+ * dirty_inode_info is an array of slots. Each slot keeps information about
+ * "delayed stat data update", that is about a call to a function modifying
+ * inode field that was instructed to not update stat data. Direct call to
+ * reiser4_update_sd() clears corresponding slot. On leaving reiser4 context
+ * all slots are scanned and information about still not forced updates is
+ * printed.
+ */
+
+/* how many delayed stat data update slots to remember */
+#define TRACKED_DELAYED_UPDATE (0)
+
+typedef struct {
+	ino_t ino;      /* inode number of object with delayed stat data
+			 * update */
+	int   delayed;  /* 1 if update is delayed, 0 if update was forced */
+	void *stack[4]; /* stack back-trace of the call chain where update was
+			 * delayed */
+} dirty_inode_info[TRACKED_DELAYED_UPDATE];
+
+extern void mark_inode_update(struct inode *object, int immediate);
+extern int  delayed_inode_updates(dirty_inode_info info);
+
+#else
+
+typedef struct {} dirty_inode_info;
+
+#define mark_inode_update(object, immediate) noop
+#define delayed_inode_updates(info) noop
+
+#endif
+
+/* reiser4 per-thread context */
+struct reiser4_context {
+	/* magic constant. For identification of reiser4 contexts. */
+	__u32 magic;
+
+	/* current lock stack. See lock.[ch]. This is where list of all
+	   locks taken by current thread is kept. This is also used in
+	   deadlock detection. */
+	lock_stack stack;
+
+	/* current transcrash. */
+	txn_handle *trans;
+	/* transaction handle embedded into reiser4_context. ->trans points
+	 * here by default. */
+	txn_handle trans_in_ctx;
+
+	/* super block we are working with.  To get the current tree
+	   use &get_super_private (reiser4_get_current_sb ())->tree. */
+	struct super_block *super;
+
+	/* parent fs activation */
+	struct fs_activation *outer;
+
+	/* per-thread grabbed (for further allocation) blocks counter */
+	reiser4_block_nr grabbed_blocks;
+
+	/* parent context */
+	reiser4_context *parent;
+
+	/* list of taps currently monitored. See tap.c */
+	tap_list_head taps;
+
+	/* grabbing space is enabled. NB: the 1-bit flags are unsigned so that a set flag reads back as 1, not -1 */
+	unsigned int grab_enabled  :1;
+	/* should be set when we are writing dirty nodes to disk in jnode_flush
+	 * or reiser4_write_logs() */
+	unsigned int writeout_mode :1;
+	/* true, if current thread is an ent thread */
+	unsigned int entd          :1;
+	/* true, if balance_dirty_pages() should not be run when leaving this
+	 * context. This is used to avoid lengthy balance_dirty_pages()
+	 * operation when holding some important resource, like directory
+	 * ->i_sem */
+	unsigned int nobalance     :1;
+
+	/* count non-trivial jnode_set_dirty() calls */
+	unsigned long nr_marked_dirty;
+#if REISER4_DEBUG
+	/* A link of all active contexts. */
+	context_list_link contexts_link;
+	/* debugging information about reiser4 locks held by the current
+	 * thread */
+	lock_counters_info locks;
+	int nr_children;	/* number of child contexts */
+	struct task_struct *task; /* so we can easily find owner of the stack */
+
+	/*
+	 * disk space grabbing debugging support
+	 */
+	/* how many disk blocks were grabbed by the first call to
+	 * reiser4_grab_space() in this context */
+	reiser4_block_nr grabbed_initially;
+	/* stack back-trace of the first call to reiser4_grab_space() in this
+	 * context */
+	backtrace_path   grabbed_at;
+
+	/* list of all threads doing flush currently */
+	flushers_list_link  flushers_link;
+	/* information about last error encountered by reiser4 */
+	err_site err;
+	/* information about delayed stat data updates. See above. */
+	dirty_inode_info dirty;
+#endif
+
+#if REISER4_TRACE
+	/* per-thread tracing flags. Use reiser4_trace_flags enum to set
+	   bits in it. */
+	__u32 trace_flags;
+#endif
+#if REISER4_DEBUG_NODE
+	/*
+	 * don't perform node consistency checks while this is greater than
+	 * zero. Used during operations that temporary violate node
+	 * consistency.
+	 */
+	int disable_node_check;
+#endif
+};
+
+#if REISER4_DEBUG_CONTEXTS
+TYPE_SAFE_LIST_DEFINE(context, reiser4_context, contexts_link);
+#endif
+#if REISER4_DEBUG
+TYPE_SAFE_LIST_DEFINE(flushers, reiser4_context, flushers_link);
+#endif
+
+extern reiser4_context *get_context_by_lock_stack(lock_stack *);
+
+/* Debugging helps. */
+extern int init_context_mgr(void);
+#if REISER4_DEBUG_OUTPUT
+extern void print_context(const char *prefix, reiser4_context * ctx);
+#else
+#define print_context(p,c) noop
+#endif
+
+#if REISER4_DEBUG_CONTEXTS && REISER4_DEBUG_OUTPUT
+extern void print_contexts(void);
+#else
+#define print_contexts() noop
+#endif
+
+#if REISER4_DEBUG_CONTEXTS
+extern void check_contexts(void);
+#else
+#define check_contexts() noop
+#endif
+
+#define current_tree (&(get_super_private(reiser4_get_current_sb())->tree))
+#define current_blocksize reiser4_get_current_sb()->s_blocksize
+#define current_blocksize_bits reiser4_get_current_sb()->s_blocksize_bits
+
+extern int init_context(reiser4_context * context, struct super_block *super);
+extern void done_context(reiser4_context * context);
+
+/* magic constant we store in reiser4_context allocated at the stack. Used to
+   catch accesses to stale or uninitialized contexts. */
+#define context_magic ((__u32) 0x4b1b5d0b)
+
+extern int is_in_reiser4_context(void);
+
+/* return context associated with given thread */
+
+void get_context_ok(reiser4_context *);
+
+/*
+ * return reiser4_context for the thread @tsk
+ */
+static inline reiser4_context *
+get_context(const struct task_struct *tsk)
+{
+	assert("vs-1682", ((reiser4_context *) tsk->journal_info)->magic == context_magic);
+	return (reiser4_context *) tsk->journal_info;
+}
+
+/*
+ * return reiser4 context of the current thread, or NULL if there is none.
+ */
+static inline reiser4_context *
+get_current_context_check(void)
+{
+	if (is_in_reiser4_context())
+		return get_context(current);
+	else
+		return NULL;
+}
+
+static inline reiser4_context * get_current_context(void);/* __attribute__((const));*/
+
+/* return context associated with current thread */
+static inline reiser4_context *
+get_current_context(void)
+{
+	return get_context(current);
+}
+
+/*
+ * true if current thread is in the write-out mode. Thread enters write-out
+ * mode during jnode_flush and reiser4_write_logs().
+ */
+static inline int is_writeout_mode(void)
+{
+	return get_current_context()->writeout_mode;
+}
+
+/*
+ * enter write-out mode
+ */
+static inline void writeout_mode_enable(void)
+{
+	assert("zam-941", !get_current_context()->writeout_mode);
+	get_current_context()->writeout_mode = 1;
+}
+
+/*
+ * leave write-out mode
+ */
+static inline void writeout_mode_disable(void)
+{
+	assert("zam-942", get_current_context()->writeout_mode);
+	get_current_context()->writeout_mode = 0;
+}
+
+static inline void grab_space_enable(void)
+{
+	get_current_context()->grab_enabled = 1;
+}
+
+static inline void grab_space_disable(void)
+{
+	get_current_context()->grab_enabled = 0;
+}
+
+static inline void grab_space_set_enabled (int enabled)
+{
+	get_current_context()->grab_enabled = enabled;
+}
+
+static inline int is_grab_enabled(reiser4_context *ctx)
+{
+	return ctx->grab_enabled;
+}
+
+/* mark transaction handle in @ctx as TXNH_DONT_COMMIT, so that no commit or
+ * flush would be performed when it is closed. This is necessary when handle
+ * has to be closed under some coarse semaphore, like i_sem of
+ * directory. Commit will be performed by ktxnmgrd. */
+static inline void context_set_commit_async(reiser4_context * context)
+{
+	context = context->parent;
+	context->nobalance = 1;
+	context->trans->flags |= TXNH_DONT_COMMIT;
+}
+
+extern void reiser4_exit_context(reiser4_context * context);
+
+/* __REISER4_CONTEXT_H__ */
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
Index: linux-2.6.8.1-ck/fs/reiser4/coord.c
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/coord.c	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/coord.c	2004-08-22 19:35:33.620652857 +1000
@@ -0,0 +1,1003 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "tree.h"
+#include "plugin/item/item.h"
+#include "znode.h"
+#include "coord.h"
+
+/* Internal constructor. */
+static inline void
+coord_init_values(coord_t *coord, const znode *node, pos_in_node_t item_pos,
+		  pos_in_node_t unit_pos, between_enum between)
+{
+	coord->node = (znode *) node;
+	coord_set_item_pos(coord, item_pos);
+	coord->unit_pos = unit_pos;
+	coord->between = between;
+	ON_DEBUG(coord->plug_v = 0);
+	ON_DEBUG(coord->body_v = 0);
+
+	/*ON_TRACE (TRACE_COORDS, "init coord %p node %p: %u %u %s\n", coord, node, item_pos, unit_pos, coord_tween_tostring (between)); */
+}
+
+/* after shifting of node content, coord previously set properly may become invalid, try to "normalize" it: a
+   "before" position just past the last item/unit is re-expressed as "after" the last existing item/unit. */
+reiser4_internal void
+coord_normalize(coord_t *coord)
+{
+	znode *node;
+
+	node = coord->node;
+	assert("vs-683", node);
+
+	coord_clear_iplug(coord);
+
+	if (node_is_empty(node)) {
+		coord_init_first_unit(coord, node);
+	} else if ((coord->between == AFTER_ITEM) || (coord->between == AFTER_UNIT)) {
+		return;
+	} else if (coord->between == BEFORE_ITEM && coord->item_pos == coord_num_items(coord)) {
+		coord_dec_item_pos(coord);
+		coord->between = AFTER_ITEM;
+	} else if (coord->between == BEFORE_UNIT && coord->item_pos == coord_num_items(coord) && coord->unit_pos == 0) {
+		coord_dec_item_pos(coord);	/* handled before coord_num_units(): item_pos names no existing item here */
+		coord->unit_pos = 0;
+		coord->between = AFTER_ITEM;
+	} else if (coord->between == BEFORE_UNIT && coord->unit_pos == coord_num_units(coord)) {
+		coord->unit_pos--;	/* ->between is tested first so coord_num_units() only sees BEFORE_UNIT coords */
+		coord->between = AFTER_UNIT;
+	}
+}
+
+/* Copy a coordinate. */
+reiser4_internal void
+coord_dup(coord_t * coord, const coord_t * old_coord)
+{
+	assert("jmacd-9800", coord_check(old_coord));
+	coord_dup_nocheck(coord, old_coord);
+}
+
+/* Copy a coordinate without check. Useful when old_coord->node is not
+   loaded. As in cbk_tree_lookup -> connect_znode -> connect_one_side */
+reiser4_internal void
+coord_dup_nocheck(coord_t * coord, const coord_t * old_coord)
+{
+	coord->node = old_coord->node;
+	coord_set_item_pos(coord, old_coord->item_pos);
+	coord->unit_pos = old_coord->unit_pos;
+	coord->between = old_coord->between;
+	coord->iplugid = old_coord->iplugid;
+	ON_DEBUG(coord->plug_v = old_coord->plug_v);
+	ON_DEBUG(coord->body_v = old_coord->body_v);
+}
+
+/* Initialize an invalid coordinate. */
+reiser4_internal void
+coord_init_invalid(coord_t * coord, const znode * node)
+{
+	coord_init_values(coord, node, 0, 0, INVALID_COORD);
+}
+
+reiser4_internal void
+coord_init_first_unit_nocheck(coord_t * coord, const znode * node)
+{
+	coord_init_values(coord, node, 0, 0, AT_UNIT);
+}
+
+/* Initialize a coordinate to point at the first unit of the first item.  If the node is
+   empty, it is positioned at the EMPTY_NODE. */
+reiser4_internal void
+coord_init_first_unit(coord_t * coord, const znode * node)
+{
+	int is_empty = node_is_empty(node);
+
+	coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : AT_UNIT));
+
+	assert("jmacd-9801", coord_check(coord));
+}
+
+/* Initialize a coordinate to point at the last unit of the last item.  If the node is
+   empty, it is positioned at the EMPTY_NODE. */
+reiser4_internal void
+coord_init_last_unit(coord_t * coord, const znode * node)
+{
+	int is_empty = node_is_empty(node);
+
+	coord_init_values(coord, node, (is_empty ? 0 : node_num_items(node) - 1), 0, (is_empty ? EMPTY_NODE : AT_UNIT));
+	if (!is_empty)
+		coord->unit_pos = coord_last_unit_pos(coord);
+	assert("jmacd-9802", coord_check(coord));
+}
+
+/* Initialize a coordinate to before the first item.  If the node is empty, it is
+   positioned at the EMPTY_NODE. */
+reiser4_internal void
+coord_init_before_first_item(coord_t * coord, const znode * node)
+{
+	int is_empty = node_is_empty(node);
+
+	coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : BEFORE_UNIT));
+
+	assert("jmacd-9803", coord_check(coord));
+}
+
+/* Initialize a coordinate to after the last item.  If the node is empty, it is positioned
+   at the EMPTY_NODE. */
+reiser4_internal void
+coord_init_after_last_item(coord_t * coord, const znode * node)
+{
+	int is_empty = node_is_empty(node);
+
+	coord_init_values(coord, node,
+			  (is_empty ? 0 : node_num_items(node) - 1), 0, (is_empty ? EMPTY_NODE : AFTER_ITEM));
+
+	assert("jmacd-9804", coord_check(coord));
+}
+
+/* Initialize a coordinate to after last unit in the item. Coord must be set
+   already to existing item */
+reiser4_internal void
+coord_init_after_item_end(coord_t * coord)
+{
+	coord->between = AFTER_UNIT;
+	coord->unit_pos = coord_last_unit_pos(coord);
+}
+
+/* Initialize a coordinate to before the item. Coord must be set already to existing item */
+reiser4_internal void
+coord_init_before_item(coord_t * coord)
+{
+	coord->unit_pos = 0;
+	coord->between = BEFORE_ITEM;
+}
+
+/* Initialize a coordinate to after the item. Coord must be set already to existing item */
+reiser4_internal void
+coord_init_after_item(coord_t * coord)
+{
+	coord->unit_pos = 0;
+	coord->between = AFTER_ITEM;
+}
+
+/* Initialize a coordinate by 0s. Used in places where init_coord was used and
+   it was not clear how actually */
+reiser4_internal void
+coord_init_zero(coord_t * coord)
+{
+	xmemset(coord, 0, sizeof (*coord));
+}
+
+/* Return the number of units at the present item.  Asserts coord_is_existing_item(). */
+reiser4_internal unsigned
+coord_num_units(const coord_t * coord)
+{
+	assert("jmacd-9806", coord_is_existing_item(coord));
+
+	return item_plugin_by_coord(coord)->b.nr_units(coord);
+}
+
+/* Returns true if the coord was initialized by coord_init_invalid (). */
+/* Audited by: green(2002.06.15) */
+reiser4_internal int
+coord_is_invalid(const coord_t * coord)
+{
+	return coord->between == INVALID_COORD;
+}
+
+/* Returns true if the coordinate is positioned at an existing item, not before or after
+   an item.  It may be placed at, before, or after any unit within the item, whether
+   existing or not. */
+reiser4_internal int
+coord_is_existing_item(const coord_t * coord)
+{
+	switch (coord->between) {
+	case EMPTY_NODE:
+	case BEFORE_ITEM:
+	case AFTER_ITEM:
+	case INVALID_COORD:
+		return 0;
+
+	case BEFORE_UNIT:
+	case AT_UNIT:
+	case AFTER_UNIT:
+		return coord->item_pos < coord_num_items(coord);
+	}
+
+	IF_TRACE(TRACE_COORDS, print_coord("unreachable", coord, 0));
+	impossible("jmacd-9900", "unreachable coord: %p", coord);
+	return 0;
+}
+
+/* Returns true if the coordinate is positioned at an existing unit, not before or after a
+   unit. */
+/* Audited by: green(2002.06.15) */
+reiser4_internal int
+coord_is_existing_unit(const coord_t * coord)
+{
+	switch (coord->between) {
+	case EMPTY_NODE:
+	case BEFORE_UNIT:
+	case AFTER_UNIT:
+	case BEFORE_ITEM:
+	case AFTER_ITEM:
+	case INVALID_COORD:
+		return 0;
+
+	case AT_UNIT:
+		return (coord->item_pos < coord_num_items(coord) && coord->unit_pos < coord_num_units(coord));
+	}
+
+	impossible("jmacd-9902", "unreachable");
+	return 0;
+}
+
+/* Returns true if the coordinate is positioned at the first unit of the first item.  Not
+   true for empty nodes nor coordinates positioned before the first item. */
+/* Audited by: green(2002.06.15) */
+reiser4_internal int
+coord_is_leftmost_unit(const coord_t * coord)
+{
+	return (coord->between == AT_UNIT && coord->item_pos == 0 && coord->unit_pos == 0);
+}
+
+#if REISER4_DEBUG
+/* For assertions only, checks for a valid coordinate. Returns 1 if @coord passes the checks below, 0 otherwise. */
+int
+coord_check(const coord_t * coord)
+{
+	if (coord->node == NULL) {
+		return 0;
+	}
+	if (znode_above_root(coord->node))
+		return 1;
+
+	switch (coord->between) {
+	default:
+	case INVALID_COORD:
+		return 0;
+	case EMPTY_NODE:
+		if (!node_is_empty(coord->node)) {
+			return 0;
+		}
+		return coord->item_pos == 0 && coord->unit_pos == 0;
+
+	case BEFORE_UNIT:
+	case AFTER_UNIT:
+		if (node_is_empty(coord->node) && (coord->item_pos == 0) && (coord->unit_pos == 0))
+			return 1;	/* otherwise FALLTHROUGH to the range checks after the switch */
+	case AT_UNIT:
+		break;
+	case AFTER_ITEM:
+	case BEFORE_ITEM:
+		/* before/after item should not set unit_pos. */
+		if (coord->unit_pos != 0) {
+			return 0;
+		}
+		break;
+	}
+
+	if (coord->item_pos >= node_num_items(coord->node)) {
+		return 0;
+	}
+
+	/* FIXME-VS: we are going to check unit_pos. This makes no sense when
+	   between is set either AFTER_ITEM or BEFORE_ITEM */
+	if (coord->between == AFTER_ITEM || coord->between == BEFORE_ITEM)
+		return 1;
+
+	if (coord_is_iplug_set(coord) &&
+	    coord->unit_pos > item_plugin_by_coord(coord)->b.nr_units(coord) - 1) {
+		return 0;
+	}
+	return 1;
+}
+#endif
+
+/* Adjust coordinate boundaries based on the number of items prior to coord_next/prev.
+   Returns 1 if the new position does not exist. */
+static int
+coord_adjust_items(coord_t * coord, unsigned items, int is_next)
+{
+	/* If the coord is invalid, leave it. */
+	if (coord->between == INVALID_COORD) {
+		return 1;
+	}
+
+	/* If the node is empty, set it appropriately. */
+	if (items == 0) {
+		coord->between = EMPTY_NODE;
+		coord_set_item_pos(coord, 0);
+		coord->unit_pos = 0;
+		return 1;
+	}
+
+	/* If it was empty and it no longer is, set to BEFORE/AFTER_ITEM. */
+	if (coord->between == EMPTY_NODE) {
+		coord->between = (is_next ? BEFORE_ITEM : AFTER_ITEM);
+		coord_set_item_pos(coord, 0);
+		coord->unit_pos = 0;
+		return 0;
+	}
+
+	/* If the item_pos is out-of-range, set it appropriately. */
+	if (coord->item_pos >= items) {
+		coord->between = AFTER_ITEM;
+		coord_set_item_pos(coord, items - 1);
+		coord->unit_pos = 0;
+		/* If is_next, return 1 (can't go any further). */
+		return is_next;
+	}
+
+	return 0;
+}
+
+/* Advances the coordinate by one unit to the right.  If empty, no change.  If
+   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new position is an
+   existing unit. */
+reiser4_internal int
+coord_next_unit(coord_t * coord)
+{
+	unsigned items = coord_num_items(coord);
+
+	if (coord_adjust_items(coord, items, 1) == 1) {
+		return 1;
+	}
+
+	switch (coord->between) {
+	case BEFORE_UNIT:
+		/* Now it is positioned at the same unit. */
+		coord->between = AT_UNIT;
+		return 0;
+
+	case AFTER_UNIT:
+	case AT_UNIT:
+		/* If it was at or after a unit and there are more units in this item,
+		   advance to the next one. */
+		if (coord->unit_pos < coord_last_unit_pos(coord)) {
+			coord->unit_pos += 1;
+			coord->between = AT_UNIT;
+			return 0;
+		}
+
+		/* Otherwise, it is crossing an item boundary and treated as if it was
+		   after the current item. */
+		coord->between = AFTER_ITEM;
+		coord->unit_pos = 0;
+		/* FALLTHROUGH */
+
+	case AFTER_ITEM:
+		/* Check for end-of-node. */
+		if (coord->item_pos == items - 1) {
+			return 1;
+		}
+
+		coord_inc_item_pos(coord);
+		coord->unit_pos = 0;
+		coord->between = AT_UNIT;
+		return 0;
+
+	case BEFORE_ITEM:
+		/* The adjust_items checks ensure that we are valid here. */
+		coord->unit_pos = 0;
+		coord->between = AT_UNIT;
+		return 0;
+
+	case INVALID_COORD:
+	case EMPTY_NODE:
+		/* Handled in coord_adjust_items(). */
+		break;
+	}
+
+	impossible("jmacd-9902", "unreachable");
+	return 0;
+}
+
+/* Advances the coordinate by one item to the right.  If empty, no change.  If
+   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new position is
+   an existing item. */
+reiser4_internal int
+coord_next_item(coord_t * coord)
+{
+	unsigned items = coord_num_items(coord);
+
+	if (coord_adjust_items(coord, items, 1) == 1) {
+		return 1;
+	}
+
+	switch (coord->between) {
+	case AFTER_UNIT:
+	case AT_UNIT:
+	case BEFORE_UNIT:
+	case AFTER_ITEM:
+		/* Check for end-of-node. */
+		if (coord->item_pos == items - 1) {
+			coord->between = AFTER_ITEM;
+			coord->unit_pos = 0;
+			coord_clear_iplug(coord);
+			return 1;
+		}
+
+		/* Anywhere in an item, go to the next one. */
+		coord->between = AT_UNIT;
+		coord_inc_item_pos(coord);
+		coord->unit_pos = 0;
+		return 0;
+
+	case BEFORE_ITEM:
+		/* The out-of-range check ensures that we are valid here. */
+		coord->unit_pos = 0;
+		coord->between = AT_UNIT;
+		return 0;
+	case INVALID_COORD:
+	case EMPTY_NODE:
+		/* Handled in coord_adjust_items(). */
+		break;
+	}
+
+	impossible("jmacd-9903", "unreachable");
+	return 0;
+}
+
+/* Advances the coordinate by one unit to the left.  If empty, no change.  If
+   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM.  Returns 0 if new position
+   is an existing unit. */
+reiser4_internal int
+coord_prev_unit(coord_t * coord)
+{
+	unsigned items = coord_num_items(coord);
+
+	if (coord_adjust_items(coord, items, 0) == 1) {
+		return 1;
+	}
+
+	switch (coord->between) {
+	case AT_UNIT:
+	case BEFORE_UNIT:
+		if (coord->unit_pos > 0) {
+			coord->unit_pos -= 1;
+			coord->between = AT_UNIT;
+			return 0;
+		}
+
+		if (coord->item_pos == 0) {
+			coord->between = BEFORE_ITEM;
+			return 1;
+		}
+
+		coord_dec_item_pos(coord);
+		coord->unit_pos = coord_last_unit_pos(coord);
+		coord->between = AT_UNIT;
+		return 0;
+
+	case AFTER_UNIT:
+		/* What if unit_pos is out-of-range? */
+		assert("jmacd-5442", coord->unit_pos <= coord_last_unit_pos(coord));
+		coord->between = AT_UNIT;
+		return 0;
+
+	case BEFORE_ITEM:
+		if (coord->item_pos == 0) {
+			return 1;
+		}
+
+		coord_dec_item_pos(coord);
+		/* FALLTHROUGH */
+
+	case AFTER_ITEM:
+		coord->between = AT_UNIT;
+		coord->unit_pos = coord_last_unit_pos(coord);
+		return 0;
+
+	case INVALID_COORD:
+	case EMPTY_NODE:
+		break;
+	}
+
+	impossible("jmacd-9904", "unreachable");
+	return 0;
+}
+
+/* Advances the coordinate by one item to the left.  If empty, no change.  If
+   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM.  Returns 0 if new position
+   is an existing item. */
+reiser4_internal int
+coord_prev_item(coord_t * coord)
+{
+	unsigned items = coord_num_items(coord);
+
+	if (coord_adjust_items(coord, items, 0) == 1) {
+		return 1;
+	}
+
+	switch (coord->between) {
+	case AT_UNIT:
+	case AFTER_UNIT:
+	case BEFORE_UNIT:
+	case BEFORE_ITEM:
+
+		if (coord->item_pos == 0) {
+			coord->between = BEFORE_ITEM;
+			coord->unit_pos = 0;
+			return 1;
+		}
+
+		coord_dec_item_pos(coord);
+		coord->unit_pos = 0;
+		coord->between = AT_UNIT;
+		return 0;
+
+	case AFTER_ITEM:
+		coord->between = AT_UNIT;
+		coord->unit_pos = 0;
+		return 0;
+
+	case INVALID_COORD:
+	case EMPTY_NODE:
+		break;
+	}
+
+	impossible("jmacd-9905", "unreachable");
+	return 0;
+}
+
+/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
+reiser4_internal void
+coord_init_sideof_unit(coord_t * coord, const znode * node, sideof dir)
+{
+	assert("jmacd-9821", dir == LEFT_SIDE || dir == RIGHT_SIDE);
+	if (dir == LEFT_SIDE) {
+		coord_init_first_unit(coord, node);
+	} else {
+		coord_init_last_unit(coord, node);
+	}
+}
+
+/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
+   argument. */
+/* Audited by: green(2002.06.15) */
+reiser4_internal int
+coord_is_after_sideof_unit(coord_t * coord, sideof dir)
+{
+	assert("jmacd-9822", dir == LEFT_SIDE || dir == RIGHT_SIDE);
+	if (dir == LEFT_SIDE) {
+		return coord_is_before_leftmost(coord);
+	} else {
+		return coord_is_after_rightmost(coord);
+	}
+}
+
+/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
+/* Audited by: green(2002.06.15) */
+reiser4_internal int
+coord_sideof_unit(coord_t * coord, sideof dir)
+{
+	assert("jmacd-9823", dir == LEFT_SIDE || dir == RIGHT_SIDE);
+	if (dir == LEFT_SIDE) {
+		return coord_prev_unit(coord);
+	} else {
+		return coord_next_unit(coord);
+	}
+}
+
+#if REISER4_DEBUG
+#define DEBUG_COORD_FIELDS (sizeof(c1->plug_v) + sizeof(c1->body_v))
+#else
+#define DEBUG_COORD_FIELDS (0)
+#endif
+
+reiser4_internal int
+coords_equal(const coord_t * c1, const coord_t * c2)
+{
+	assert("nikita-2840", c1 != NULL);
+	assert("nikita-2841", c2 != NULL);
+
+#if 0
+	/* assertion to track changes in coord_t */
+	cassert(sizeof(*c1) == sizeof(c1->node) +
+		sizeof(c1->item_pos) +
+		sizeof(c1->unit_pos) +
+		sizeof(c1->iplugid) +
+		sizeof(c1->between) +
+		sizeof(c1->pad) +
+		sizeof(c1->offset) +
+		DEBUG_COORD_FIELDS);
+#endif
+	return
+		c1->node == c2->node &&
+		c1->item_pos == c2->item_pos &&
+		c1->unit_pos == c2->unit_pos &&
+		c1->between == c2->between;
+}
+
+/* Returns true if two coordinates are considered equal.  Coordinates that denote the same gap between adjacent
+   units or items (e.g. "after unit N" and "before unit N+1" of the same item) are considered equal. */
+/* Audited by: green(2002.06.15) */
+reiser4_internal int
+coord_eq(const coord_t * c1, const coord_t * c2)
+{
+	assert("nikita-1807", c1 != NULL);
+	assert("nikita-1808", c2 != NULL);
+
+	if (coords_equal(c1, c2)) {
+		return 1;
+	}
+	if (c1->node != c2->node) {
+		return 0;
+	}
+
+	switch (c1->between) {
+	case INVALID_COORD:
+	case EMPTY_NODE:
+	case AT_UNIT:
+		return 0;
+
+	case BEFORE_UNIT:
+		/* c2 must be after the previous unit. */
+		return (c1->item_pos == c2->item_pos && c2->between == AFTER_UNIT && c2->unit_pos == c1->unit_pos - 1);
+
+	case AFTER_UNIT:
+		/* c2 must be before the next unit. */
+		return (c1->item_pos == c2->item_pos && c2->between == BEFORE_UNIT && c2->unit_pos == c1->unit_pos + 1);
+
+	case BEFORE_ITEM:
+		/* c2 must be after the previous item, i.e. c2->item_pos is one less than c1->item_pos. */
+		return (c2->between == AFTER_ITEM && c2->item_pos == c1->item_pos - 1);
+
+	case AFTER_ITEM:
+		/* c2 must be before the next item, i.e. c2->item_pos is one more than c1->item_pos. */
+		return (c2->between == BEFORE_ITEM && c2->item_pos == c1->item_pos + 1);
+	}
+
+	impossible("jmacd-9906", "unreachable");
+	return 0;
+}
+
+/* If coord_is_before_leftmost return COORD_ON_THE_LEFT, if coord_is_after_rightmost
+   return COORD_ON_THE_RIGHT, otherwise return COORD_INSIDE. */
+/* Audited by: green(2002.06.15) */
+reiser4_internal coord_wrt_node coord_wrt(const coord_t * coord)
+{
+	if (coord_is_before_leftmost(coord)) {
+		return COORD_ON_THE_LEFT;
+	}
+
+	if (coord_is_after_rightmost(coord)) {
+		return COORD_ON_THE_RIGHT;
+	}
+
+	return COORD_INSIDE;
+}
+
+/* Returns true if the coordinate is positioned after the last item or after the last unit
+   of the last item or it is an empty node. */
+/* Audited by: green(2002.06.15) */
+reiser4_internal int
+coord_is_after_rightmost(const coord_t * coord)
+{
+	assert("jmacd-7313", coord_check(coord));
+
+	switch (coord->between) {
+	case INVALID_COORD:
+	case AT_UNIT:
+	case BEFORE_UNIT:
+	case BEFORE_ITEM:
+		return 0;
+
+	case EMPTY_NODE:
+		return 1;
+
+	case AFTER_ITEM:
+		return (coord->item_pos == node_num_items(coord->node) - 1);
+
+	case AFTER_UNIT:
+		return ((coord->item_pos == node_num_items(coord->node) - 1) &&
+			coord->unit_pos == coord_last_unit_pos(coord));
+	}
+
+	impossible("jmacd-9908", "unreachable");
+	return 0;
+}
+
+/* Returns true if the coordinate is positioned before the first item or it is an empty
+   node. */
+reiser4_internal int
+coord_is_before_leftmost(const coord_t * coord)
+{
+	/* FIXME-VS: coord_check requires node to be loaded whereas it is not
+	   necessary to check if coord is set before leftmost
+	   assert ("jmacd-7313", coord_check (coord)); */
+	switch (coord->between) {
+	case INVALID_COORD:
+	case AT_UNIT:
+	case AFTER_ITEM:
+	case AFTER_UNIT:
+		return 0;
+
+	case EMPTY_NODE:
+		return 1;
+
+	case BEFORE_ITEM:
+	case BEFORE_UNIT:
+		return (coord->item_pos == 0) && (coord->unit_pos == 0);
+	}
+
+	impossible("jmacd-9908", "unreachable");
+	return 0;
+}
+
+/* Returns true if the coordinate is positioned after an item, before an item, after the
+   last unit of an item, before the first unit of an item, or at an empty node. */
+/* Audited by: green(2002.06.15) */
+reiser4_internal int
+coord_is_between_items(const coord_t * coord)
+{
+	assert("jmacd-7313", coord_check(coord));
+
+	switch (coord->between) {
+	case INVALID_COORD:
+	case AT_UNIT:
+		return 0;
+
+	case AFTER_ITEM:
+	case BEFORE_ITEM:
+	case EMPTY_NODE:
+		return 1;
+
+	case BEFORE_UNIT:
+		return coord->unit_pos == 0;
+
+	case AFTER_UNIT:
+		return coord->unit_pos == coord_last_unit_pos(coord);
+	}
+
+	impossible("jmacd-9908", "unreachable");
+	return 0;
+}
+
+/* Returns true if the coordinates are positioned at adjacent units, regardless of
+   before-after or item boundaries. */
+reiser4_internal int
+coord_are_neighbors(coord_t * c1, coord_t * c2)
+{
+	coord_t *left;
+	coord_t *right;
+
+	assert("nikita-1241", c1 != NULL);
+	assert("nikita-1242", c2 != NULL);
+	assert("nikita-1243", c1->node == c2->node);
+	assert("nikita-1244", coord_is_existing_unit(c1));
+	assert("nikita-1245", coord_is_existing_unit(c2));
+
+	left = right = 0;
+	switch (coord_compare(c1, c2)) {
+	case COORD_CMP_ON_LEFT:
+		left = c1;
+		right = c2;
+		break;
+	case COORD_CMP_ON_RIGHT:
+		left = c2;
+		right = c1;
+		break;
+	case COORD_CMP_SAME:
+		return 0;
+	default:
+		wrong_return_value("nikita-1246", "compare_coords()");
+	}
+	assert("vs-731", left && right);
+	if (left->item_pos == right->item_pos) {
+		return left->unit_pos + 1 == right->unit_pos;
+	} else if (left->item_pos + 1 == right->item_pos) {
+		return (left->unit_pos == coord_last_unit_pos(left)) && (right->unit_pos == 0);
+	} else {
+		return 0;
+	}
+}
+
+/* Order two coordinates of the same node: item positions are compared first, unit positions break the tie.
+   Result is COORD_CMP_ON_LEFT / COORD_CMP_ON_RIGHT when c1 lies left / right of c2, else COORD_CMP_SAME.  */
+/* Audited by: green(2002.06.15) */
+reiser4_internal coord_cmp coord_compare(coord_t * c1, coord_t * c2)
+{
+	assert("vs-209", c1->node == c2->node);
+	assert("vs-194", coord_is_existing_unit(c1)
+	       && coord_is_existing_unit(c2));
+
+	/* different items: the item order decides */
+	if (c1->item_pos != c2->item_pos)
+		return (c1->item_pos > c2->item_pos) ? COORD_CMP_ON_RIGHT : COORD_CMP_ON_LEFT;
+
+	/* same item: the unit order decides */
+	if (c1->unit_pos != c2->unit_pos)
+		return (c1->unit_pos > c2->unit_pos) ? COORD_CMP_ON_RIGHT : COORD_CMP_ON_LEFT;
+
+	return COORD_CMP_SAME;
+}
+
+/* If the coordinate is between items, shifts it to the right.  Returns 0 on success and
+   non-zero if there is no position to the right. */
+reiser4_internal int
+coord_set_to_right(coord_t * coord)
+{
+	unsigned items = coord_num_items(coord);
+
+	if (coord_adjust_items(coord, items, 1) == 1) {
+		return 1;
+	}
+
+	switch (coord->between) {
+	case AT_UNIT:
+		return 0;
+
+	case BEFORE_ITEM:
+	case BEFORE_UNIT:
+		coord->between = AT_UNIT;
+		return 0;
+
+	case AFTER_UNIT:
+		if (coord->unit_pos < coord_last_unit_pos(coord)) {
+			coord->unit_pos += 1;
+			coord->between = AT_UNIT;
+			return 0;
+		} else {
+
+			coord->unit_pos = 0;
+
+			if (coord->item_pos == items - 1) {
+				coord->between = AFTER_ITEM;
+				return 1;
+			}
+
+			coord_inc_item_pos(coord);
+			coord->between = AT_UNIT;
+			return 0;
+		}
+
+	case AFTER_ITEM:
+		if (coord->item_pos == items - 1) {
+			return 1;
+		}
+
+		coord_inc_item_pos(coord);
+		coord->unit_pos = 0;
+		coord->between = AT_UNIT;
+		return 0;
+
+	case EMPTY_NODE:
+		return 1;
+
+	case INVALID_COORD:
+		break;
+	}
+
+	impossible("jmacd-9920", "unreachable");
+	return 0;
+}
+
+/* If the coordinate is between items or units, shifts it to the existing unit on its left.  Returns 0 on success
+   and non-zero if there is no position to the left (empty node, or coord is before the first item). */
+reiser4_internal int
+coord_set_to_left(coord_t * coord)
+{
+	unsigned items = coord_num_items(coord);
+
+	if (coord_adjust_items(coord, items, 0) == 1) {
+		return 1;
+	}
+
+	switch (coord->between) {
+	case AT_UNIT:
+		return 0;
+
+	case AFTER_UNIT:
+		coord->between = AT_UNIT;
+		return 0;
+
+	case AFTER_ITEM:
+		coord->between = AT_UNIT;
+		coord->unit_pos = coord_last_unit_pos(coord);
+		return 0;
+
+	case BEFORE_UNIT:
+		if (coord->unit_pos > 0) {
+			coord->unit_pos -= 1;
+			coord->between = AT_UNIT;
+			return 0;
+		} else {
+
+			if (coord->item_pos == 0) {
+				coord->between = BEFORE_ITEM;
+				return 1;
+			}
+			/* step into the previous item first, then take *its* last unit (as in the BEFORE_ITEM case) */
+			coord_dec_item_pos(coord);
+			coord->unit_pos = coord_last_unit_pos(coord);
+			coord->between = AT_UNIT;
+			return 0;
+		}
+
+	case BEFORE_ITEM:
+		if (coord->item_pos == 0) {
+			return 1;
+		}
+
+		coord_dec_item_pos(coord);
+		coord->unit_pos = coord_last_unit_pos(coord);
+		coord->between = AT_UNIT;
+		return 0;
+
+	case EMPTY_NODE:
+		return 1;
+
+	case INVALID_COORD:
+		break;
+	}
+
+	impossible("jmacd-9920", "unreachable");
+	return 0;
+}
+
+reiser4_internal const char *
+coord_tween_tostring(between_enum n)
+{
+	switch (n) {
+	case BEFORE_UNIT:
+		return "before unit";
+	case BEFORE_ITEM:
+		return "before item";
+	case AT_UNIT:
+		return "at unit";
+	case AFTER_UNIT:
+		return "after unit";
+	case AFTER_ITEM:
+		return "after item";
+	case EMPTY_NODE:
+		return "empty node";
+	case INVALID_COORD:
+		return "invalid";
+	default:{
+			static char buf[30];
+
+			sprintf(buf, "unknown: %i", n);
+			return buf;
+		}
+	}
+}
+
+reiser4_internal void
+print_coord(const char *mes, const coord_t * coord, int node)
+{
+	if (coord == NULL) {
+		printk("%s: null\n", mes);
+		return;
+	}
+	printk("%s: item_pos = %d, unit_pos %d, tween=%s, iplug=%d\n",
+	       mes, coord->item_pos, coord->unit_pos, coord_tween_tostring(coord->between), coord->iplugid);
+	if (node)
+		print_znode("\tnode", coord->node);
+}
+
+reiser4_internal int
+item_utmost_child_real_block(const coord_t * coord, sideof side, reiser4_block_nr * blk)
+{
+	return item_plugin_by_coord(coord)->f.utmost_child_real_block(coord, side, blk);
+}
+
+reiser4_internal int
+item_utmost_child(const coord_t * coord, sideof side, jnode ** child)
+{
+	return item_plugin_by_coord(coord)->f.utmost_child(coord, side, child);
+}
+
+/*
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
Index: linux-2.6.8.1-ck/fs/reiser4/coord.h
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/coord.h	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/coord.h	2004-08-22 19:35:33.621652697 +1000
@@ -0,0 +1,341 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Coords */
+
+#if !defined( __REISER4_COORD_H__ )
+#define __REISER4_COORD_H__
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+
+/* insertions happen between coords in the tree, so we need some means
+   of specifying the sense of betweenness. */
+typedef enum {
+	BEFORE_UNIT,		/* Note: we/init_coord depends on this value being zero. */
+	AT_UNIT,
+	AFTER_UNIT,
+	BEFORE_ITEM,
+	AFTER_ITEM,
+	INVALID_COORD,
+	EMPTY_NODE,
+} between_enum;
+
+/* location of coord w.r.t. its node */
+typedef enum {
+	COORD_ON_THE_LEFT = -1,
+	COORD_ON_THE_RIGHT = +1,
+	COORD_INSIDE = 0
+} coord_wrt_node;
+
+typedef enum {
+	COORD_CMP_SAME = 0, COORD_CMP_ON_LEFT = -1, COORD_CMP_ON_RIGHT = +1
+} coord_cmp;
+
+struct coord {
+	/* node in a tree (numeric comments are byte offsets; they hold only when a pointer takes 4 bytes) */
+	/*  0 */ znode *node;
+
+	/* position of item within node */
+	/*  4 */ pos_in_node_t item_pos;
+	/* position of unit within item */
+	/*  6 */ pos_in_node_t unit_pos;
+	/* optimization: plugin of item is stored in coord_t. Until this was
+	   implemented, item_plugin_by_coord() was major CPU consumer. ->iplugid
+	   is invalidated (set to 0xff) on each modification of ->item_pos,
+	   and all such modifications are funneled through coord_*_item_pos()
+	   functions below.
+	*/
+	/*  8 */ char iplugid;
+	/* position of coord w.r.t. neighboring items and/or units.
+	   Values are taken from &between_enum above.
+	*/
+	/*  9 */ char between;
+	/* padding. It will be added by the compiler anyway to conform to the
+	 * C language alignment requirements. We keep it here to be on the
+	 * safe side and to have a clear picture of the memory layout of this
+	 * structure. */
+	/* 10 */ __u16 pad;
+	/* 12 */ int offset;	/* reset to INVALID_OFFSET, together with ->iplugid, by coord_clear_iplug() */
+#if REISER4_DEBUG
+	unsigned long plug_v;	/* set by coord_update_v() to znode_times_locked(->node) */
+	unsigned long body_v;	/* set by coord_update_v() to the same value as ->plug_v */
+#endif
+};
+
+#define INVALID_PLUGID  ((char)((1 << 8) - 1))
+#define INVALID_OFFSET -1
+
+/* Invalidate the plugin id and the offset cached in @coord. */
+static inline void coord_clear_iplug(coord_t * coord)
+{
+	assert("nikita-2835", coord != NULL);
+	coord->offset  = INVALID_OFFSET;
+	coord->iplugid = INVALID_PLUGID;
+}
+
+/* Non-zero iff ->iplugid of @coord holds something other than INVALID_PLUGID. */
+static inline int coord_is_iplug_set(const coord_t * coord)
+{
+	assert("nikita-2836", coord != NULL);
+	return !(coord->iplugid == INVALID_PLUGID);
+}
+
+/* Set ->item_pos of @coord to @pos and drop the cached plugin id/offset. */
+static inline void coord_set_item_pos(coord_t * coord, pos_in_node_t pos)
+{
+	assert("nikita-2478", coord != NULL);
+	coord->item_pos = pos;
+	coord_clear_iplug(coord);
+}
+
+/* Decrease ->item_pos of @coord by one and drop the cached plugin id/offset. */
+static inline void coord_dec_item_pos(coord_t * coord)
+{
+	assert("nikita-2480", coord != NULL);
+	coord->item_pos -= 1;
+	coord_clear_iplug(coord);
+}
+
+/* Increase ->item_pos of @coord by one and drop the cached plugin id/offset. */
+static inline void coord_inc_item_pos(coord_t * coord)
+{
+	assert("nikita-2481", coord != NULL);
+	coord->item_pos += 1;
+	coord_clear_iplug(coord);
+}
+
+/* Add @delta (which may be negative) to ->item_pos of @coord and drop the cached plugin id/offset. */
+static inline void coord_add_item_pos(coord_t * coord, int delta)
+{
+	assert("nikita-2482", coord != NULL);
+	coord->item_pos = coord->item_pos + delta;
+	coord_clear_iplug(coord);
+}
+
+/* Set ->item_pos of @coord to the all-ones unsigned short value and drop the cached plugin id/offset. */
+static inline void coord_invalid_item_pos(coord_t * coord)
+{
+	assert("nikita-2832", coord != NULL);
+	coord->item_pos = (unsigned short)~0;
+	coord_clear_iplug(coord);
+}
+
+/* Return the side opposite to @side: RIGHT_SIDE for LEFT_SIDE, LEFT_SIDE for anything else. */
+static inline sideof
+sideof_reverse(sideof side)
+{
+	return (side != LEFT_SIDE) ? LEFT_SIDE : RIGHT_SIDE;
+}
+
+/* NOTE: There is a somewhat odd mixture of the following opposed terms:
+
+   "first" and "last"
+   "next" and "prev"
+   "before" and "after"
+   "leftmost" and "rightmost"
+
+   But I think the chosen names are decent the way they are.
+*/
+
+/* COORD INITIALIZERS */
+
+/* Initialize an invalid coordinate. */
+extern void coord_init_invalid(coord_t * coord, const znode * node);
+
+extern void coord_init_first_unit_nocheck(coord_t * coord, const znode * node);
+
+/* Initialize a coordinate to point at the first unit of the first item.  If the node is
+   empty, it is positioned at the EMPTY_NODE. */
+extern void coord_init_first_unit(coord_t * coord, const znode * node);
+
+/* Initialize a coordinate to point at the last unit of the last item.  If the node is
+   empty, it is positioned at the EMPTY_NODE. */
+extern void coord_init_last_unit(coord_t * coord, const znode * node);
+
+/* Initialize a coordinate to before the first item.  If the node is empty, it is
+   positioned at the EMPTY_NODE. */
+extern void coord_init_before_first_item(coord_t * coord, const znode * node);
+
+/* Initialize a coordinate to after the last item.  If the node is empty, it is positioned
+   at the EMPTY_NODE. */
+extern void coord_init_after_last_item(coord_t * coord, const znode * node);
+
+/* Initialize a coordinate to after last unit in the item. Coord must be set
+   already to existing item */
+void coord_init_after_item_end(coord_t * coord);
+
+/* Initialize a coordinate to before the item. Coord must be set already to existing item */
+void coord_init_before_item(coord_t *);
+/* Initialize a coordinate to after the item. Coord must be set already to existing item */
+void coord_init_after_item(coord_t *);
+
+/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
+extern void coord_init_sideof_unit(coord_t * coord, const znode * node, sideof dir);
+
+/* Initialize a coordinate by 0s. Used in places where init_coord was used and
+   it was not clear how the coord should actually be initialized.
+   FIXME-VS: added by vs (2002, june, 8) */
+extern void coord_init_zero(coord_t * coord);
+
+/* COORD METHODS */
+
+/* after shifting of node content, coord previously set properly may become
+   invalid, try to "normalize" it. */
+void coord_normalize(coord_t * coord);
+
+/* Copy a coordinate. */
+extern void coord_dup(coord_t * coord, const coord_t * old_coord);
+
+/* Copy a coordinate without check. */
+void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord);
+
+unsigned coord_num_units(const coord_t * coord);
+
+/* Last valid unit number at the present item: coord_num_units() - 1 (unsigned, so it wraps if that is 0). */
+static inline unsigned
+coord_last_unit_pos(const coord_t * coord)
+{
+	const unsigned nr_units = coord_num_units(coord);
+	return nr_units - 1;
+}
+
+#if REISER4_DEBUG
+/* For assertions only, checks for a valid coordinate. */
+extern int coord_check(const coord_t * coord);
+
+extern unsigned long znode_times_locked(const znode *z);
+
+/* Record znode_times_locked() of @coord's node in both ->plug_v and ->body_v. */
+static inline void coord_update_v(coord_t * coord)
+{
+	coord->plug_v = coord->body_v = znode_times_locked(coord->node);
+}
+#endif
+
+extern int coords_equal(const coord_t * c1, const coord_t * c2);
+
+/* Returns true if two coordinates are considered equal.  Coordinates that are between units
+   or items are considered equal. */
+extern int coord_eq(const coord_t * c1, const coord_t * c2);
+
+extern void print_coord(const char *mes, const coord_t * coord, int print_node);
+
+/* If coord_is_after_rightmost return COORD_ON_THE_RIGHT, if coord_is_before_leftmost
+   return COORD_ON_THE_LEFT, otherwise return COORD_INSIDE. */
+extern coord_wrt_node coord_wrt(const coord_t * coord);
+
+/* Returns true if the coordinates are positioned at adjacent units, regardless of
+   before-after or item boundaries. */
+extern int coord_are_neighbors(coord_t * c1, coord_t * c2);
+
+/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
+   COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2.  */
+extern coord_cmp coord_compare(coord_t * c1, coord_t * c2);
+
+/* COORD PREDICATES */
+
+/* Returns true if the coord was initialized by coord_init_invalid (). */
+extern int coord_is_invalid(const coord_t * coord);
+
+/* Returns true if the coordinate is positioned at an existing item, not before or after
+   an item.  It may be placed at, before, or after any unit within the item, whether
+   existing or not.  If this is true you can call methods of the item plugin.  */
+extern int coord_is_existing_item(const coord_t * coord);
+
+/* Returns true if the coordinate is positioned after an item, before an item, after the
+   last unit of an item, before the first unit of an item, or at an empty node. */
+extern int coord_is_between_items(const coord_t * coord);
+
+/* Returns true if the coordinate is positioned at an existing unit, not before or after a
+   unit. */
+extern int coord_is_existing_unit(const coord_t * coord);
+
+/* Returns true if the coordinate is positioned at an empty node. */
+extern int coord_is_empty(const coord_t * coord);
+
+/* Returns true if the coordinate is positioned at the first unit of the first item.  Not
+   true for empty nodes nor coordinates positioned before the first item. */
+extern int coord_is_leftmost_unit(const coord_t * coord);
+
+/* Returns true if the coordinate is positioned after the last item or after the last unit
+   of the last item or it is an empty node. */
+extern int coord_is_after_rightmost(const coord_t * coord);
+
+/* Returns true if the coordinate is positioned before the first item or it is an empty
+   node. */
+extern int coord_is_before_leftmost(const coord_t * coord);
+
+/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
+   argument. */
+extern int coord_is_after_sideof_unit(coord_t * coord, sideof dir);
+
+/* COORD MODIFIERS */
+
+/* Advances the coordinate by one unit to the right.  If empty, no change.  If
+   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new position is
+   an existing unit. */
+extern int coord_next_unit(coord_t * coord);
+
+/* Advances the coordinate by one item to the right.  If empty, no change.  If
+   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new position is
+   an existing item. */
+extern int coord_next_item(coord_t * coord);
+
+/* Advances the coordinate by one unit to the left.  If empty, no change.  If
+   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM.  Returns 0 if new position
+   is an existing unit. */
+extern int coord_prev_unit(coord_t * coord);
+
+/* Advances the coordinate by one item to the left.  If empty, no change.  If
+   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM.  Returns 0 if new position
+   is an existing item. */
+extern int coord_prev_item(coord_t * coord);
+
+/* If the coordinate is between items, shifts it to the right.  Returns 0 on success and
+   non-zero if there is no position to the right. */
+extern int coord_set_to_right(coord_t * coord);
+
+/* If the coordinate is between items, shifts it to the left.  Returns 0 on success and
+   non-zero if there is no position to the left. */
+extern int coord_set_to_left(coord_t * coord);
+
+/* If the coordinate is at an existing unit, set to after that unit.  Returns 0 on success
+   and non-zero if the unit did not exist. */
+extern int coord_set_after_unit(coord_t * coord);
+
+/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
+extern int coord_sideof_unit(coord_t * coord, sideof dir);
+
+/* iterate over all units in @node: start before the first item, step with coord_next_unit() while it returns 0 */
+#define for_all_units( coord, node )					\
+	for( coord_init_before_first_item( ( coord ), ( node ) ) ; 	\
+	     coord_next_unit( coord ) == 0 ; )
+
+/* iterate over all items in @node: start before the first item, step with coord_next_item() while it returns 0 */
+#define for_all_items( coord, node )					\
+	for( coord_init_before_first_item( ( coord ), ( node ) ) ; 	\
+	     coord_next_item( coord ) == 0 ; )
+
+#if REISER4_DEBUG_OUTPUT
+extern const char *coord_tween_tostring(between_enum n);
+#endif
+
+/* COORD/ITEM METHODS */
+
+extern int item_utmost_child_real_block(const coord_t * coord, sideof side, reiser4_block_nr * blk);
+extern int item_utmost_child(const coord_t * coord, sideof side, jnode ** child);
+
+/* __REISER4_COORD_H__ */
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
Index: linux-2.6.8.1-ck/fs/reiser4/crypt.c
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/crypt.c	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/crypt.c	2004-08-22 19:35:33.621652697 +1000
@@ -0,0 +1,92 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+/* Crypto-plugins for reiser4 cryptcompress objects */
+
+#include "debug.h"
+#include "plugin/plugin.h"
+#include "plugin/cryptcompress.h"
+#include <linux/types.h>
+#include <linux/random.h>
+
+#define MAX_CRYPTO_BLOCKSIZE 128
+#define NONE_EXPKEY_WORDS 8
+#define NONE_BLOCKSIZE 8
+
+/*
+  Default align() method of the crypto-plugin (look for description of this method
+  in plugin/plugin.h)
+
+1) creates the aligning armored format of the input flow before encryption.
+   "armored" means that padding is filled by private data (a pseudo-random
+   sequence of bytes is not private data); get_random_bytes() is used here.
+2) returns length of appended padding: 1..@blocksize for a positive @flow_size.
+
+   [ flow | aligning_padding ]
+            ^
+            |
+	  @pad
+*/
+UNUSED_ARG static int
+align_cluster_common(__u8 *pad /* pointer to the first byte of aligning format */,
+		     int flow_size /* size of non-aligned flow */,
+		     int blocksize /* crypto-block size */)
+{
+	int pad_size;
+
+	assert("edward-01", pad != NULL);
+	assert("edward-02", flow_size != 0);
+	assert("edward-03", blocksize > 0 && blocksize <= MAX_CRYPTO_BLOCKSIZE);
+	/* bytes missing up to the next crypto-block boundary; a whole block if the flow is already aligned */
+	pad_size = blocksize - (flow_size % blocksize);
+	get_random_bytes (pad, pad_size);
+	return pad_size;
+}
+
+/* Common scale method (look for description of this method in plugin/plugin.h)
+   for all symmetric algorithms, which don't scale anything: @src_off, the offset
+   to scale, is returned unchanged. @inode is not used; neither is @blocksize (the
+   crypto block size, which is returned by blocksize method of crypto plugin). */
+static loff_t
+scale_common(struct inode * inode UNUSED_ARG, size_t blocksize UNUSED_ARG, loff_t src_off)
+{
+	const loff_t scaled_off = src_off;
+	return scaled_off;
+}
+
+REGISTER_NONE_ALG(crypt, CRYPTO)
+
+/* EDWARD-FIXME-HANS: why is this not in the plugin directory? */
+
+/* crypto plugins: table indexed by crypto plugin id, with LAST_CRYPTO_ID entries */
+crypto_plugin crypto_plugins[LAST_CRYPTO_ID] = {
+	[NONE_CRYPTO_ID] = {
+		.h = {
+			.type_id = REISER4_CRYPTO_PLUGIN_TYPE,
+			.id = NONE_CRYPTO_ID,
+			.pops = NULL,
+			/* If you want your files not to be crypto
+			   transformed, specify this crypto plugin */
+			.label = "none",
+			.desc = "absence of crypto transform",
+			.linkage = TYPE_SAFE_LIST_LINK_ZERO
+		},
+		.alloc = alloc_none_crypt,	/* presumably generated by REGISTER_NONE_ALG(crypt, CRYPTO) -- confirm */
+		.free = free_none_crypt,	/* presumably generated by REGISTER_NONE_ALG(crypt, CRYPTO) -- confirm */
+		.nr_keywords = NONE_EXPKEY_WORDS,
+		.scale = scale_common,
+	        .align_cluster = NULL,
+	        .setkey = NULL,
+	        .encrypt = NULL,
+	        .decrypt = NULL
+	}
+};
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
Index: linux-2.6.8.1-ck/fs/reiser4/debug.c
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/debug.c	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/debug.c	2004-08-22 19:35:33.622652538 +1000
@@ -0,0 +1,735 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Debugging facilities. */
+
+/*
+ * This file contains generic debugging functions used by reiser4. Roughly
+ * following:
+ *
+ *     panicking: reiser4_do_panic(), reiser4_print_prefix().
+ *
+ *     locking: schedulable(), lock_counters(), print_lock_counters(),
+ *     no_counters_are_held(), commit_check_locks()
+ *
+ *     {debug,trace,log}_flags: reiser4_are_all_debugged(),
+ *     reiser4_is_debugged(), get_current_trace_flags(),
+ *     get_current_log_flags().
+ *
+ *     kmalloc/kfree leak detection: reiser4_kmalloc(), reiser4_kfree(),
+ *     reiser4_kfree_in_sb().
+ *
+ *     error code monitoring (see comment before RETERR macro): return_err(),
+ *     report_err().
+ *
+ *     stack back-tracing: fill_backtrace()
+ *
+ *     miscellaneous: preempt_point(), call_on_each_assert(), debugtrap().
+ *
+ */
+
+#include "kattr.h"
+#include "reiser4.h"
+#include "context.h"
+#include "super.h"
+#include "txnmgr.h"
+#include "znode.h"
+
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
+#include <linux/vmalloc.h>
+#include <linux/ctype.h>
+#include <linux/sysctl.h>
+
+extern void cond_resched(void);
+
+/*
+ * global buffer where message given to reiser4_panic is formatted.
+ */
+static char panic_buf[REISER4_PANIC_MSG_BUFFER_SIZE];
+
+/*
+ * lock protecting consistency of panic_buf under concurrent panics
+ */
+static spinlock_t panic_guard = SPIN_LOCK_UNLOCKED;
+
+/* Your best friend. Call it on each occasion.  Called through reiser4_panic() (fs/reiser4/debug.h): prints
+    the formatted message and, inside a reiser4 context, state of that context; ends in BUG() and panic(). */
+reiser4_internal void
+reiser4_do_panic(const char *format /* format string */ , ... /* rest */)
+{
+	static int in_panic = 0;
+	va_list args;
+
+	/*
+	 * check for recursive panic. NOTE(review): in_panic is tested and set before panic_guard is taken.
+	 */
+	if (in_panic == 0) {
+		in_panic = 1;
+
+		spin_lock(&panic_guard);
+		va_start(args, format);
+		vsnprintf(panic_buf, sizeof(panic_buf), format, args);
+		va_end(args);
+		printk(KERN_EMERG "reiser4 panicked cowardly: %s", panic_buf);
+		spin_unlock(&panic_guard);
+
+		/*
+		 * if kernel debugger is configured---drop in. Early dropping
+		 * into kgdb is not always convenient, because panic message
+		 * is not yet printed most of the times. But:
+		 *
+		 *     (1) message can be extracted from printk_buf[]
+		 *     (declared static inside of printk()), and
+		 *
+		 *     (2) sometimes serial/kgdb combo dies while printing
+		 *     long panic message, so it's more prudent to break into
+		 *     debugger earlier.
+		 *
+		 */
+		DEBUGON(1);
+
+		if (get_current_context_check() != NULL) {
+			struct super_block *super;
+			reiser4_context *ctx;
+
+			/*
+			 * if we are within reiser4 context, print its contents:
+			 */
+
+			/* lock counters... */
+			print_lock_counters("pins held", lock_counters());
+			/* other active contexts... */
+			print_contexts();
+			ctx = get_current_context();
+			super = ctx->super;
+			if (get_super_private(super) != NULL &&
+			    reiser4_is_debugged(super, REISER4_VERBOSE_PANIC))
+				/* znodes... */
+				print_znodes("znodes", current_tree);
+#if REISER4_DEBUG_CONTEXTS
+			{
+				extern spinlock_t active_contexts_lock;
+
+				/*
+				 * remove context from the list of active
+				 * contexts. This is a precaution: current
+				 * is going to die, and leaving the context
+				 * on the list would leave the list corrupted.
+				 */
+				spin_lock(&active_contexts_lock);
+				context_list_remove(ctx->parent);	/* NOTE(review): ->parent, not ctx, is unlinked -- confirm */
+				spin_unlock(&active_contexts_lock);
+			}
+#endif
+		}
+	}
+	BUG();
+	/* to make gcc happy about noreturn attribute */
+	panic("%s", panic_buf);
+}
+
+/* Print message header: @level, task name and pid, @function, @file:@lineno and @mid; report_err() if @reperr. */
+reiser4_internal void
+reiser4_print_prefix(const char *level, int reperr, const char *mid,
+		     const char *function, const char *file, int lineno)
+{
+	const char *task_name = "interrupt";
+	int task_pid = 0;
+
+	/* the fixed name and pid 0 above are kept in interrupt context, otherwise current's are used */
+	if (!unlikely(in_interrupt() || in_irq())) {
+		task_name = current->comm;
+		task_pid = current->pid;
+	}
+	printk("%sreiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n",
+	       level, task_name, task_pid, function, file, lineno, mid);
+	/* optionally follow the header with the recorded error site */
+	if (reperr)
+		report_err();
+}
+
+/* Preemption point: this should be called periodically during long running operations (carry, allocate,
+   and squeeze are best examples). Calls cond_resched() and returns signal_pending(current). */
+reiser4_internal int
+preempt_point(void)
+{
+	assert("nikita-3008", schedulable());
+	cond_resched();
+	return signal_pending(current);
+}
+
+#if REISER4_DEBUG
+
+/* check that no spinlocks are held: returns 0, after printing the lock counters, if there is a
+   current context and its spin_locked counter is not nil; returns 1 otherwise */
+int schedulable(void)
+{
+	if (get_current_context_check() != NULL && !LOCK_CNT_NIL(spin_locked)) {
+		print_lock_counters("in atomic", lock_counters());
+		return 0;
+	}
+	/* generic kernel annotation for code that may sleep */
+	might_sleep();
+	return 1;
+}
+#endif
+
+#if REISER4_DEBUG_SPIN_LOCKS
+/* Debugging aid: return struct where information about locks taken by current
+   thread is accumulated (->locks of the current reiser4 context). This can be
+   used to formulate lock ordering constraints and various assertions.
+*/
+lock_counters_info *lock_counters(void)
+{
+	reiser4_context *context;
+
+	context = get_current_context();
+	assert("jmacd-1123", context != NULL);
+	return &context->locks;
+}
+
+#if REISER4_DEBUG_OUTPUT
+/*
+ * print, prefixed by @prefix, human readable information about the lock counters in @info.
+ */
+void
+print_lock_counters(const char *prefix, const lock_counters_info * info)
+{
+	printk("%s: jnode: %i, tree: %i (r:%i,w:%i), dk: %i (r:%i,w:%i)\n"
+	       "jload: %i, "
+	       "txnh: %i, atom: %i, stack: %i, txnmgr: %i, "
+	       "ktxnmgrd: %i, fq: %i, reiser4_sb: %i\n"
+	       "inode: %i, "
+	       "cbk_cache: %i (r:%i,w%i), "	/* NOTE(review): "w%i" lacks the ':' used by the other groups */
+	       "epoch: %i, eflush: %i, "
+	       "zlock: %i (r:%i, w:%i)\n"
+	       "spin: %i, long: %i inode_sem: (r:%i,w:%i)\n"
+	       "d: %i, x: %i, t: %i\n", prefix,
+	       info->spin_locked_jnode,
+	       info->rw_locked_tree, info->read_locked_tree,
+	       info->write_locked_tree,
+
+	       info->rw_locked_dk, info->read_locked_dk, info->write_locked_dk,
+
+	       info->spin_locked_jload,
+	       info->spin_locked_txnh,
+	       info->spin_locked_atom, info->spin_locked_stack,
+	       info->spin_locked_txnmgr, info->spin_locked_ktxnmgrd,
+	       info->spin_locked_fq, info->spin_locked_super,
+	       info->spin_locked_inode_object,
+
+	       info->rw_locked_cbk_cache,
+	       info->read_locked_cbk_cache,
+	       info->write_locked_cbk_cache,
+
+	       info->spin_locked_epoch,
+	       info->spin_locked_super_eflush,
+
+	       info->rw_locked_zlock,
+	       info->read_locked_zlock,
+	       info->write_locked_zlock,
+
+	       info->spin_locked,
+	       info->long_term_locked_znode,
+	       info->inode_sem_r, info->inode_sem_w,
+	       info->d_refs, info->x_refs, info->t_refs);
+}
+
+/* return true, iff no locks are held, i.e., iff every counter tested below is zero.
+ * NOTE(review): spin_locked_{jload,ktxnmgrd,fq,super,super_eflush,epoch} and *_locked_cbk_cache, which
+ * print_lock_counters() prints, are not tested here -- confirm that this is intended. */
+int
+no_counters_are_held(void)
+{
+	lock_counters_info *counters;
+
+	counters = lock_counters();
+	return
+		(counters->rw_locked_zlock == 0) &&
+		(counters->read_locked_zlock == 0) &&
+		(counters->write_locked_zlock == 0) &&
+		(counters->spin_locked_jnode == 0) &&
+		(counters->rw_locked_tree == 0) &&
+		(counters->read_locked_tree == 0) &&
+		(counters->write_locked_tree == 0) &&
+		(counters->rw_locked_dk == 0) &&
+		(counters->read_locked_dk == 0) &&
+		(counters->write_locked_dk == 0) &&
+		(counters->spin_locked_txnh == 0) &&
+		(counters->spin_locked_atom == 0) &&
+		(counters->spin_locked_stack == 0) &&
+		(counters->spin_locked_txnmgr == 0) &&
+		(counters->spin_locked_inode_object == 0) &&
+		(counters->spin_locked == 0) &&
+		(counters->long_term_locked_znode == 0) &&
+		(counters->inode_sem_r == 0) &&
+		(counters->inode_sem_w == 0);
+}
+
+/*
+ * return true, iff the current thread holds nothing that no_counters_are_held()
+ * tests except the inode semaphore, i.e., iff transaction commit can be done.
+ */
+int
+commit_check_locks(void)
+{
+	lock_counters_info *info = lock_counters();
+	const int saved_sem_r = info->inode_sem_r;
+	const int saved_sem_w = info->inode_sem_w;
+	int only_inode_sem;
+
+	/*
+	 * inode's read/write semaphore is the only reiser4 lock that can be
+	 * held during commit: hide its counters from the generic check ...
+	 */
+
+	info->inode_sem_w = 0;
+	info->inode_sem_r = 0;
+	only_inode_sem = no_counters_are_held();
+
+	/* ... and put them back afterwards */
+	info->inode_sem_r = saved_sem_r;
+	info->inode_sem_w = saved_sem_w;
+
+	return only_inode_sem;
+}
+
+/* REISER4_DEBUG_OUTPUT */
+#endif
+
+/* REISER4_DEBUG_SPIN_LOCKS */
+#endif
+
+/*
+ * return 1 iff every bit of @flags is set in ->debug_flags of @super, 0 otherwise.
+ */
+reiser4_internal int
+reiser4_are_all_debugged(struct super_block *super, __u32 flags)
+{
+	const __u32 requested_and_set = get_super_private(super)->debug_flags & flags;
+	return requested_and_set == flags;
+}
+
+/*
+ * check that some bits specified by @flag are set in ->debug_flags of the
+ * super block. The masked bits themselves are returned, not a normalized 0/1.
+ */
+reiser4_internal int
+reiser4_is_debugged(struct super_block *super, __u32 flag)
+{
+	return get_super_private(super)->debug_flags & flag;
+}
+
+#if REISER4_TRACE
+/* tracing setup: trace flags of the current reiser4 context combined with the
+   trace flags of its super block; 0 when get_current_context_check() returns
+   NULL. */
+__u32 get_current_trace_flags(void)
+{
+	reiser4_context *ctx;
+	__u32 flags;
+
+	ctx = get_current_context_check();
+	/* without a context there is nothing to combine */
+	if (ctx == NULL)
+		return 0;
+	flags = ctx->trace_flags;
+	flags |= get_super_private(ctx->super)->trace_flags;
+	return flags;
+}
+#endif
+
+#if REISER4_LOG
+
+/* log flags of the super block of the current context; 0 when
+   get_current_context_check() returns NULL */
+__u32 get_current_log_flags(void)
+{
+	reiser4_context *ctx = get_current_context_check();
+
+	if (ctx == NULL)
+		return 0;
+
+	/* log flags are stored in super block */
+	return get_super_private(ctx->super)->log_flags;
+}
+
+/* oid of the file whose page events are to be logged (->oid_to_log of the super
+   block of the current context); 0 when get_current_context_check() returns NULL */
+__u32 get_current_oid_to_log(void)
+{
+	reiser4_context *ctx = get_current_context_check();
+
+	if (ctx == NULL)
+		return 0;
+
+	/* the setting is kept per super block */
+	return get_super_private(ctx->super)->oid_to_log;
+}
+
+#endif
+
+/* allocate memory. This calls kmalloc(), performs some additional checks, and,
+   if REISER4_DEBUG is on, keeps track of how much memory (as reported by ksize())
+   was allocated on behalf of current super block. Returns what kmalloc() returned. */
+reiser4_internal void *
+reiser4_kmalloc(size_t size /* number of bytes to allocate */ ,
+		int gfp_flag /* allocation flag */ )
+{
+	void *result;
+
+	assert("nikita-3009", ergo(gfp_flag & __GFP_WAIT, schedulable()));
+
+	result = kmalloc(size, gfp_flag);
+	if (REISER4_DEBUG && result != NULL) {
+		unsigned int usedsize;
+		reiser4_super_info_data *sbinfo;
+
+		usedsize = ksize(result);
+
+		sbinfo = get_current_super_private();
+
+		assert("nikita-3459", usedsize >= size);
+		assert("nikita-1407", sbinfo != NULL);
+		reiser4_spin_lock_sb(sbinfo);
+		ON_DEBUG(sbinfo->kmalloc_allocated += usedsize);
+		reiser4_spin_unlock_sb(sbinfo);
+	}
+	return result;
+}
+
+/* release memory allocated by reiser4_kmalloc() and update counter of the super block reiser4_get_current_sb(). */
+reiser4_internal void
+reiser4_kfree(void *area /* memory to free */)
+{
+	assert("nikita-1410", area != NULL);
+	reiser4_kfree_in_sb(area, reiser4_get_current_sb());
+}
+
+/* release memory allocated by reiser4_kmalloc() for the specified
+ * super-block. This is useful when memory is released outside of reiser4
+ * context */
+reiser4_internal void
+reiser4_kfree_in_sb(void *area /* memory to free */, struct super_block *sb)
+{
+	assert("nikita-2729", area != NULL);
+	if (REISER4_DEBUG) {
+		unsigned int size;
+		reiser4_super_info_data *sbinfo;
+
+		size = ksize(area);	/* same measure that reiser4_kmalloc() added to the counter */
+
+		sbinfo = get_super_private(sb);
+
+		reiser4_spin_lock_sb(sbinfo);
+		assert("nikita-2730", sbinfo->kmalloc_allocated >= (int) size);
+		ON_DEBUG(sbinfo->kmalloc_allocated -= size);
+		reiser4_spin_unlock_sb(sbinfo);
+	}
+	kfree(area);
+}
+
+
+#if defined(CONFIG_REISER4_NOOPT)
+void __you_cannot_kmalloc_that_much(void)
+{
+	BUG();
+}
+#endif
+
+#if REISER4_DEBUG
+
+/*
+ * fill "error site" in the current reiser4 context. See comment before RETERR
+ * macro for more details.
+ */
+void
+return_err(int code, const char *file, int line)
+{
+	reiser4_context *ctx;
+	/* only negative codes seen inside of reiser4 context are recorded */
+	if (code >= 0 || !is_in_reiser4_context())
+		return;
+	ctx = get_current_context();
+	if (ctx == NULL)
+		return;
+	fill_backtrace(&ctx->err.path, REISER4_BACKTRACE_DEPTH, 0);
+	ctx->err.code = code;
+	ctx->err.file = file;
+	ctx->err.line = line;
+}
+
+/*
+ * report error information recorded by return_err().
+ */
+void
+report_err(void)
+{
+	reiser4_context *ctx = get_current_context_check();
+
+	/* nothing to report without a context or without a recorded error */
+	if (ctx == NULL || ctx->err.code == 0)
+		return;
+#ifdef CONFIG_FRAME_POINTER
+	{
+		int frame;
+		for (frame = 0; frame < REISER4_BACKTRACE_DEPTH ; ++ frame)
+			printk("0x%p ", ctx->err.path.trace[frame]);
+		printk("\n");
+	}
+#endif
+	printk("code: %i at %s:%i\n", ctx->err.code, ctx->err.file, ctx->err.line);
+}
+
+#ifdef CONFIG_FRAME_POINTER
+
+extern int kswapd(void *);
+
+#include <linux/personality.h>
+#include "ktxnmgrd.h"
+#include "repacker.h"
+
+/*
+ * true iff @addr lies strictly between @start and @end (both bounds excluded)
+ */
+static int is_addr_in(void *addr, void *start, void *end)
+{
+	return addr > start && end > addr;
+}
+
+/*
+ * stack back-tracing. Also see comments before REISER4_BACKTRACE_DEPTH in
+ * debug.h.
+ *
+ * Stack back-trace is collected through __builtin_return_address() gcc
+ * builtin, which requires kernel to be compiled with frame pointers
+ * (CONFIG_FRAME_POINTER). Unfortunately, __builtin_return_address() doesn't
+ * provide means to detect when bottom of the stack is reached, and just
+ * crashes when trying to access non-existent frame.
+ *
+ * is_last_frame() function works around this (also see more advanced version
+ * in the proc-sleep patch that requires modification of core kernel code).
+ *
+ * This function checks for common cases trying to detect that last stack
+ * frame was reached. Returns 1 if @addr looks like the last frame, 0 otherwise.
+ */
+static int is_last_frame(void *addr)
+{
+	/* no return address at all */
+	if (addr == NULL)
+		return 1;
+
+	/* return address strictly inside one of the address ranges below
+	 * (presumably the bodies of kernel thread functions -- confirm) */
+	if (is_addr_in(addr, kswapd, wakeup_kswapd) ||
+	    is_addr_in(addr, reiser4_repacker, repacker_d) ||
+	    is_addr_in(addr, init_ktxnmgrd_context, ktxnmgrd_kick) ||
+	    is_addr_in(addr, init_entd_context, done_entd_context))
+		return 1;
+
+	/* an address rejected by kernel_text_address() also ends the trace */
+	return !kernel_text_address((unsigned long)addr);
+}
+
+/*
+ * fill stack back-trace: @path is zeroed, then up to @depth return addresses, of frames @shift + 2 and up, are stored.
+ */
+reiser4_internal void
+fill_backtrace(backtrace_path *path, int depth, int shift)
+{
+	int i;
+	void *addr;
+
+	cassert(REISER4_BACKTRACE_DEPTH == 4);
+	assert("nikita-3229", shift < 6);
+
+	/* long live Duff! */
+
+#define FRAME(nr)						\
+	case (nr):						\
+		addr  = __builtin_return_address((nr) + 2);	\
+		break
+
+	xmemset(path, 0, sizeof *path);
+	addr = NULL;
+	/*
+	 * we need this silly loop, because __builtin_return_address() only
+	 * accepts _constant_ arguments. It reminds of the duff device
+	 * (http://www.faqs.org/docs/jargon/D/Duff's-device.html) which
+	 * explains the reference above.
+	 */
+	for (i = 0; i < depth; ++ i) {
+		switch(i + shift) {
+			FRAME(0);
+			FRAME(1);
+			FRAME(2);
+			FRAME(3);
+			FRAME(4);
+			FRAME(5);
+			FRAME(6);
+			FRAME(7);
+			FRAME(8);
+			FRAME(9);
+			FRAME(10);
+		default:
+			impossible("nikita-3230", "everything is wrong");
+		}
+		path->trace[i] = addr;	/* the address that ends the trace is stored as well */
+		if (is_last_frame(addr))
+			break;
+	}
+}
+#endif
+
+/*
+ * assert() macro calls this function on each invocation. This is convenient
+ * place to put some debugging code that has to be executed very
+ * frequently. _Very_.
+ */
+void call_on_each_assert(void)
+{
+	return;	/* checks are switched off: everything below this line is unreachable */
+	/*
+	 * DON'T USE ASSERTIONS HERE :)
+	 */
+	if (is_in_reiser4_context()) {
+		reiser4_super_info_data *sinfo;
+		reiser4_context *ctx;
+
+		ctx = (reiser4_context *) current->journal_info;
+		sinfo = ctx->super->s_fs_info;
+		/* put checks here */
+	}
+}
+
+/* REISER4_DEBUG */
+#endif
+
+#if KERNEL_DEBUGGER
+/*
+ * this function just drops into kernel debugger (breakpoint() is called when CONFIG_KGDB is defined
+ * and CONFIG_REISER4_FS_MODULE is not). It is a convenient place to put breakpoint in.
+ */
+void debugtrap(void)
+{
+	/* do nothing. Put break point here. */
+#if defined(CONFIG_KGDB) && !defined(CONFIG_REISER4_FS_MODULE)
+	extern void breakpoint(void);
+	breakpoint();
+#endif
+}
+#endif
+
+
+/* debugging tool
+   use clog_op to make a record
+   use print_clog to see last CLOG_LENGTH record
+ */
+#define CLOG_LENGTH 256
+static spinlock_t clog_lock = SPIN_LOCK_UNLOCKED;
+
+typedef struct {
+	int id;		/* sequence number of the record, taken from clog_id by clog_op() */
+	pid_t pid;	/* current->pid at the time clog_op() was called */
+	int op;		/* operation code; op2str() maps it to a name */
+	void *data1;	/* first pointer passed to clog_op() */
+	void *data2;	/* second pointer passed to clog_op() */
+} clog_t;
+
+clog_t clog[CLOG_LENGTH];
+
+int clog_start = 0;
+int clog_length = 0;
+int clog_id = 0;
+
+/* Add a record to the clog ring; once CLOG_LENGTH records exist, the oldest one is overwritten. */
+void clog_op(int op, void *data1, void *data2)
+{
+	clog_t *rec;
+	int full;
+
+	spin_lock(&clog_lock);
+	full = (clog_length == CLOG_LENGTH);
+	if (!full) {
+		assert("vs-1672", clog_start == 0);
+	}
+	/* a full ring recycles the slot at clog_start, otherwise the first unused slot is taken */
+	rec = &clog[full ? clog_start : clog_length];
+	rec->id = clog_id ++;
+	rec->op = op;
+	rec->pid = current->pid;
+	rec->data1 = data1;
+	rec->data2 = data2;
+	if (full)
+		clog_start = (clog_start + 1) % CLOG_LENGTH;
+	else
+		clog_length ++;
+
+	spin_unlock(&clog_lock);
+}
+
+/* Name of clog operation code @op; "unknown" for a code that has no entry in the table below. */
+static const char *op2str(int op)
+{
+	static const char *op_names[OP_NUM] = {
+		"get-user-page",
+		"put_user-page",
+		"ex-write-in",
+		"ex-write-out",
+		"readp-in",
+		"readp-out",
+		"ex-write-in-nr-locks",
+		"ex-write-out-nr-locks",
+		"link-object",
+		"unlink-object"
+	};
+	assert("vs-1673", op >= 0 && op < OP_NUM);
+	return (op >= 0 && op < OP_NUM && op_names[op] != NULL) ? op_names[op] : "unknown";
+}
+
+/* Print the recorded clog entries, starting with the one at clog_start, then the number of entries. */
+void print_clog(void)
+{
+	const int first = clog_start;
+	int nr;
+
+	for (nr = 0; nr < clog_length; nr ++) {
+		const int slot = (first + nr) % CLOG_LENGTH;
+
+		printk("%d(%d): id %d: pid %d, op %s, data1 %p, data2 %p\n", nr, slot, clog[slot].id,
+		       clog[slot].pid, op2str(clog[slot].op), clog[slot].data1, clog[slot].data2);
+	}
+	printk("clog length %d\n", clog_length);
+}
+
+#if 0
+void
+print_symname(unsigned long address)
+{
+	char         *module;
+	const char   *name;
+	char          namebuf[128];
+	unsigned long offset;
+	unsigned long size;
+
+	name = kallsyms_lookup(address, &size, &offset, &module, namebuf);
+	if (name != NULL)
+		printk("  %s[%lx/%lx]", name, offset, size);
+}
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
Index: linux-2.6.8.1-ck/fs/reiser4/debug.h
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/debug.h	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/debug.h	2004-08-22 19:35:33.623652379 +1000
@@ -0,0 +1,559 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Declarations of debug macros. */
+
+#if !defined( __FS_REISER4_DEBUG_H__ )
+#define __FS_REISER4_DEBUG_H__
+
+#include "forward.h"
+#include "reiser4.h"
+
+
+/* generic function to produce formatted output, decorating it with
+   whatever standard prefixes/postfixes we want. "Fun" is a function
+   that will be actually called, can be printk, panic etc.
+   This is for use by other debugging macros, not by users. */
+#define DCALL(lev, fun, reperr, label, format, ...)		\
+({								\
+	reiser4_print_prefix(lev, reperr, label, 		\
+			     __FUNCTION__, __FILE__, __LINE__);	\
+	fun(lev format "\n" , ## __VA_ARGS__);			\
+})
+
+/*
+ * cause kernel to crash
+ */
+#define reiser4_panic(mid, format, ...)				\
+	DCALL("", reiser4_do_panic, 1, mid, format , ## __VA_ARGS__)
+
+/* print message with indication of current process, file, line and
+   function */
+#define reiser4_log(label, format, ...) 				\
+	DCALL(KERN_DEBUG, printk, 0, label, format , ## __VA_ARGS__)
+
+/* Assertion checked during compilation.
+    If "cond" is false (0) we get duplicate case label in switch.
+    Use this to check something like famous
+       cassert (sizeof(struct reiserfs_journal_commit) == 4096) ;
+    in 3.x journal.c. If cassertion fails you get compiler error,
+    so no "maintainer-id".
+*/
+#define cassert(cond) ({ switch(-1) { case (cond): case 0: break; } })
+
+#define noop   do {;} while(0)
+
+#if REISER4_DEBUG
+/* version of info that only actually prints anything when _d_ebugging
+    is on */
+#define dinfo(format, ...) printk(format , ## __VA_ARGS__)
+/* macro to catch logical errors. Put it into `default' clause of
+    switch() statement. */
+#define impossible(label, format, ...) 			\
+         reiser4_panic(label, "impossible: " format , ## __VA_ARGS__)
+/* assert assures that @cond is true. If it is not, reiser4_panic() is
+   called. Use this for checking logical consistency and _never_ call
+   this to check correctness of external data: disk blocks and user-input . */
+#define assert(label, cond)						\
+({									\
+	/* call_on_each_assert(); */					\
+	if (cond) {						\
+		/* put negated check to avoid using !(cond) that would lose \
+		 * warnings for things like assert(a = b); */		\
+		;							\
+	} else {							\
+		DEBUGON(1);						\
+		reiser4_panic(label, "assertion failed: %s", #cond);	\
+	}								\
+})
+
+/* like assertion, but @expr is evaluated even if REISER4_DEBUG is off. */
+#define check_me( label, expr )	assert( label, ( expr ) )
+
+#define ON_DEBUG( exp ) exp
+
+extern int schedulable(void);
+extern void call_on_each_assert(void);
+
+#else
+
+#define dinfo( format, args... ) noop
+#define impossible( label, format, args... ) noop
+#define assert( label, cond ) noop
+#define check_me( label, expr )	( ( void ) ( expr ) )
+#define ON_DEBUG( exp )
+#define schedulable() might_sleep()
+
+/* REISER4_DEBUG */
+#endif
+
+#if REISER4_DEBUG_SPIN_LOCKS
+/* per-thread information about lock acquired by this thread. Used by lock
+ * ordering checking in spin_macros.h */
+typedef struct lock_counters_info {
+	int rw_locked_tree;
+	int read_locked_tree;
+	int write_locked_tree;
+
+	int rw_locked_dk;
+	int read_locked_dk;
+	int write_locked_dk;
+
+	int rw_locked_cbk_cache;
+	int read_locked_cbk_cache;
+	int write_locked_cbk_cache;
+
+	int rw_locked_zlock;
+	int read_locked_zlock;
+	int write_locked_zlock;
+
+	int spin_locked_jnode;
+	int spin_locked_jload;
+	int spin_locked_txnh;
+	int spin_locked_atom;
+	int spin_locked_stack;
+	int spin_locked_txnmgr;
+	int spin_locked_ktxnmgrd;
+	int spin_locked_fq;
+	int spin_locked_super;
+	int spin_locked_inode_object;
+	int spin_locked_epoch;
+	int spin_locked_super_eflush;
+	int spin_locked;
+	int long_term_locked_znode;
+
+	int inode_sem_r;
+	int inode_sem_w;
+
+	int d_refs;
+	int x_refs;
+	int t_refs;
+} lock_counters_info;
+
+extern lock_counters_info *lock_counters(void);
+#define IN_CONTEXT(a, b) (is_in_reiser4_context() ? (a) : (b))
+
+/* increment lock-counter @counter, if present */
+#define LOCK_CNT_INC(counter) IN_CONTEXT(++(lock_counters()->counter), 0)
+
+/* decrement lock-counter @counter, if present */
+#define LOCK_CNT_DEC(counter) IN_CONTEXT(--(lock_counters()->counter), 0)
+
+/* check that lock-counter is zero. This is for use in assertions */
+#define LOCK_CNT_NIL(counter) IN_CONTEXT(lock_counters()->counter == 0, 1)
+
+/* check that lock-counter is greater than zero. This is for use in
+ * assertions */
+#define LOCK_CNT_GTZ(counter) IN_CONTEXT(lock_counters()->counter > 0, 1)
+
+/* REISER4_DEBUG_SPIN_LOCKS */
+#else
+
+/* no-op versions of the above */
+
+typedef struct lock_counters_info {
+} lock_counters_info;
+#define lock_counters() ((lock_counters_info *)NULL)
+#define LOCK_CNT_INC(counter) noop
+#define LOCK_CNT_DEC(counter) noop
+#define LOCK_CNT_NIL(counter) (1)
+#define LOCK_CNT_GTZ(counter) (1)
+/* REISER4_DEBUG_SPIN_LOCKS */
+#endif
+
+/*
+ * back-trace recording. In several places in reiser4 we want to record stack
+ * back-trace for debugging purposes. This functionality is only supported
+ * when kernel was configured with CONFIG_FRAME_POINTER option.
+ */
+
+#ifdef CONFIG_FRAME_POINTER
+
+/*
+ * how many stack frames to record in back-trace.
+ *
+ * update debug.c:fill_backtrace() if you change this
+ */
+#define REISER4_BACKTRACE_DEPTH (4)
+
+/*
+ * data type to store stack back-trace
+ */
+typedef struct {
+	void *trace[REISER4_BACKTRACE_DEPTH];
+} backtrace_path;
+
+extern void fill_backtrace(backtrace_path *path, int depth, int shift);
+#else
+
+/* no-op versions of the above */
+
+typedef struct {} backtrace_path;
+#define fill_backtrace(path, depth, shift) noop
+
+#endif
+
+
+/* flags controlling debugging behavior. Are set through debug_flags=N mount
+   option. */
+typedef enum {
+	/* print a lot of information during panic. When this is on all jnodes
+	 * are listed. This can be *very* large output. Usually you don't want
+	 * this. Especially over serial line. */
+	REISER4_VERBOSE_PANIC = 0x00000001,
+	/* print a lot of information during umount */
+	REISER4_VERBOSE_UMOUNT = 0x00000002,
+	/* print gathered statistics on umount */
+	REISER4_STATS_ON_UMOUNT = 0x00000004,
+	/* check node consistency */
+	REISER4_CHECK_NODE = 0x00000008
+} reiser4_debug_flags;
+
+extern int reiser4_are_all_debugged(struct super_block *super, __u32 flags);
+extern int reiser4_is_debugged(struct super_block *super, __u32 flag);
+
+extern int is_in_reiser4_context(void);
+
+/*
+ * evaluate expression @e only if within reiser4 context
+ */
+#define ON_CONTEXT(e)	do {			\
+	if(is_in_reiser4_context()) {		\
+		e;				\
+	} } while(0)
+
+/*
+ * evaluate expression @e only when within reiser4_context and debugging is
+ * on.
+ */
+#define ON_DEBUG_CONTEXT( e ) ON_DEBUG( ON_CONTEXT( e ) )
+
+#if REISER4_DEBUG_MODIFY
+/*
+ * evaluate expression @exp only if REISER4_DEBUG_MODIFY mode is on.
+ */
+#define ON_DEBUG_MODIFY( exp ) exp
+#else
+#define ON_DEBUG_MODIFY( exp )
+#endif
+
+/*
+ * complain about unexpected function result and crash. Used in "default"
+ * branches of switch statements and alike to assert that invalid results are
+ * not silently ignored.
+ */
+#define wrong_return_value( label, function )				\
+	impossible( label, "wrong return value from " function )
+
+/* Issue warning message to the console */
+#define warning( label, format, ... )					\
+	DCALL( KERN_WARNING, 						\
+	       printk, 1, label, "WARNING: " format , ## __VA_ARGS__ )
+
+/* mark not yet implemented functionality */
+#define not_yet( label, format, ... )				\
+	reiser4_panic( label, "NOT YET IMPLEMENTED: " format , ## __VA_ARGS__ )
+
+#if REISER4_TRACE
+/* helper macro for tracing, see trace_stamp() below. */
+#define IF_TRACE(flags, e) 							\
+	if(get_current_trace_flags() & (flags)) e
+#else
+#define IF_TRACE(flags, e) noop
+#endif
+
+/* just print where we are: file, function, line */
+#define trace_stamp( f )   IF_TRACE( f, reiser4_log( "trace", "" ) )
+/* print value of "var" */
+#define trace_var( f, format, var ) 				\
+        IF_TRACE( f, reiser4_log( "trace", #var ": " format, var ) )
+/* print output only if appropriate trace flag(s) is on */
+#define ON_TRACE( f, ... )   IF_TRACE(f, printk(__VA_ARGS__))
+
+/* tracing flags. */
+typedef enum {
+	/* trace nothing */
+	NO_TRACE = 0,
+	/* trace vfs interaction functions from vfs_ops.c */
+	TRACE_VFS_OPS = (1 << 0),	/* 0x00000001 */
+	/* trace plugin handling functions */
+	TRACE_PLUGINS = (1 << 1),	/* 0x00000002 */
+	/* trace tree traversals */
+	TRACE_TREE = (1 << 2),	/* 0x00000004 */
+	/* trace znode manipulation functions */
+	TRACE_ZNODES = (1 << 3),	/* 0x00000008 */
+	/* trace node layout functions */
+	TRACE_NODES = (1 << 4),	/* 0x00000010 */
+	/* trace directory functions */
+	TRACE_DIR = (1 << 5),	/* 0x00000020 */
+	/* trace flush code verbosely */
+	TRACE_FLUSH_VERB = (1 << 6),	/* 0x00000040 */
+	/* trace flush code */
+	TRACE_FLUSH = (1 << 7),	/* 0x00000080 */
+	/* trace carry */
+	TRACE_CARRY = (1 << 8),	/* 0x00000100 */
+	/* trace how tree (web) of znodes is maintained through tree
+	   balancings. */
+	TRACE_ZWEB = (1 << 9),	/* 0x00000200 */
+	/* trace transactions. */
+	TRACE_TXN = (1 << 10),	/* 0x00000400 */
+	/* trace object id allocation/releasing */
+	TRACE_OIDS = (1 << 11),	/* 0x00000800 */
+	/* trace item shifts */
+	TRACE_SHIFT = (1 << 12),	/* 0x00001000 */
+	/* trace page cache */
+	TRACE_PCACHE = (1 << 13),	/* 0x00002000 */
+	/* trace extents */
+	TRACE_EXTENTS = (1 << 14),	/* 0x00004000 */
+	/* trace locks */
+	TRACE_LOCKS = (1 << 15),	/* 0x00008000 */
+	/* trace coords */
+	TRACE_COORDS = (1 << 16),	/* 0x00010000 */
+	/* trace read-IO functions */
+	TRACE_IO_R = (1 << 17),	/* 0x00020000 */
+	/* trace write-IO functions */
+	TRACE_IO_W = (1 << 18),	/* 0x00040000 */
+
+	/* trace log writing */
+	TRACE_LOG = (1 << 19),	/* 0x00080000 */
+
+	/* trace journal replaying */
+	TRACE_REPLAY = (1 << 20),	/* 0x00100000 */
+
+	/* trace space allocation */
+	TRACE_ALLOC = (1 << 21),	/* 0x00200000 */
+
+	/* trace space reservation */
+	TRACE_RESERVE = (1 << 22),	/* 0x00400000 */
+
+	/* trace emergency flush */
+	TRACE_EFLUSH  = (1 << 23),	/* 0x00800000 */
+
+	/* trace ctails */
+	TRACE_CTAIL = (1 << 24),       /* 0x01000000 */
+
+	TRACE_PARSE = (1 << 25),       /* 0x02000000 */
+
+	TRACE_CAPTURE_COPY = (1 << 26), /* 0x04000000 */
+
+	TRACE_EXTENT_ALLOC = (1 << 27),      /* 0x08000000 */
+
+	TRACE_CAPTURE_ANONYMOUS = (1 << 28), /* 0x10000000 */
+
+	/* vague section: used to trace bugs. Use it to issue optional prints
+	   at arbitrary points of code. */
+	TRACE_BUG = (1 << 31),	/* 0x80000000 */
+
+	/* trace everything above */
+	TRACE_ALL = 0xffffffffu
+} reiser4_trace_flags;
+
+#if REISER4_LOG
+/* helper macro for tracing, see trace_stamp() below. */
+#define IF_LOG(flags, e) 							\
+	if(get_current_log_flags() & (flags)) e
+#else
+#define IF_LOG(flags, e) noop
+#endif
+
+/* log only if appropriate log flag(s) is on */
+#define ON_LOG( f, ... )   IF_LOG(f, printk(__VA_ARGS__))
+
+typedef enum {
+	WRITE_NODE_LOG = (1 << 0),      /* log [zj]node operations */
+	WRITE_PAGE_LOG = (1 << 1),	/* log make_extent calls */
+	WRITE_IO_LOG = (1 << 2), 	/* log i/o requests */
+	WRITE_TREE_LOG = (1 << 3), 	/* log internal tree operations */
+	WRITE_SYSCALL_LOG = (1 << 4),   /* log system calls */
+	READAHEAD_LOG = (1 << 5),       /* log read-ahead activity */
+	ALLOC_EXTENT_LOG = (1 << 6),    /* log extent allocation */
+	LOG_FILE_PAGE_EVENT = (1 << 7)	/* log events happened to certain file */
+} reiser4_log_flags;
+
+
+extern void reiser4_do_panic(const char *format, ...)
+__attribute__ ((noreturn, format(printf, 1, 2)));
+
+extern void reiser4_print_prefix(const char *level, int reperr, const char *mid,
+				 const char *function,
+				 const char *file, int lineno);
+
+extern int preempt_point(void);
+extern void reiser4_print_stats(void);
+
+extern void *reiser4_kmalloc(size_t size, int gfp_flag);
+extern void reiser4_kfree(void *area);
+extern void reiser4_kfree_in_sb(void *area, struct super_block *sb);
+extern __u32 get_current_trace_flags(void);
+extern __u32 get_current_log_flags(void);
+extern __u32 get_current_oid_to_log(void);
+
+#if REISER4_DEBUG_OUTPUT && REISER4_DEBUG_SPIN_LOCKS
+extern void print_lock_counters(const char *prefix,
+				const lock_counters_info * info);
+extern int no_counters_are_held(void);
+extern int commit_check_locks(void);
+#else
+#define print_lock_counters(p, i) noop
+#define no_counters_are_held() (1)
+#define commit_check_locks() (1)
+#endif
+
+#define REISER4_STACK_ABORT          (8192 - sizeof(struct thread_info) - 30)
+#define REISER4_STACK_GAP            (REISER4_STACK_ABORT - 100)
+
+#if REISER4_DEBUG_MEMCPY
+extern void *xmemcpy(void *dest, const void *src, size_t n);
+extern void *xmemmove(void *dest, const void *src, size_t n);
+extern void *xmemset(void *s, int c, size_t n);
+#else
+#define xmemcpy( d, s, n ) memcpy( ( d ), ( s ), ( n ) )
+#define xmemmove( d, s, n ) memmove( ( d ), ( s ), ( n ) )
+#define xmemset( s, c, n ) memset( ( s ), ( c ), ( n ) )
+#endif
+
+/* true if @i is power-of-two or zero (at most one bit set). Useful for rate-limited warnings, etc. */
+#define IS_POW(i) 				\
+({						\
+	typeof(i) __i;				\
+						\
+	__i = (i);				\
+	!(__i & (__i - 1));			\
+})
+
+#define KERNEL_DEBUGGER (1)
+
+#if KERNEL_DEBUGGER
+/*
+ * Check condition @cond and drop into kernel debugger (kgdb) if it's true. If
+ * kgdb is not compiled in, do nothing.
+ */
+#define DEBUGON(cond)				\
+({						\
+	extern void debugtrap(void);		\
+						\
+	if (unlikely(cond))			\
+		debugtrap();			\
+})
+#else
+#define DEBUGON(cond) noop
+#endif
+
+/*
+ * Error code tracing facility. (Idea is borrowed from XFS code.)
+ *
+ * Suppose some strange and/or unexpected code is returned from some function
+ * (for example, write(2) returns -EEXIST). It is possible to place a
+ * breakpoint in the reiser4_write(), but it is too late here. How to find out
+ * in what particular place -EEXIST was generated first?
+ *
+ * In reiser4 all places where actual error codes are produced (that is,
+ * statements of the form
+ *
+ *     return -EFOO;        // (1), or
+ *
+ *     result = -EFOO;      // (2)
+ *
+ * are replaced with
+ *
+ *     return RETERR(-EFOO);        // (1a), and
+ *
+ *     result = RETERR(-EFOO);      // (2a) respectively
+ *
+ * RETERR() macro fills a backtrace in reiser4_context. This back-trace is
+ * printed in error and warning messages. Moreover, it's possible to put a
+ * conditional breakpoint in return_err (low-level function called by RETERR()
+ * to do the actual work) to break into debugger immediately when particular
+ * error happens.
+ *
+ */
+
+#if REISER4_DEBUG
+
+/*
+ * data-type to store information about where error happened ("error site").
+ */
+typedef struct err_site {
+	backtrace_path path; /* stack back trace of error */
+	int            code; /* error code */
+	const char    *file; /* source file, filled by __FILE__ */
+	int            line; /* source file line, filled by __LINE__ */
+} err_site;
+
+extern void return_err(int code, const char *file, int line);
+extern void report_err(void);
+
+/*
+ * fill &get_current_context()->err_site with error information.
+ */
+#define RETERR(code) 				\
+({						\
+	typeof(code) __code;			\
+						\
+	__code = (code);			\
+	return_err(__code, __FILE__, __LINE__);	\
+	__code;					\
+})
+
+#else
+
+/*
+ * no-op versions of the above. RETERR() still yields @code as a single
+ * parenthesized expression, so it composes exactly like the debugging version.
+ */
+typedef struct err_site {} err_site;
+#define RETERR(code) (code)
+#define report_err() noop
+#endif
+
+#if REISER4_LARGE_KEY
+/*
+ * conditionally compile arguments only if REISER4_LARGE_KEY is on.
+ */
+#define ON_LARGE_KEY(...) __VA_ARGS__
+#else
+#define ON_LARGE_KEY(...)
+#endif
+
+#if REISER4_ALL_IN_ONE
+/*
+ * declarator used by REISER4_ALL_IN_ONE mode. Every reiser4 function that is
+ * not used externally (that is, not used by non-reiser4 code) should be
+ * tagged with this. Normally it expands to nothing. In REISER4_ALL_IN_ONE
+ * expands to statics allowing compiler to perform better optimization.
+ */
+#define reiser4_internal static
+#else
+#define reiser4_internal
+#endif
+
+/* operations to clog */
+/* debugging re-entrance */
+
+#define GET_USER_PAGES 0
+#define PUT_USER_PAGES 1
+#define EXTENT_WRITE_IN 2
+#define EXTENT_WRITE_OUT 3
+#define READPAGE_IN 4
+#define READPAGE_OUT 5
+#define EXTENT_WRITE_IN2 6
+#define EXTENT_WRITE_OUT2 7
+#define LINK_OBJECT 8
+#define UNLINK_OBJECT 9
+
+#define OP_NUM 10
+
+void clog_op(int op, void *, void *);
+void print_clog(void);
+
+/* __FS_REISER4_DEBUG_H__ */
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
Index: linux-2.6.8.1-ck/fs/reiser4/dformat.h
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/dformat.h	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/dformat.h	2004-08-22 19:35:33.624652219 +1000
@@ -0,0 +1,164 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Formats of on-disk data and conversion functions. */
+
+/* put all item formats in the files describing the particular items,
+   our model is, everything you need to do to add an item to reiser4,
+   (excepting the changes to the plugin that uses the item which go
+   into the file defining that plugin), you put into one file. */
+/* Data on disk are stored in little-endian format.
+   To declare fields of on-disk structures, use d8, d16, d32 and d64.
+   d??tocpu() and cputod??() to convert. */
+
+#if !defined( __FS_REISER4_DFORMAT_H__ )
+#define __FS_REISER4_DFORMAT_H__
+
+
+#include <asm/byteorder.h>
+#include <asm/unaligned.h>
+#include <linux/types.h>
+
+/* our default disk byteorder is little endian */
+
+#if defined( __LITTLE_ENDIAN )
+#define CPU_IN_DISK_ORDER  (1)
+#else
+#define CPU_IN_DISK_ORDER  (0)
+#endif
+
+/* code on-disk data-types as structs with a single field
+   to rely on compiler type-checking. Like include/asm-i386/page.h */
+typedef struct d8 {
+	__u8 datum;
+} d8 __attribute__ ((aligned(1)));
+typedef struct d16 {
+	__u16 datum;
+} d16 __attribute__ ((aligned(2)));
+typedef struct d32 {
+	__u32 datum;
+} d32 __attribute__ ((aligned(4)));
+typedef struct d64 {
+	__u64 datum;
+} d64 __attribute__ ((aligned(8)));
+
+#define PACKED __attribute__((packed))
+
+/* fetch 8-bit on-disk value @ondisk; no byte-order conversion is applied */
+static inline __u8 d8tocpu(const d8 *ondisk)
+{
+	return (*ondisk).datum;
+}
+
+/* load little-endian 16-bit on-disk value @ondisk (unaligned access is fine), return it in CPU order */
+static inline __u16 d16tocpu(const d16 *ondisk)
+{
+	return __le16_to_cpu(get_unaligned(&(*ondisk).datum));
+}
+
+/* load little-endian 32-bit on-disk value @ondisk (unaligned access is fine), return it in CPU order */
+static inline __u32 d32tocpu(const d32 *ondisk)
+{
+	return __le32_to_cpu(get_unaligned(&(*ondisk).datum));
+}
+
+/* load little-endian 64-bit on-disk value @ondisk (unaligned access is fine), return it in CPU order */
+static inline __u64 d64tocpu(const d64 *ondisk)
+{
+	return __le64_to_cpu(get_unaligned(&(*ondisk).datum));
+}
+
+/* store @oncpu (asserted to fit into 8 bits) into on-disk location @ondisk; returns @ondisk */
+static inline d8 *cputod8(unsigned int oncpu, d8 *ondisk)
+{
+	assert("nikita-1264", oncpu < 0x100);
+
+	put_unaligned(oncpu, &(*ondisk).datum);
+	return ondisk;
+}
+
+/* store @oncpu (asserted to fit into 16 bits) as little-endian value into @ondisk; returns @ondisk */
+static inline d16 *cputod16(unsigned int oncpu, d16 *ondisk)
+{
+	assert("nikita-1265", oncpu < 0x10000);
+
+	put_unaligned(__cpu_to_le16(oncpu), &(*ondisk).datum);
+	return ondisk;
+}
+
+/* store 32-bit @oncpu as little-endian value into (possibly unaligned) @ondisk; returns @ondisk */
+static inline d32 *cputod32(__u32 oncpu, d32 *ondisk)
+{
+	__u32 le = __cpu_to_le32(oncpu);
+	put_unaligned(le, &ondisk->datum);
+	return ondisk;
+}
+
+/* store 64-bit @oncpu as little-endian value into (possibly unaligned) @ondisk; returns @ondisk */
+static inline d64 *cputod64(__u64 oncpu, d64 *ondisk)
+{
+	__u64 le = __cpu_to_le64(oncpu);
+	put_unaligned(le, &ondisk->datum);
+	return ondisk;
+}
+
+/* data-type for block number on disk: these types enable changing the block
+   size to other sizes, but they are only a start.  Suppose we wanted to
+   support 48bit block numbers.  The dblock_nr blk would be changed to "short
+   blk[3]".  The block_nr type should remain an integral type greater or equal
+   to the dblock_nr type in size so that CPU arithmetic operations work. */
+typedef __u64 reiser4_block_nr;
+
+/* data-type for block number on disk, disk format */
+union reiser4_dblock_nr {
+	d64 blk;
+};
+
+/* convert on-disk block number @dblock into its CPU representation */
+static inline reiser4_block_nr dblock_to_cpu(const reiser4_dblock_nr *dblock)
+{
+	return d64tocpu(&(*dblock).blk);
+}
+
+/* store CPU block number @block into on-disk location @dblock */
+static inline void cpu_to_dblock(reiser4_block_nr block, reiser4_dblock_nr *dblock)
+{
+	(void) cputod64(block, &(*dblock).blk);
+}
+
+/* compare two disk addresses: returns non-zero iff block numbers *b1 and *b2
+   are bytewise identical. Both pointers are asserted to be non-NULL. */
+static inline int
+disk_addr_eq(const reiser4_block_nr * b1, const reiser4_block_nr * b2)
+{
+	int differ;
+
+	assert("nikita-1033", b1 != NULL);
+	assert("nikita-1266", b2 != NULL);
+
+	differ = memcmp(b1, b2, sizeof *b1);
+
+	return differ == 0;
+}
+
+/* structure of master reiser4 super block */
+typedef struct reiser4_master_sb {
+	char magic[16];		/* "ReIsEr4" */
+	d16 disk_plugin_id;	/* id of disk layout plugin */
+	d16 blocksize;
+	char uuid[16];		/* unique id */
+	char label[16];		/* filesystem label */
+	d64 diskmap;		/* location of the diskmap. 0 if not present */
+} reiser4_master_sb;
+
+/* __FS_REISER4_DFORMAT_H__ */
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
Index: linux-2.6.8.1-ck/fs/reiser4/diskmap.c
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/diskmap.c	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/diskmap.c	2004-08-22 19:35:33.624652219 +1000
@@ -0,0 +1,76 @@
+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
+/* Functions to deal with diskmap storage - read-only storage (currently can only be
+   set via fs-creation process) for use by various plugins */
+
+
+#include "debug.h"
+#include "super.h"
+#include "diskmap.h"
+
+#include <linux/fs.h>
+
+/* Looks through chain of diskmap blocks, looking for table entry whose label and parameter
+   match those passed in "label" and "parameter"; on success the entry's value is stored in *value.
+   Returns 0 on success, -1 if nothing was found or error has occurred. */
+reiser4_internal int
+reiser4_get_diskmap_value( u32 label, u32 parameter, u64 *value)
+{
+	struct super_block *sb = reiser4_get_current_sb();
+	int retval = -1;
+
+	assert("green-2006", label != REISER4_FIXMAP_END_LABEL && label != REISER4_FIXMAP_NEXT_LABEL);
+
+	if ( get_super_private(sb)->diskmap_block ) { /* If there is diskmap table, we need to read and parse it */
+		struct buffer_head *diskmap_bh;
+		struct reiser4_diskmap *diskmap;
+		const unsigned int slots = (sb->s_blocksize - sizeof(diskmap->magic)) / sizeof(diskmap->table[0]);
+		unsigned int i = 0;
+
+		diskmap_bh = sb_bread(sb, get_super_private(sb)->diskmap_block);
+search_table:
+		if ( !diskmap_bh ) {
+			warning("green-2005", "Cannot read diskmap while doing bitmap checks");
+			return -1;
+		}
+
+		diskmap = (struct reiser4_diskmap *) diskmap_bh->b_data;
+		if ( strncmp(diskmap->magic, REISER4_FIXMAP_MAGIC, sizeof(REISER4_FIXMAP_MAGIC)-1 ) ) {
+			/* Wrong magic */
+			brelse(diskmap_bh);
+			warning("green-2004", "diskmap is specified, but its magic is wrong");
+			return -1;
+		}
+
+		/* Entries are sorted by label, then by parameter: skip all entries that sort before the one we
+		   are looking for. @slots entries fit into a block; index is checked before table is read. */
+		while ((i < slots) &&
+		       ((d32tocpu(&diskmap->table[i].label) < label) ||
+			((d32tocpu(&diskmap->table[i].label) == label) &&
+			 (d32tocpu(&diskmap->table[i].parameter) < parameter))))
+			i++;
+
+		if ( i >= slots ) {
+			warning("green-2004", "diskmap block %Ld is not properly terminated", (long long)diskmap_bh->b_blocknr);
+			brelse(diskmap_bh);
+			return -1;
+		}
+
+		/* Is this last entry in current table that holds disk block with more data ? */
+		if ( d32tocpu(&diskmap->table[i].label) == REISER4_FIXMAP_NEXT_LABEL ) { /* Need to load next diskmap block */
+			sector_t next_diskmap_block = d64tocpu(&diskmap->table[i].value);
+			brelse(diskmap_bh);
+			diskmap_bh = sb_bread(sb, next_diskmap_block);
+			i = 0;
+			goto search_table;
+		}
+
+		/* See if we have found table entry we are looking for */
+		if ( (d32tocpu(&diskmap->table[i].label) == label) && (d32tocpu(&diskmap->table[i].parameter) == parameter) ) {
+			*value = d64tocpu(&diskmap->table[i].value);
+			retval = 0;
+		}
+		brelse(diskmap_bh);
+	}
+
+	return retval;
+}
Index: linux-2.6.8.1-ck/fs/reiser4/diskmap.h
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/diskmap.h	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/diskmap.h	2004-08-22 19:35:33.625652060 +1000
@@ -0,0 +1,52 @@
+#if !defined (__REISER4_DISKMAP_H__)
+#define __REISER4_DISKMAP_H__
+
+/*
+ * Disk map.
+ *
+ * Disk map is a special data structure used by reiser4 as an optional
+ * "anchor" of other meta-data. That is, disk map (if present) may contain
+ * disk addresses of the rest of meta-data for this file system: master
+ * super-block, bitmaps, journal header and footer, etc. Disk map is used to
+ * avoid dependency on fixed disk addresses, with the following goals:
+ *
+ *     1. allow users to experiment with tuning their file system layout, and,
+ *     more importantly,
+ *
+ *     2. allow reiser4 to survive bad blocks in critical disk locations.
+ *
+ * That is, disk map allows to "relocate" meta-data structures if their
+ * default disk addresses are not accessible.
+ *
+ * More generally, disk map can be used as a generic table used to store
+ * persistent parameters.
+ *
+ * Currently disk map is read-only for the kernel. It can only be
+ * constructed/modified by user-level utilities.
+ *
+ */
+
+#include "dformat.h"
+
+#define REISER4_FIXMAP_MAGIC "R4FiXMaPv1.0"
+
+#define REISER4_FIXMAP_END_LABEL -2
+#define REISER4_FIXMAP_NEXT_LABEL -1
+
+/* This is diskmap table, its entries must be sorted ascending first in label
+   order, then in parameter order.  End of table is marked with label
+   REISER4_FIXMAP_END_LABEL; label REISER4_FIXMAP_NEXT_LABEL means that value
+   in this row contains disk block of next diskmap in diskmaps chain */
+struct reiser4_diskmap {
+	char magic[16];
+	struct {
+		d32 label;
+		d32 parameter;
+		d64 value;
+	} table[0];
+};
+
+int reiser4_get_diskmap_value(u32, u32, u64 *);
+
+
+#endif
Index: linux-2.6.8.1-ck/fs/reiser4/doc/bk.HOWTO
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/doc/bk.HOWTO	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/doc/bk.HOWTO	2004-08-22 19:35:33.625652060 +1000
@@ -0,0 +1,192 @@
+
+	   MAINTENANCE OF REISER4 BITKEEPER REPOSITORY FOR THE CORE KERNEL.
+
+								   OVERVIEW
+
+Reiser4 receives linux kernel code from http://linux.bkbits.net/linux-2.5.
+
+This repository is pulled into laputa:~god/src/bk-linux-2.5 (BK-LINUX-2.5) by
+nightly cron job (~god/bin/update-src.sh). BK-LINUX-2.5 is only used as a
+local copy of Linus' repository, no changes are made in it.
+
+BK-LINUX-2.5 has child repository laputa:~god/projects/limbo (LIMBO), where it
+is merged with the set of patches to core kernel that are necessary for
+reiser4 to work.
+
+These patches are maintained through Andrew Morton's patch-scripts
+(http://www.zipworld.com.au/~akpm/linux/patches/). Local and slightly modified
+version of patch-scripts is installed at laputa:~god/run/patch-scripts/. See
+laputa:~god/run/patch-scripts/docco.txt for more detailed usage instructions.
+
+patch-scripts are needed, because reiser4 modifications to the core kernel
+should be available as a set of meaningful separately documented patches
+rather than as one single huge patch (that would result from just accumulating
+all changes in the bitkeeper).
+
+Patches themselves are stored in bitkeeper repository
+laputa:~god/projects/core-patches (CORE-PATCHES).
+
+New versions of the core kernel are pulled into LIMBO, merged with
+CORE-PATCHES, conflicts are resolved. This repository is cloned into temporary
+repositories to test resulting kernel. After testing LIMBO is cloned/pulled
+into thebsh:/usr/home/bk/reiser4-linux-2.6 (REISER4-LINUX-2.6). From there
+individual developers clone/pull it to their heart's content.
+
+							 UPGRADE INSTRUCTIONS
+
+1. backup LIMBO:
+
+$ cd ~god/projects
+$ mv limbo limbo.orig
+
+2. clone BK-LINUX-2.5 into LIMBO
+
+$ bk clone ~god/src/bk-linux-2.5 limbo
+$ cd limbo
+$ bk -r edit -q
+
+3. roll LIMBO back to the desired kernel version: after a clone LIMBO contains
+some version of Linus' repository, but usually we want a repository
+corresponding to the exact kernel version. Use bk changes -L to find changeset
+number corresponding to the desired kernel version, then do bk undo -a<rev.no>
+to move the repository to that version.
+
+4. graft CORE-PATCHES into LIMBO
+
+$ bk clone ~god/projects/core-patches patches
+$ cd patches
+$ bk -r edit -q
+$ cd ../
+$ . patches/setpc # set patch-script variables
+
+5. check status of core patches:
+
+$ pstatus # patch-scripts utility
+1:a:2.6.6-mm2 Needs changelog
+2:a:all-sources.diff Needs changelog
+    ...
+35:a:disable-vermagic Needs changelog
+36:a:make-4kstack-option Needs changelog
+37:a:radix_tree_lookup_slot Needs changelog
+?:-:2.6.6-rc3-mm2 Needs changelog
+?:-:do_mmap2-fix.diff Needs changelog
+    ...
+
+Patches marked with ":a:" are applied (list of all currently applied patches
+is in patches/applied-patches). Patches marked with ":-:" are not
+applied. Patches with "?" (not numbered) are not included into "series"
+(patches/series file)---these are usually some old or testing patches no
+longer used.
+
+So, above pstatus output shows that there are 37 patches in the current
+series, all applied. This is normal situation. LIMBO and CORE-PATCHES
+repositories should always be left in such state after upgrading.
+
+6. Refresh core-patches.
+
+$ echo > patches/applied-patches # pretend patches are not applied
+
+Now for each patch do
+
+$ pushpatch # this applies next patch in series
+
+If patch could not be applied successfully, "force it":
+
+$ pushpatch -f # this forces patch and generates .rej and .orig files
+
+Go through all generated .rej and .orig, resolve all conflicts, update
+sources. Delete .rej and .orig files.
+
+Independently of whether patch was applied successfully or not, finish its
+processing by refreshing it:
+
+$ refpatch # refresh current patch
+
+Repeat above pushpatch/refpatch sequence until patch-scripts report that
+"Series are fully applied".
+
+7. Commit changes. Simplest way to do this is to do bk citool in LIMBO. But
+this is time-consuming. Alternatively do:
+
+$ bk -r delta -a -q -y"local reiser4 patches applied"
+$ bk commit -q -y"local reiser4 patches applied"
+$ cd patches
+$ bk citool # revise and commit modifications to the local patches
+$ bk push # push modifications to the CORE-PATCHES
+$ cd ..
+
+8. Test resulting sources
+
+$ cd ~god/projects/tmp
+$ bk clone ../limbo
+
+and then follow standard build procedure: clone reiser4 repository into
+tmp/limbo/fs, configure, build, etc.
+
+9. Pull changes to thebsh
+
+$ ssh thebsh
+$ cd /usr/home/bk
+$ mv reiser4-linux-2.6 reiser4-linux-2.6.orig
+$ bk clone god@laputa:/home/god/projects/limbo reiser4-linux-2.6
+
+If everything is ok, remove backup repositories limbo.orig and
+reiser4-linux-2.6.orig.
+
+							   ADDING NEW PATCH
+
+There are two versions of adding-new-patch procedure: first for the "in-order"
+patches, second for the "external" patches, like -mm patch.
+
+								 In-order patch
+
+1. Prepare repositories:
+
+$ cd ~god/projects/limbo
+$ bk -r edit -q
+$ cd patches
+$ bk -r edit -q
+$ cd ..
+
+2. As was mentioned above, all patches in the series should be already
+applied. Put new patch into "patches":
+
+$ mv /tmp/new-patch.diff patches/patches/new-patch.patch
+
+.patch suffix is mandatory!
+
+$ pcpatch new-patch # this generates patches/pc/new-patch.pc
+$ vi patches/txt/new-patch.txt # add patch description
+$ echo new-patch.patch >> patches/series # add patch to the series file
+$ pushpatch # apply it.
+
+If patch couldn't be applied, "force" it and resolve conflicts, see above.
+
+$ refpatch # and refresh it.
+
+This again leaves repositories in the consistent state (all patches in the
+series are applied).
+
+3. Commit changes, don't forget to add new files to the CORE-PATCHES. See
+above.
+
+								External patch
+
+External patch (such as combined patch for -mm) sometimes has to be added at
+the beginning of series (rather than at the end as small patches are). Such
+patches are best added during upgrading. Specifically, step 6 becomes:
+
+6. Refresh core-patches.
+
+$ echo > patches/applied-patches # pretend patches are not applied
+$ cp /tmp/external.patch patches/patches
+$ pcpatch external
+$ vi patches/txt/external.txt
+$ vi patches/series # add "external.patch" at the appropriate place
+
+Proceed with pushpatch/refpatch as usual.
+
+To remove patch from series (for example, when upgrading to the new -mm
+kernel), just kill appropriate line in the patches/series.
+
+Nikita. 2004.05.25
Index: linux-2.6.8.1-ck/fs/reiser4/doc/directory-service
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/doc/directory-service	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/doc/directory-service	2004-08-22 19:35:33.626651900 +1000
@@ -0,0 +1,203 @@
+
+					  DIRECTORY SERVICE IN REISER4
+
+Directory is mapping from file name to file itself. This mapping is
+implemented through reiser4 internal balanced tree. Single global tree
+is used as global index of all directories as opposed to having tree per
+directory. Unfortunately file names cannot be used as keys until keys of
+variable length are implemented, or unreasonable limitations on maximal
+file name length are imposed. To work around this file name is hashed
+and hash is used as key in a tree. No hash function is perfect and there
+will always be hash collisions, that is, file names having the same value of
+a hash. Previous versions of reiserfs (3.5 and 3.6) used "generation
+counter" to overcome this problem: keys for file names having the same
+hash value were distinguished by having different generation
+counters. This allowed to amortize hash collisions at the cost of
+reducing number of bits used for hashing. This "generation counter"
+technique is actually some ad hoc form of support for non-unique
+keys. Keeping in mind that some form of this has to be implemented
+anyway, it seems justifiable to implement more regular support for
+non-unique keys in reiser4.
+
+NON-UNIQUE KEYS
+
+1.
+
+Non-unique keys require changes in both tree lookup and tree update
+code. In addition some new API to iterate through items with identical
+keys is required.
+
+Before going into detail let's note that non-unique keys weaken the
+traditional search tree invariant. In a search tree with unique keys, keys of
+all items in a left sub-tree of given delimiting key are less than, and
+in the right sub-tree greater than or equal to the said key. In a search
+tree with non-unique keys both inequalities are non-strict.
+
+2.
+
+Tree lookups: we require that node layout ->lookup() methods always
+return leftmost item with the key looked for. The same for item
+->lookup() method for items supporting units with non-unique
+keys. Standard node40 layout plugin handles this, see
+fs/reiser4/plugin/node/node40.c:node40_lookup().
+
+3.
+
+Tree balancing: it seems that only change here is the handling of
+weakened search tree invariant. This can be gathered from the
+observation that balancing never even compares keys, only tests them for
+equality. More thought/research is required though. Looking at the
+existing implementations (like Berkeley db) would be useful also.
+
+4.
+
+Iteration through items/units with identical keys. There are two
+interfaces to iterating abstraction known as "external" (also known as
+"enumeration") and "internal" iterators.
+
+External iterator:
+
+external_iterator {
+  start();
+  next();
+  has_more_p();
+};
+
+external_iterator eit;
+
+for( eit.start() ; eit.has_more_p() ; ) {
+    object = eit.next();
+    ... do stuff with object ...
+}
+
+Internal iterator:
+
+internal_iterator {
+    iterate( int ( *function )( object *obj ) );
+};
+
+internal_iterator iit;
+
+int do_stuff( object *obj )
+{
+   ... do stuff with obj ...
+}
+
+iit.iterate( &do_stuff );
+
+External iterators seem easier to use, but they are known to be hard to
+implement, especially for complex data-structures like trees (this is
+because of the amount of state that should be maintained in "eit"
+between its invocations).
+
+Internal iterators are harder to use in C, because new function has to
+be declared to perform actions on objects in sequence, but are obviously
+easier to implement.
+
+Given that in 4.0 version there will be only one client of this
+iteration API (viz. directory lookup routine), it seems that internal
+style is preferable for now. Later, external iterator interface can be
+added if necessary.
+
+IMPLEMENTATION OF DIRECTORIES:
+
+1.
+
+There will be many various directory services implemented through
+different plugins. Default directory plugin uses hashing techniques
+described above. Let's code-name it hdir.
+
+2.
+
+Directory consists of directory entries, stored in a tree in a form of
+directory items. Question about whether each directory entry should be
+separate item or they can be compressed into items is left open by now.
+First this decision is purely per-plugin decidable, second, compression
+is good for performance, but harder to implement.
+
+Single directory entry is binding between file-system object and
+directory. In hdir plugin it consists of full name of a file bound and
+key (or part thereof) of file's stat-data:
+
+typedef struct hdir_entry {
+    /**
+     * key of object stat-data. It's not necessary to store
+     * whole key here, because it's always key of stat-data, so minor packing
+     * locality and offset can be omitted here. But this relies on
+     * particular key allocation scheme for stat-data, so, for extensibility
+     * sake, whole key can be stored here.
+     *
+     * We store key as array of bytes, because we don't want 8-byte alignment
+     * of dir entries.
+     */
+    d8 sdkey[ sizeof( reiser4_key ) ];
+    /**
+     * file name. Null terminated string.
+     */
+    d8 name[ 0 ];
+} hdir_entry;
+
+3.
+
+On creation/linking/lookup of object "bar" in directory "foo" (foo/bar),
+we compose key of directory entry for this object. Key has the form
+
+/*
+ * XXX this should be discussed
+ */
+dirent_k = (locality=foo_object_id, objectid=???, offset=hash("bar"));
+
+Major packing locality of dirent_k is set to foo_object_id so that all
+objects (files) in this directory and their bodies are close to
+respective directory entries.
+
+It seems that no single key allocation policy for directory entries fits
+everyone's needs, so, this can be implemented as method of directory
+plugin. Nonetheless, choice of default key allocation policy is still
+important decision, although not as important as in plugin-less
+file-system.
+
+4.
+
+Function
+
+int hdir_find_entry( inode *dir, const hdir_entry *entry,
+                     tween_coord *coord, lock_handle *lh );
+
+iterates through all directory entries in @dir that have the same key as
+@entry (scans hash-bucket), looking for exact match for entry->name.
+
+5.
+
+During ->create()/->link() hdir_find_entry() is used to find place to insert new
+item (and to check for -EEXIST).
+
+During ->lookup() hdir_find_entry() is used to find entry for the file
+being looked for and to load stat-data afterwards.
+
+During ->unlink() hdir_find_entry() is used to find unit/item to be
+removed.
+
+NOTE ON ->lookup():
+
+VFS implements following protocol when creating new
+file (fs/namei.c:open_namei()):
+
+dentry hash is searched. If search is unsuccessful, file system
+->lookup() is called.
+If lookup didn't find name, call ->create()
+
+While this protocol spares file system from dealing with dcache locking,
+for reiserfs it means that tree traversal is performed twice during file
+creation/deletion. Possible solution is to cache results of ->lookup()
+(e.g., pointer to znode) in dentry and reuse them in ->create(). On the
+other hand, point cache has more or less the same effect and is more
+general.
+
+
+^ Local variables:
+^ mode-name: "Design Document"
+^ indent-tabs-mode: nil
+^ tab-width: 4
+^ eval: (progn (flyspell-mode) (flyspell-buffer))
+^ End:
Index: linux-2.6.8.1-ck/fs/reiser4/doc/lock-ordering
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/doc/lock-ordering	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/doc/lock-ordering	2004-08-22 19:35:33.627651741 +1000
@@ -0,0 +1,601 @@
+---------------------------------INTRODUCTION-----------------------------------
+
+This document tries to provide concise description of various "locking" issues
+in reiser4 code. There are two major areas here:
+
+1. locking as a device for the concurrency control: various synchronization
+objects are used to maintain integrity of shared data structures.
+
+2. (induced by the former) deadlocks, livelocks, missed wake ups, and the like.
+
+"Locks" above means both standard synchronization primitives like mutexes,
+semaphores, condition variables and so on, and any other kind of object on
+which thread execution may "block". Waiting on io completion is not considered
+here, because, barring hardware errors, it will ultimately finish regardless of
+any other threads and locks in the system (This only holds if io completion
+handlers don't acquire locks themselves.).
+
+-------------------------------LOCKS IN REISER4---------------------------------
+
+Reiser4 introduces following locks:
+
+1.  Per-super-block tree spin lock                              (tree_lock*)
+
+2.  Per-super-block delimiting key spin lock                    (dk_lock*)
+
+3.  Per-jnode spin lock                                         (jnode_lock*)
+
+4.  Per-znode lock with deadlock detection                      (longterm_lock)
+
+5.  Per-reiser4-inode spin lock                                 (inode_guard*)
+
+6.  Per-atom spin lock                                          (atom_lock*)
+
+7.  Per-transaction-handle spin lock                            (txnh_lock*)
+
+8.  Per-transaction-manager spin lock                           (txnmgr_lock*)
+
+9.  Per-lock-stack spin-lock                                    (stack_lock*)
+
+10. Per-inode read-write lock                                   (inode_rw_lock)
+
+11. Per-super-block spin lock                                   (super_guard*+)
+
+12. Per-flushing-thread spin lock                               (ktxnmgrd_lock)
+
+13. Global lnode hash table lock                                (lnode_guard+)
+
+14. Per-super-block cbk cache spin lock                         (cbk_guard)
+
+15. Per-jnode spin lock used by debugging code to access and
+    modify check sum                                            (cksum_guard+)
+
+16. Per-super-block oid map spin lock                           (oid_guard+)
+
+17. Per-super-block spin lock used by "test" disk format plugin to serialize
+    block allocation                                            (test_lock+)
+
+18. Per-condition-variable spin lock                            (kcond_lock+)
+
+19. Single spin lock used to serialize fake block allocation    (fake_lock+)
+
+20. Single spin lock used to serialize calls to reiser4_panic   (panic_guard+)
+
+21. Single spin lock used by debugging code to keep track of all active
+    reiser4_context instances                                   (contexts_lock+)
+
+22. Per-lnode condition variable used by wait for completion of "incompatible
+    access mode"                                                (lnode_kcond)
+
+23. Per-flushing-thread condition variable for startup waiting  (ktxnmgrd_start)
+
+24. Per-flushing-thread condition variable                      (ktxnmgrd_wait)
+
+25. Per-lock-stack wakeup semaphore                             (stack_sema)
+
+26. Per-super-block flush serializing semaphore                 (flush_sema)
+
+27. Per-transaction-manager commit semaphore                    (commit_sema)
+
+28. Per-super-block semaphore used to arbitrate use of 5%       (delete_sema)
+    reserved disk space
+
+30. Global spin lock used to serialize calls to panic           (panic_guard+)
+
+31. Global spin lock used to protect plugin set hash table      (pset_guard+)
+
+32. Global spin lock used to protect phash hash table           (phash_guard+)
+
+33. Per-bitmap-block semaphore used to serialize bitmap loading (bnode_sema+)
+
+34. Per-super-block epoch lock, protecting updates to           (epoch_lock*)
+    znode_epoch field, used to implement seals (seal.[ch])
+    efficiently.
+
+35. Per-atom "event". This is not really lock. Rather, this is an event
+    signaled each time atom changes its state.                  (atom_event)
+
+36. Per-znode spin lock used to protect long term locking
+    structures                                                  (zlock*)
+
+37. Per flush queue lock                                        (fq_lock*)
+
+38. Per-super-block zgen lock, protecting znode generation      (zgen*)
+    counter
+
+39. Per-jnode spin lock used to synchronize jload() with        (jload_lock*)
+    ->releasepage().
+
+40. Per-atom imaginary read-write semaphore handle_sema         (handle_sema)
+
+    let's pretend for the sake of simplicity that there is special per-atom
+    read-write semaphore that threads can claim. Call it
+    handle_sema. This semaphore is acquired on read when thread captures first
+    block and is released when thread's reiser4_context is closed. Formally
+    thread holds this semaphore on read exactly when
+    get_current_context()->trans->atom != NULL, i.e., when thread is
+    associated with atom. Logic behind introducing this imaginary semaphore is
+    that while some thread is associated with an atom (that is, keeps
+    transaction handle opened), this atom cannot commit. In particular, other
+    threads waiting on fusion with atom that is in CAPTURE_WAIT stage wait
+    until this atom commits, that is wait (at least) until there are no opened
+    transaction handles for this atom. Effectively such threads wait until
+    handle_sema is free, that is, they in some sense are trying to
+    acquire handle_sema in write mode.  So, this circumferential
+    description allows one to reduce (at least partially) problem of waiting
+    on atom fusion to the lock ordering.
+
+41. Per-super-block spin lock protecting consistency of emergency flush hash
+    table, ->eflushed, and ->eflushed_anon counters in inode, and ->flushed
+    counter in atom.                                            (eflush_guard)
+
+42. Per-super-block spin lock protecting detached directory cursors for
+    stateless readdir                                           (d_lock)
+
+99. Various locks used by the user level simulator
+
+Locks marked by (*) after label, are accessed through spin lock macros,
+defined in reiser4.h. For them, locking ordering is checked at the runtime (at
+least in the principle) when REISER4_DEBUG is on(e).
+
+Locks marked by (+) after label exist only for serializing concurrent access
+to the shared data and are not supposed to be used in conjunction with any
+other locks. They are omitted from locking ordering below to simplify the
+picture. One can imagine them to be rightmost in the ordering.
+
+All locks, spin locks, and semaphores, except for stack_sema are subject to
+normal protocol: thread that grabbed the lock will release it. stack_sema is
+described in more detail below.
+
+Also, following kernel locks are used by our code:
+
+1. Per-page lock                                                (page_lock)
+
+2. Per-page writeback bit                                       (page_write)
+
+3. Per-inode semaphore                                          (i_sem)
+
+4. Per-inode I_LOCK bit-lock                                    (I_LOCK)
+
+Thread also can block on the following "objects" that are not really locks:
+
+1. Page fault                                                   (pfault)
+
+2. Memory allocation                                            (kalloc)
+
+3. Dirtying a page (through balance_dirty_pages())              (page_dirty)
+
+----------------------------------LOCK SCOPE------------------------------------
+
+Section describing what data are protected by what locks. TBD.
+
+----------------------------------INVARIANTS------------------------------------
+
+Invariants are some (formal or informal) properties of data structures. For
+example, for well-formed doubly linked list, following holds:
+
+item->next->prev == item && item->prev->next == item
+
+In most cases, invariants only hold under proper locks.
+
+LABEL AND DESCRIPTION                                 LOCKS
+
+[inode->eflushed]                                     inode_guard
+
+    inode->eflushed > 0, iff there are emergency flushed jnodes belonging to
+    this inode. Also, each emergency flushed jnode is counted as increase in
+    inode->i_count.
+
+[cbk-cache-invariant]                                 cbk_guard
+
+    If cbk cache is traversed in LRU order, first go all used slots (with
+    slot->node != NULL), then, all unused. All used slots have unique
+    slot->node. (Checked by cbk_cache_invariant().)
+
+[znode-fake]                                          jnode_lock, tree_lock
+
+    /* fake znode doesn't have a parent, and */
+    znode_get_level(node) == 0 => znode_parent(node) == NULL, and
+    /* there is another way to express this very check, and */
+    znode_above_root(node)     => znode_parent(node) == NULL, and
+    /* it has special block number, and */
+    znode_get_level(node) == 0 => *znode_get_block(node) == FAKE_TREE_ADDR, and
+    /* it is the only znode with such block number, and */
+    !znode_above_root(node) && znode_is_loaded(node) =>
+                                  *znode_get_block(node) != FAKE_TREE_ADDR
+    /* it is parent of the tree root node */
+    znode_is_true_root(node)   => znode_above_root(znode_parent(node))
+
+    (Checked by znode_invariant_f().)
+
+[znode-level]                                         jnode_lock, tree_lock
+
+    /* level of parent znode is one larger than that of child, except for the
+       fake znode */
+    znode_parent(node) != NULL && !znode_above_root(znode_parent(node)) =>
+                znode_get_level(znode_parent(node)) == znode_get_level(node) + 1
+    /* left neighbor is at the same level, and */
+    znode_is_left_connected(node) && node->left != NULL =>
+                znode_get_level(node) == znode_get_level(node->left)
+    /* right neighbor is at the same level */
+    znode_is_right_connected(node) && node->right != NULL =>
+                znode_get_level(node) == znode_get_level(node->right)
+
+    (Checked by znode_invariant_f().)
+
+[znode-connected]
+
+     /* ->left, ->right pointers form a valid list and are consistent with
+     JNODE_{LEFT,RIGHT}_CONNECTED bits */
+
+     node->left != NULL => znode_is_left_connected(node)
+     node->right != NULL => znode_is_right_connected(node)
+     node->left != NULL =>
+		      znode_is_right_connected(node->left) &&
+		      node->left->right == node
+     node->right != NULL =>
+		      znode_is_left_connected(node->right) &&
+		      node->right->left == node
+
+[znode-c_count]                                       jnode_lock, tree_lock
+
+    /* for any znode, c_count of its parent is greater than 0, and */
+    znode_parent(node) != NULL && !znode_above_root(znode_parent(node)) =>
+                atomic_read(&znode_parent(node)->c_count) > 0, and
+    /* leaves don't have children */
+    znode_get_level(node) == LEAF_LEVEL => atomic_read(&node->c_count) == 0
+
+    (Checked by znode_invariant_f().)
+
+[znode-modify]                                        zlock_lock(read),
+                                                      jnode_lock, tree_lock
+
+    /* if znode is not write-locked, its checksum remains
+     * invariant */
+	!znode_is_wlocked(node) => znode_at_read(node)
+
+    (Checked by znode_invariant_f().)
+
+[znode-refs]                                          jnode_lock, tree_lock
+
+    /* only referenced znode can be long-term locked */
+    znode_is_locked(node) => atomic_read(&ZJNODE(node)->x_count) != 0
+
+    (Checked by znode_invariant_f().)
+
+[jnode-oid]                                           jnode_lock, tree_lock
+
+    /* for unformatted node ->objectid and ->mapping fields are
+     * consistent */
+    jnode_is_unformatted(node) && node->key.j.mapping != NULL =>
+        node->key.j.objectid == get_inode_oid(node->key.j.mapping->host)
+
+    (Checked by znode_invariant_f().)
+
+[jnode-refs]                                          jnode_lock, tree_lock
+
+    /* only referenced jnode can be loaded */
+    atomic_read(&node->x_count) >= node->d_count
+
+    (Checked by jnode_invariant_f().)
+
+[jnode-dirty]                                         jnode_lock, tree_lock
+
+    /* dirty jnode is part of atom */
+    jnode_is_dirty(node) => node->atom != NULL
+
+    (Checked by jnode_invariant_f().)
+
+[jnode-queued]                                         jnode_lock, tree_lock
+
+    /* only relocated node can be queued, except that when znode
+     * is being deleted, its JNODE_RELOC bit is cleared */
+    JF_ISSET(node, JNODE_FLUSH_QUEUED) =>
+		      JF_ISSET(node, JNODE_RELOC) || JF_ISSET(node, JNODE_HEARD_BANSHEE)
+
+    (Checked by jnode_invariant_f().)
+
+[jnode-atom-valid]                                     jnode_lock, tree_lock
+
+    /* node atom has valid state */
+    node->atom != NULL => node->atom->stage != ASTAGE_INVALID
+
+    (Checked by jnode_invariant_f().)
+
+[jnode-page-binding]                                    jnode_lock, tree_lock
+
+    /* if node points to page, it points back to node */
+    node->pg != NULL => node->pg->private == node
+
+    (Checked by jnode_invariant_f().)
+
+[sb-block-counts]                                     super_guard
+
+	reiser4_block_count(super) = reiser4_grabbed_blocks(super) +
+                                 reiser4_free_blocks(super) +
+                                 reiser4_data_blocks(super) +
+                                 reiser4_fake_allocated(super) +
+                                 reiser4_fake_allocated_unformatted(super) +
+                                 reiser4_flush_reserved(super)
+
+    (Checked by check_block_counters().)
+
+[sb-grabbed]                                          super_guard
+
+    reiser4_grabbed_blocks(super) equals the sum of ctx->grabbed_blocks for
+    all grabbed contexts
+
+[sb-fake-allocated]                                   txnmgr_lock, atom_lock
+
+    When all atoms and transaction manager are locked,
+    reiser4_flush_reserved(super) equals the sum of atom->flush_reserved for
+    all atoms.
+
+[tap-sane]
+
+    tap->mode is one of {ZNODE_NO_LOCK, ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}, and
+	tap->coord != NULL, and
+	tap->lh != NULL, and
+	tap->loaded > 0 => znode_is_loaded(tap->coord->node), and
+	tap->coord->node == tap->lh->node
+
+    (Checked by tap_invariant().)
+
+--------------------------------LOCK ORDERING-----------------------------------
+
+Lock ordering for kernel locks is taken from mm/filemap.c. Locks can be taken
+from the left to the right. Locks on the same indentation level are unordered
+with respect to each other. Any spin lock is righter than any long term lock,
+obviously.
+
+i_sem
+..inode_rw_lock <-------DEAD1-----+
+....handle_sema                   |
+......I_LOCK                      |
+......delete_sema                 |
+......flush_sema                  |
+........atom_event                |
+........longterm_lock <---DEAD2-+ |
+......commit_sema               | |
+..........page_lock             | |
+............pfault              | |
+..............mm->mmap_sem------+-+                   [do_page_fault]
+..................ktxnmgrd_lock
+................mapping->i_shared_sem
+................kalloc
+....................inode_guard
+......................d_lock
+....................txnmgr_lock
+......................atom_lock
+..........................super_guard
+........................jnode_lock            [->vm_writeback()->jget()]
+................................eflush_guard
+..........................txnh_lock
+............................zlock
+........................fq_lock
+..............................stack_lock
+..................dk_lock
+..............................tree_lock
+................................cbk_guard
+................................epoch_lock
+................................zgen_lock
+..........................jload_lock
+....................mm->page_table_lock
+......................mapping->private_lock
+........................swaplock
+..........................swap_device_lock
+..........................&inode_lock
+............................&sb_lock
+............................mapping->page_lock
+..............................zone->lru_lock
+                  ^
+                  +-- spin locks are starting here. Don't schedule rightward.
+
+NOT FINISHED.
+
+..............&cache_chain_sem
+......................cachep->spinlock
+......................zone->lock
+
+page_dirty
+....&inode_lock
+....&sb_lock
+....mapping->page_lock [mpage_writepages]
+..page_lock
+..longterm_lock        [__set_page_dirty_buffers->__mark_inode_dirty]
+
+Nice and clear picture with all reiser4 locks totally ordered, right?
+
+Unfortunately, it is not always possible to adhere to this ordering. When it
+is necessary to take locks in "decreasing" order, standard trylock-and-repeat
+loop is employed. See:
+
+   atom_get_locked_with_txnh_locked(),
+   atom_get_locked_by_jnode(),
+   atom_free(), and
+   jnode_lock_page()
+
+functions for examples of this.
+
+The only exception from the above locking order is when thread wants to lock
+object it has just created and hasn't yet announced to other threads (by means
+of placing it into some shared data structure like hash table or list). There
+is special spin lock macro spin_lock_foo_no_ord() defined in reiser4.h for
+this purpose.
+
+pfault and kalloc are something special: when page fault occurs at a page
+mmapped from a reiser4 file, reiser4_readpage() is invoked that
+starts taking locks from the very beginning.
+
+DEAD1
+
+   Scenario:
+
+      process has mmapped reiser4 regular file and then does write(2) into
+      this file from buffer that is in mmaped area. copy_from_user() causes
+      page fault:
+
+         sys_write()
+           reiser4_write()
+             unix_file_write() [inode_rw_lock]
+                         .
+                         .
+                         .
+                 __copy_from_user()
+                         .
+                         .
+                         .
+                   handle_page_fault()
+                     handle_mm_fault()
+                       handle_pte_fault()
+                         do_no_page()
+                           unix_file_filemap_nopage() [inode_rw_lock]
+
+   This is safe, because inode_rw_lock is read-taken by both read/write and
+   unix_file_filemap_nopage(). It is only write-taken during tail<->extent
+   conversion and if file is mmaped it was already converted to extents.
+
+DEAD2
+
+   is safe, because copy_from_user is used only for tails and extents:
+
+    . extent: extent_write_flow() releases longterm_lock before calling
+      copy_from_user.
+
+    . tail: during copying into tail, only node containing this tail is long
+      term locked. It is easy to see, that ->readpage serving page fault (that
+      is, readpage for unformatted data) will never attempt to lock said node.
+
+When memory allocation tries to free some memory it
+
+1. asynchronously launches kswapd that will ultimately call
+   reiser4_writepage().
+
+2. calls reiser4_writepage() synchronously.
+
+----------------------------------LOCK PATTERNS---------------------------------
+
+This section describes which lock sequences are held where in the code. This
+places restrictions on modifications to the lock ordering above and enumerates
+pieces of the code that should be revised if modification of the lock ordering
+is necessary.
+
+flush_sema
+
+    jnode_flush()
+
+        to serialize flushing. This behavior can be disabled with mtflush
+        mount option.
+
+atom_lock->jnode_lock
+
+    uncapture_block()
+
+atom_lock->tree_lock && jnode_lock && page_lock
+
+    uncapture_block() calls jput()
+
+delete_sema
+
+    common_unlink(), shorten_file()->unlink_check_and_grab()
+
+        to serialize access to reserved 5% of disk only used by unlinks. (This
+        is necessary so that it is always possible to unlink something and
+        free more space on file-system.)
+
+delete_sema->flush_sema || commit_sema
+
+    reiser4_release_reserved() calls txnmgr_force_commit_current_atom() under
+    delete_sema
+
+inode_rw_lock->delete_sema
+
+    unix_file_truncate()->shorten_file() takes delete_sema from under write
+    mode of inode_rw_lock
+
+kalloc->jnode_lock
+
+    emergency_flush() takes jnode spin lock
+
+jnode_lock->(mapping->page_lock)
+
+    jnode_set_dirty()->__set_page_dirty_nobuffers()
+
+jnode_lock->(zone->lru_lock)
+
+    jnode_set_dirty()->mark_page_accessed()
+
+
+I_LOCK->longterm_lock
+
+    reiser4_iget()
+
+tree_lock->epoch_lock
+
+    zget() calls znode_build_version()
+
+jnode_lock->stack_lock
+
+    longterm_lock_znode(), longterm_unlock_znode(), wake_up_all_lopri_owners()
+
+tree_lock->cbk_guard
+
+    znode_remove() calls cbk_cache_invalidate()
+
+zlock->stack_lock
+
+    wake_up_all_lopri_owners()
+
+atom->stack_lock
+
+    check_not_fused_lock_owners()
+
+txnh->stack_lock
+
+    check_not_fused_lock_owners()
+
+jnode_lock->jload_lock
+
+    reiser4_releasepage(), emergency_flush(). But this can actually be made
+    other way around.
+
+jnode_lock->eflush_guard
+
+    eflush_add(), eflush_del()
+
+atom_lock->super_guard
+
+    grabbed2flush_reserved_nolock()
+
+inode_guard->d_lock
+
+    detach_fsdata()
+
+----------------------------------DEADLOCKS-------------------------------------
+
+Big section describing found/possible/already-worked-around deadlocks.
+
+1. Locking during tree traversal.
+
+2. Locking during balancing.
+
+3. Locking during squalloc.
+
+4. Page locking.
+
+5. Atom fusion.
+
+Please, fill gaps up.
+
+TBD.
+
+2002.09.19. Nikita.
+
+--------------------------------------------------------------------------------
+
+^ Local variables:
+^ mode-name: "Memo"
+^ indent-tabs-mode: nil
+^ tab-width: 4
+^ eval: (progn (flyspell-mode) (flyspell-buffer))
+^ End:
Index: linux-2.6.8.1-ck/fs/reiser4/doc/lock-ordering.dot
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/doc/lock-ordering.dot	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/doc/lock-ordering.dot	2004-08-22 19:35:33.628651582 +1000
@@ -0,0 +1,276 @@
+/* this is dot(1) input file for lock-ordering diagram */
+/* it should be passed through C preprocessor first */
+/* cpp -P -DFITPAGE lock-ordering.dot | tred | dot -Tps | gv -media a4 - */
+
+#define CATTR fontsize=14, fontname=Helvetica
+#define NATTR CATTR
+#define EATTR CATTR
+
+#define SYSATTR color=yellow, style=filled
+#define PSEUDOATTR color=pink, style=filled, peripheries=2
+
+#define LONGATTR shape=ellipse
+#define SPINATTR shape=box
+
+#define CONDATTR color=blue, peripheries=2, LONGATTR
+
+#define MARKLONG(name) name -> schedulable [style=invis, weight=0]
+
+#define SYSLONG(name, l) name [label=l, NATTR, LONGATTR, SYSATTR]; MARKLONG(name)
+#define SYSPSEUDO(name) name [NATTR, LONGATTR, PSEUDOATTR]; MARKLONG(name)
+#define RLONG(name) name [NATTR, LONGATTR]; MARKLONG(name)
+
+#define RCOND(name, l) name [label=l, NATTR, CONDATTR]; MARKLONG(name)
+
+#define MARKSPIN(name) schedulable -> name [style=invis, weight=0]
+
+#define SYSSPIN(name, l) name [label=l, NATTR, SYSATTR, SPINATTR]; MARKSPIN(name)
+#define RSPIN(name) name [NATTR, SPINATTR]; MARKSPIN(name)
+
+#define ARC(from, to, func, ...) from -> to [EATTR, label=func, ## __VA_ARGS__]
+
+digraph locks {
+
+//clusterrank=none
+#if defined(FITPAGE)
+size="7.5, 10.5";
+ratio=compress;
+center=true;
+#endif
+
+subgraph long {
+	/* reiser4 long term locks */
+	RLONG(longterm_lock);
+	RLONG(inode_rw_lock);
+	RLONG(stack_sema);
+	RLONG(flush_sema);
+	RLONG(commit_sema);
+	RLONG(delete_sema);
+    /* txncommit is a synonym for flush_sema and commit_sema */
+	txncommit [LONGATTR, PSEUDOATTR]; MARKLONG(txncommit);
+	txncommit -> flush_sema [style=dotted, dir=both];
+	txncommit -> commit_sema [style=dotted, dir=both];
+
+    /* atom_event is not really a lock: you can wait on it, but cannot "own"
+       it. */
+	RCOND(atom_event,atom_event);
+
+	//RLONG(lnode_kcond);
+	//RLONG(ktxnmgrd_start);
+	//RLONG(ktxnmgrd_wait);
+	//RLONG(bnode_sema);
+
+	/* pseudo locks */
+	SYSPSEUDO(pfault);
+	SYSPSEUDO(kalloc);
+	SYSPSEUDO(schedulable);
+
+	/* system long term locks */
+	SYSLONG(page_write, page_write);
+	SYSLONG(mm_mmap_sem, "mm->mmap_sem");
+	SYSLONG(mapping_i_shared_sem, "mapping->i_shared_sem");
+
+	SYSLONG(i_sem, i_sem);
+	SYSLONG(page_lock, page_lock);
+	SYSLONG(cache_chain_sem, "&cache_chain_sem");
+	SYSLONG(I_LOCK, "I_LOCK");
+
+	SYSLONG(namespace_sem, "namespace->sem");
+	// SYSLONG(bdev_bd_sem, "bdev->bd_sem");
+	SYSLONG(sb_s_lock, "sb->s_lock");
+	SYSLONG(sb_s_umount, "sb->s_umount");
+}
+
+subgraph spin {
+
+	/* reiser4 spin locks */
+
+	RSPIN(tree_lock);
+	RSPIN(dk_lock);
+	RSPIN(jnode_lock);
+	RSPIN(inode_guard);
+	RSPIN(atom_lock);
+	RSPIN(txnh_lock);
+	RSPIN(txnmgr_lock);
+	RSPIN(ktxnmgrd_lock);
+	RSPIN(cbk_guard);
+	RSPIN(epoch_lock);
+	RSPIN(zgen_lock);
+	RSPIN(stack_lock);
+	RSPIN(zlock);
+	RSPIN(fq_lock);
+	RSPIN(jload_lock);
+	RSPIN(super_guard);
+    RSPIN(eflush_guard);
+    RSPIN(d_lock);
+
+	//RSPIN(stack_lock);
+	//RSPIN(lnode_guard);
+	//RSPIN(cksum_guard);
+	//RSPIN(oid_guard);
+	//RSPIN(test_lock);
+	//RSPIN(kcond_lock);
+	//RSPIN(fake_lock);
+	//RSPIN(panic_guard);
+	//RSPIN(contexts_lock);
+	//RSPIN(pset_guard);
+	//RSPIN(phash_guard);
+
+	/* system spin locks */
+	SYSSPIN(bkl, "BKL");
+	SYSSPIN(cachep_spinlock, "cachep->spinlock");
+	SYSSPIN(zone_lock, "zone->lock");
+	SYSSPIN(swaplock, "&swaplock");
+	SYSSPIN(zone_lru_lock, "zone->lru_lock");
+	SYSSPIN(mapping_private_lock, "mapping->private_lock");
+	SYSSPIN(mapping_page_lock, "mapping->page_lock");
+	SYSSPIN(inode_lock, "&inode_lock");
+	SYSSPIN(swap_device_lock, "swap->device_lock");
+	SYSSPIN(mm_page_table_lock, "mm->page_table_lock");
+	SYSSPIN(sb_lock, "&sb_lock");
+	SYSSPIN(page_chain_lock, "page->chain_lock");
+    //removed at 2003.04.04 by akpm@digeo.com
+	//SYSSPIN(dparent_lock, "dparent_lock");
+	SYSSPIN(dcache_lock, "dcache_lock");
+	SYSSPIN(fs_struct_lock, "fs_struct->lock");
+	SYSSPIN(tasklist_lock, "&tasklist_lock");
+	SYSSPIN(sig_siglock, "sig->siglock");
+	SYSSPIN(fown_lock, "fown->lock");
+	SYSSPIN(task_switch_lock, "task->switch_lock");
+	SYSSPIN(task_proc_lock, "task->proc_lock");
+	SYSSPIN(task_alloc_lock, "task->alloc_lock");
+	/* rq->lock is special: it can be unlocked by thread different from locker */
+	SYSSPIN(rq_lock, "rq->lock");
+	SYSSPIN(task_capability_lock, "&task_capability_lock");
+    SYSSPIN(mmlist_lock, "&mmlist_lock");
+	SYSSPIN(files_file_lock, "files->file_lock");
+	SYSSPIN(dn_lock, "&dn_lock");
+	//SYSSPIN(bdev_lock, "&bdev_lock");
+	SYSSPIN(suspend_pagedir_lock, "&suspend_pagedir_lock")
+}
+
+/* dependencies */
+
+ARC(inode_guard, tree_lock, "update_sd_at()");
+ARC(inode_guard, jnode_lock, "update_sd_at()");
+ARC(inode_guard, atom_lock, "update_sd_at()");
+ARC(atom_lock, jnode_lock, "uncapture_block()"); //capture_fuse_jnode_lists()
+ARC(jnode_lock, txnh_lock, "try_capture_block()");
+//already covered
+ARC(atom_lock, txnh_lock, "capture_fuse_txnh_lists()");
+ARC(jnode_lock, tree_lock, "jdrop_in_tree()");
+ARC(tree_lock, cbk_guard, "cbk_cache_invalidate()");
+ARC(dk_lock, tree_lock, "sync_dkeys()");
+ARC(txnmgr_lock, atom_lock, "atom_dec_and_unlock()"); //txnmgr_force_commit_all(),\ncommit_some_atoms(),\nflush_one_atom()");
+ARC(txnmgr_lock, jnode_lock, "atom_begin_andlock()");
+ARC(txnmgr_lock, txnh_lock, "atom_begin_andlock()");
+ARC(i_sem, inode_rw_lock, "unix_file_setattr()");//,\nunix_file_write()");
+ARC(page_lock, i_sem, "reiserfs_unpack()");
+ARC(inode_rw_lock, delete_sema, "shorten()");
+//ARC(delete_sema, txncommit, "reiser4_release_reserved()");
+ARC(flush_sema, longterm_lock, "flush_scan_left()");//,\nflush_allocate_znode_update(),\nflush_scan_formatted(),\nflush_pos_to_child_and_alloc()");
+ARC(longterm_lock, page_lock, "cbk_level_lookup()");
+ARC(commit_sema, page_lock, "submit_write()");
+ARC(pfault, mm_mmap_sem, "handle_page_fault()");
+ARC(page_lock, pfault, "extent_write_flow()");
+ARC(mm_mmap_sem, kalloc, "unix_file_readpage()");
+
+//ARC(inode_rw_lock, mm_mmap_sem, "unix_file_filemap_nopage()", style=dotted, dir=back);
+//ARC(mm_mmap_sem, kalloc, "DEAD2", style="dotted");
+ARC(kalloc, jnode_lock, "emergency_flush()");
+ARC(longterm_lock, jnode_lock, "longterm_unlock_znode()");//,\nflush_allocate_znode()");
+
+ARC(kalloc, inode_guard, "eflush_add()");
+ARC(ktxnmgrd_lock, txnmgr_lock, "commit_some_atoms()");
+
+//already covered
+ARC(mapping_i_shared_sem, mapping_private_lock, "__set_page_dirty_buffers()");
+//already covered
+ARC(mapping_i_shared_sem, mapping_page_lock, "");
+ARC(mapping_i_shared_sem, mm_page_table_lock, "vma_link()");
+
+ARC(inode_lock, mapping_page_lock, "__sync_single_inode()");
+ARC(inode_lock, sb_lock, "writeback_inodes()");
+
+ARC(mm_page_table_lock, swap_device_lock, "try_to_unmap_one()");
+ARC(mm_page_table_lock, mapping_private_lock, "try_to_unmap_one()");
+//already covered
+ARC(mm_page_table_lock, mapping_page_lock, "try_to_unmap_one()");
+
+ARC(mm_mmap_sem, mapping_i_shared_sem, "do_mmap_pgoff()");
+
+ARC(swaplock, swap_device_lock, "swap_info_get()");
+ARC(swap_device_lock, mapping_page_lock, "exclusive_swap_page()");
+
+ARC(page_lock, page_chain_lock, "shrink_list()");
+ARC(mm_page_table_lock, page_chain_lock, "page_add_rmap()");//,\npage_remove_rmap()");
+ARC(mapping_page_lock, zone_lru_lock, "add_to_page_cache()");//,\nfilemap_fdatawait()");
+ARC(mm_page_table_lock, zone_lru_lock, "page_add_rmap()");//,\npage_remove_rmap()");
+ARC(zone_lru_lock, page_chain_lock, "rmap.c");
+
+ARC(cache_chain_sem, kalloc, "cpuup_callback()");
+//ARC(cache_chain_sem, pfault, "kmem_cache_create()");
+
+//obsolete ARC(dcache_lock, dparent_lock, "d_move()");
+ARC(fs_struct_lock, dcache_lock, "set_fs_pwd()");//,\nset_fs_root()");
+
+ARC(namespace_sem, i_sem, "sys_pivot_root()");
+
+ARC(sb_s_lock, txncommit, "reiser4_write_super()");
+ARC(sb_s_umount, txncommit, "reiser4_kill_super()");
+
+ARC(task_switch_lock, rq_lock, "finish_arch_switch()");
+ARC(task_proc_lock, tasklist_lock, "unhash_process()"); // de_thread()
+ARC(task_proc_lock, dcache_lock, "proc_pid_unhash()");
+
+ARC(tasklist_lock, sig_siglock, "de_thread()");//,\ndo_notify_parent(),\nsys_tkill(),\ncopy_process()"); //collect_sigign_sigcatch(),\n__exit_sighand(),\nfreeze_processes()
+ARC(dn_lock, fown_lock, "__inode_dir_notify()");
+ARC(fown_lock, tasklist_lock, "send_sigio()");//,\nsend_sigurg()");
+ARC(tasklist_lock, task_alloc_lock, "chroot_fs_refs()");
+ARC(tasklist_lock, rq_lock, "setscheduler()");
+ARC(task_capability_lock, tasklist_lock, "sys_capget()");//,\nsys_capset()");
+ARC(task_alloc_lock, files_file_lock, "match_comm()");//,\nmatch_pid()");
+
+ARC(mmlist_lock, mm_page_table_lock, "unuse_process()");
+
+ARC(tree_lock, zone_lock, "page_clear_jnode()");//,\njrelse_nolock()");
+ARC(tree_lock, zone_lru_lock, "page_clear_jnode()");//,\njrelse_nolock()");
+ARC(tree_lock, mapping_page_lock, "jdrop_in_tree()");
+ARC(tree_lock, epoch_lock, "zget()");
+ARC(tree_lock, zgen_lock, "zget()");
+
+ARC(bkl, inode_lock, "iget()");
+
+ARC(jnode_lock, mapping_page_lock, "jnode_set_dirty()");
+ARC(jnode_lock, zone_lru_lock, "jnode_set_dirty()");
+
+ARC(I_LOCK, longterm_lock, "reiser4_iget()");
+
+//one cannot wait for atom event keeping longterm lock
+ARC(atom_event, longterm_lock, "flush");
+//one cannot wait for atom event keeping page lock
+ARC(atom_event, page_lock, "jnode_extent_write()");
+ARC(zlock, stack_lock, "longterm_lock_znode()");//,\nlongterm_unlock_znode(), wake_up_all_lopri_owners()");
+
+ARC(atom_lock, stack_lock, "check_not_fused_lock_owners()");//atom_send_event()
+ARC(txnh_lock, stack_lock, "check_not_fused_lock_owners()");
+ARC(fq_lock, stack_lock, "wakeup_atom_waitfor_list()");
+ARC(atom_lock, fq_lock, "detach_fq()");
+ARC(jnode_lock, zlock, "check_not_fused_lock_owners()");
+ARC(txnh_lock, zlock, "check_not_fused_lock_owners()");
+
+ARC(suspend_pagedir_lock, zone_lock, "do_magic_suspend_2()");
+ARC(cachep_spinlock, zone_lock, "cache_flusharray()");
+
+ARC(mapping_page_lock, zone_lock, "add_to_page_cache()"); // find_lock_page
+ARC(mapping_page_lock, zone_lru_lock, "add_to_page_cache()"); // find_lock_page
+ARC(mm_page_table_lock, zone_lock, "try_to_unmap_one()"); // get_user_pages, do_wp_page, do_anonymous_page, do_no_page
+ARC(mm_page_table_lock, zone_lru_lock, "try_to_unmap_one()"); // get_user_pages, do_wp_page, do_anonymous_page, do_no_page
+ARC(jnode_lock, zone_lock, "page_clear_jnode()"); // uncapture_page, extent_write_flow
+ARC(jnode_lock, zone_lru_lock, "page_clear_jnode()"); // uncapture_page, extent_write_flow
+ARC(jnode_lock, jload_lock, "reiser4_releasepage()");
+ARC(atom_lock, super_guard, "grabbed2flush_reserved_nolock()");
+
+ARC(jnode_lock, eflush_guard, "eflush_add()");
+ARC(inode_guard, d_lock, "detach_fsdata()");
+}
Index: linux-2.6.8.1-ck/fs/reiser4/doc/metadata-in-pagecache
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/doc/metadata-in-pagecache	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/doc/metadata-in-pagecache	2004-08-22 19:35:33.629651422 +1000
@@ -0,0 +1,57 @@
+Hello,
+
+In upcoming reiser4 we are planning to use page cache to store all file system
+meta data. In some cases it is straightforward; for example, bitmap blocks,
+placed on the disk at (almost) equal intervals, ask to be bound to a special
+fake inode and indexed by their disk offsets.
+
+There is one important (most important actually) case where using fake inode
+is inconvenient: blocks of internal balanced tree used by reiser4, known as
+"formatted nodes". Natural solution of using block number as offset within
+some fake inode doesn't pass, because when block size is smaller than page
+some blocks mapped to the same page may be either occupied by something other
+than formatted nodes, or just be free.
+
+This leads to the following complications:
+
+ 1. we cannot simply use block_{read|write}_full_page(), because this will
+ waste IO bandwidth: block that doesn't contain formatted node will be read
+ into memory. Moreover, this block can be later read again, for example,
+ because this is data block of some file and hashed into different place in
+ the page cache, creating alias. This will definitely confuse buffer cache;
+
+ 2. even if we keep track of what blocks have to be actually read, there still
+ will be "internal memory fragmentation", because some parts of page cache
+ pages will be unused.
+
+In brief, formatted nodes form a tree and because of this don't fit into
+<inode, offset> hashing scheme---there is no linear ordering among them.
+
+Moreover, formatted node is never looked up in the page cache by its block
+number, because for each formatted node in memory there is special data
+structure (znode) and znodes are hashed in the hash table anyway.
+
+So, all functionality that we need from the page cache is memory allocator
+with attached memory pressure hooks (I guess, this is close to what Hans
+called "sub-cache" in lkml discussions on this topic).
+
+It seems that we have two solutions:
+
+ 1. change page cache to use different indexing for formatted nodes;
+
+ 2. implement our own memory allocator sitting directly on the top of
+ alloc_pages() and installing proper ->mapping for pages that it grabs.
+
+(2) will only work if generic VM code (e.g., shrink_cache() or
+page_launder_zone() in rmap VM) doesn't depend on particulars of page cache
+hashing, which, fortunately, seems to be the case. This approach has following
+advantages:
+
+ . we can try to collocate related blocks on the same page, for example
+ blocks from the same transaction, or blocks with similar cache "hotness";
+
+ . we can use blocks larger than page size.
+
+Nikita.
+
+
Index: linux-2.6.8.1-ck/fs/reiser4/doc/oid-locid
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/doc/oid-locid	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/doc/oid-locid	2004-08-22 19:35:33.629651422 +1000
@@ -0,0 +1,108 @@
+MIME-Version: 1.0
+Content-Type: text/plain; charset=us-ascii
+Content-Transfer-Encoding: 7bit
+Message-ID: <15392.39020.573047.826769@laputa.namesys.com>
+Date: Wed, 19 Dec 2001 16:38:52 +0300
+To: Reiserfs developers mail-list <Reiserfs-Dev@Namesys.COM>
+Subject: [RFC]: objectids and localities management
+X-Mailer: VM 6.96 under 21.4 (patch 3) "Academic Rigor" XEmacs Lucid
+FCC: ~/documents/mail/outgoing
+--text follows this line--
+Hello,
+
+there is one thing that seems awkward in current reiser{fs|4} design: in
+a key we have both locality id (locid) and object id (oid). This is
+slightly illogical because oid alone is unique, but we cannot find an
+object given oid. This was, by the way, main reason behind our NFS
+troubles. So, why is this strictly necessary? I'll try to reason from
+the "first principles". Following account doesn't pretend to be of any
+historical accuracy of course.
+
+1. In a data structure we use to store objects (tree) items
+   with close keys are packed into the same disk block. This means that
+   we cannot completely separate key allocation from block
+   allocation. That is,
+
+      - tree forces us to encode disk location preferences in a key. (A1)
+
+2. If we cannot completely separate key and block allocation let's try
+   instead to blend them together. That is, we rely on block allocator
+   to follow tree ordering and topology: blocks containing items with
+   close keys are allocated close on disk and blocks contiguous in tree
+   order are more or less contiguous on disk. How far bitmap.c fulfills
+   or can fulfill these goals is out of the scope of this discussion,
+
+      - let's suppose that we have ideal block allocator. (A2)
+
+3. Given this, why cannot we encode disk location preferences in oid
+   alone? Because oid has to be unique and we cannot predict how many
+   objects we are going to group together in a future (how many objects
+   there will be in a directory that is). That is, suppose we create two
+   directories "a" and "b" in succession. If oid were the only thing to
+   store location preference, then we should leave after the oid of "a"
+   enough unused oids for all objects within "a", but we don't know how
+   many of them will be there.
+
+4. To solve this (locid, oid) scheme was born. It has following
+   advantages:
+
+      - it is simple to implement
+      - it allows one to encode enough location preference into the key (A3)
+
+But the more people used reiserfs and the more files they started to
+store in a single directory, the less valid (A3) became. oid became
+inadequate location preference, because while it allows to separate
+files from different directories it doesn't allow to order files within
+single directory. For example readdir of big directory is slow, because
+files are not sorted within directory. Various ad-hoc solutions have
+been proposed (oid==hash, add "band" to oid, etc), but there is obvious
+conflict between requirement that oid is unique and desire to encode
+additional information in it. In effect all such solutions amount to
+further splitting of (locid,oid) pair into (locid, someid, oid) for the
+reasons similar to those on the steps 3,4 above.
+
+The scheme proposed below tries to meet following goals:
+
+ G1. only keep unique oid in a key, thus making it possible to find file
+     given its inode number and additionally shrink key, increasing
+     fanout.
+
+ G2. allow configurable amount of fine-grained locality preference
+     information to be associated with each oid, thus allowing files
+     to be ordered in a tree according to some hierarchical "packing
+     localities", for example: first order files by oid of parent
+     directory, then by hash of name within this directory.
+
+
+Proposal:
+
+Maintain separate map (oidlocmap, implementation discussed below) from
+oid to "locpref", where locpref is additional fine-grained location
+preference data, associated with oid. For example locpref may be just
+(locid) to emulate existing behavior, or (locid, hash) or (locid,
+user-supplied-grouping-info), etc.
+
+Key only contains oid, that is, ceteris paribus, key has form
+(item-type, oid, offset). If oid is 60 bits, this is 16 bytes.
+
+Ordering of items within tree (and, given (A2), their ordering on disk)
+is completely determined by keycmp() function that compares two
+keys. Before comparing two keys, keycmp(k1, k2) consults oidlocmap and
+obtains locprefs, associated with oids of k1 and k2. locprefs then are
+"pasted" into k1 and k2, producing "expanded" keys, containing full
+location preferences information. Expanded keys are compared as usual.
+
+In simplest case oidlocmap can be implemented as normal balanced tree,
+where keys are oids (60 bits) and values locprefs. If we limit ourselves
+to fixed format of locpref (at least per file system) then we get
+standard text-book balanced tree storing values of fixed size which is
+simple to implement.
+
+There is of course overhead of maintaining oidlocmap and, especially, of
+consulting it on each keycmp(), but it looks to me that it will be not
+that significant, because oidlocmap is compact and will be outweighed
+by increased fanout in the main tree.
+
+Comments?
+
+Nikita.
Index: linux-2.6.8.1-ck/fs/reiser4/doc/page-cache-for-formatted-nodes
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/doc/page-cache-for-formatted-nodes	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/doc/page-cache-for-formatted-nodes	2004-08-22 19:35:33.630651263 +1000
@@ -0,0 +1,60 @@
+PROPOSAL:
+
+Keep formatted nodes in a page cache, binding them to the special fake inode
+and using block number divided by number of blocks in a page as page index.
+
+ADVANTAGES:
+
+Page cache is preferred over buffer cache. Much more optimization and
+scalability efforts are going into it. The fewer separate caches are in the
+system, the simpler and better VM can handle load.
+
+DISADVANTAGES:
+
+As formatted nodes are indexed by block number, each page will contain
+blocks with consecutive block numbers. This poses several problems:
+
+  1. When we need to read particular block from the disk (e.g., to load child
+  node during tree lookup), it is not clear that blocks with neighboring block
+  numbers are worth reading into memory at all.
+
+  2. Some of the blocks that have to go in the same page as block we need can
+  be unformatted ones.
+
+SOLUTIONS:
+
+There are several possible workarounds:
+
+  1. rely on the fact that in vast majority of cases block size is equal to
+  the page size. So, we can index formatted nodes by block number storing
+  exactly one block in the page. This will eliminate both problems at the
+  expense of the memory wasting in the setups where block size is smaller than
+  page size.
+
+  2. only load required block in the page marking other blocks mapped to this
+  page as up-to-date. It is not obvious that this will work at all, and in any
+  case, this will force us to use special API to access such pages, bypassing
+  VM interface.
+
+  3. rely on good repacker and load all blocks in the page hoping that they
+  are close to each other in tree order and will be accessed shortly.
+
+  4. allocate unformatted nodes such that they will never go into the same
+  frame as formatted. For example:
+
+    - always align extent to the page boundary on the disk (page is CPU
+    specific though);
+
+    - use some variation of border algorithm to separate formatted and
+    unformatted nodes;
+
+    - use "enriched" bitmap where formatted and unformatted nodes are
+    distinguishable.
+
+
+# Local variables:
+# mode-name: "proposal"
+# indent-tabs-mode: nil
+# tab-width: 4
+# eval: (if (fboundp 'flyspell-mode) (flyspell-mode))
+# End:
Index: linux-2.6.8.1-ck/fs/reiser4/doc/plugin.inheritance
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/doc/plugin.inheritance	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/doc/plugin.inheritance	2004-08-22 19:35:33.630651263 +1000
@@ -0,0 +1,119 @@
+
+				 Report about "plugin inheritance discussion"
+
+    1. Basic plugin support, psets, default plugins.
+
+    2. Plugin inheritance.
+
+    3. Meta-data inheritance, light-weight files.
+
+1. Basic plugin support, psets, default plugins.
+
+    Let's call Reiser4 file system object "active" when it is used by the
+    kernel, that is, when initialized inode exists for it. Associated with
+    each active object is its "plugin set" ("pset" for short) that is an array
+    of all plugins necessary for proper interaction with this object. Pointer
+    to pset is stored in inode. Pset is constructed when:
+
+        1. new object is created, or
+
+        2. existing object is looked up.
+
+    New object is always created as a child of some already existing
+    object. During object creation its pset is constructed on the basis of
+    parent's one---this is plugin inheritance. Details of plugin inheritance
+    are delegated to the object plugin of new object for flexibility.
+
+    File system has "default pset". In current implementation it is just pset
+    of the root directory, created by mkfs.reiser4.
+
+    When stat-data is saved to disk, pset is saved as part of stat-data. At
+    least this is what default static stat-data plugin does. More advanced
+    stat-data plugins are free to save psets separately, implement sharing,
+    etc.
+
+    As an optimization, only plugins different from default ones are stored in
+    stat-data. Correspondingly, when object is looked up, plugins found in
+    stat-data are installed into pset, and missing plugins are taken from the
+    default pset.
+
+    Plugins in pset can be divided into two types:
+
+        1. "essential"---ones that cannot be changed without some explicit
+        effort. For example, hash and fibration plugins are essential, because
+        changing them would render directory content invalid.
+
+        2. "non-essential"---plugins that can be changed implicitly. For
+        example, security plugin and formatting-policy plugin are
+        non-essential.
+
+    From previous description it is clear that essential plugins in default
+    pset cannot be modified once file system was created, because this would
+    implicitly change plugins of all objects in whose stat-data appropriate
+    plugin is missing, which is contrary to the definition of essential
+    plugin.
+
+    This poses a problem: what to do when new member is added to pset
+    (consider recent addition of fibration plugin)? And, conversely, what to
+    do when mounting a file system with unknown member in default pset?
+
+    The former is only an issue for essential plugins. When new essential
+    plugin is added to pset, backward-compatible implementation of this plugin
+    should be provided as default. That is, for example, when kernel with
+    support for fibration mounts file system without fibration plugin in the
+    root-directory stat-data, "lexicographic" fibration plugin should be
+    used. This guarantees that old file-systems can be used without corrupting
+    them. Of course, new versions of mkfs.reiser4 can set up whatever
+    fibration plugin is deemed best to be default.
+
+    "Forward-compatibility" that is, mounting a file system with
+    unknown plugin in default pset, can be simply refused.
+
+2. Plugin inheritance.
+
+    In addition to pset each active object also has a "hset"---"heir
+    set". When new child is created, it first tries to inherit plugins from
+    parent's hset, and only if plugin is missing there---from parent's
+    pset. hset is treated exactly like pset in all other respects. NOTE:
+    storing hset on disk is not yet implemented.
+
+    One question still remains to be answered: how object plugin of a child
+    being created is selected? One possible solution is to add two new members
+    PSET_CREAT, and PSET_MKDIR to the pset. They specify object plugins used
+    when child is being created through sys_creat() and sys_mkdir() system
+    calls. (Other system calls, such as sys_symlink() and sys_mknod() are too
+    specialized for such flexibility.) NOTE: this is also not yet implemented.
+
+3. Meta-data inheritance, light-weight files.
+
+    Through meta-data inheritance file system object can somehow indicate that
+    some portion of its meta-data should be taken from some place other than
+    object's stat-data. Three obvious scenarios for meta-data inheritance are:
+
+        1. meta-data are taken from file-system level default place,
+
+        2. meta-data are taken from some specially indicated place (i.e.,
+        stat-data contains a key of item(s) where meta-data have to be taken
+        from), and
+
+        3. meta-data are taken from the parent.
+
+    Note, that the last option is ambiguous, because the notion of _the_
+    parent is not well-defined in general. This can be worked around in two
+    ways:
+
+        1. only use it when there is _the_ parent, for example, disable
+        light-weight files with multiple names, or
+
+        2. don't care, for example, allow uid of light-weight file to depend
+        on path-name through which this file was reached.
+
+    In any case, meta-data inheritance can be implemented by re-using existing
+    static stat-data item plugin with simple additional plumbing in the kernel
+    code (pointer to parent inode should be passed to the stat-data
+    methods). It is not clear what to do when light-weight file is accessed
+    through NFS, and there is no parent. Simplest solution is to just disable
+    NFS access to them. This is trivial, because our ->{en,de}code_fh()
+    methods are delegated to object plugin.
+
+
Index: linux-2.6.8.1-ck/fs/reiser4/doc/readdir-problems-and-implementations
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/doc/readdir-problems-and-implementations	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/doc/readdir-problems-and-implementations	2004-08-22 19:35:33.631651104 +1000
@@ -0,0 +1,12 @@
+1.
+
+User level API.
+
+Standard
+
+^ Local variables:
+^ mode-name: "Design Document"
+^ indent-tabs-mode: nil
+^ tab-width: 4
+^ eval: (if (fboundp 'flyspell-mode) (flyspell-mode))
+^ End:
Index: linux-2.6.8.1-ck/fs/reiser4/doc/reiser4.writeback.overview
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/doc/reiser4.writeback.overview	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/doc/reiser4.writeback.overview	2004-08-22 19:35:33.631651104 +1000
@@ -0,0 +1,68 @@
+Hello,
+
+reiser4 has some features that make it somewhat difficult to integrate with
+existing VM mechanisms.
+
+Reiser4 maintains all meta data in the single balanced tree. This tree is
+maintained in the memory in the form different from what will be ultimately
+written to the disk. Roughly speaking, before writing tree node to the disk,
+some complex process ("flush") is to be performed. This process has following
+characteristics:
+
+ 1 it is not local, that is it operates on big number of nodes, possibly far
+   away from the starting node, both in tree and disk order.
+
+ 2 it can involve reading of the large number of nodes from the disk (for
+   example, bitmap nodes are read during extent allocation that is deferred
+   until flush).
+
+ 3 it can allocate unbounded amount of memory (during insertion of allocated
+   extents).
+
+ 4 it participates in the locking protocol which reiser4 uses to implement
+   concurrent tree modifications.
+
+ 5 it is CPU consuming and long
+
+As a result, flush reorganizes some part of reiser4 tree and produces large
+queue of nodes ready to be submitted for io (as a matter of fact, flush write
+clustering is so good that it used to hit BIO_MAX_PAGES all the time, until
+checks were added for this).
+
+Items (3) and (4) alone make flush unsuitable for being called directly from
+reiser4 ->vm_writeback() callback, because of OOM and deadlocks against
+threads waiting for memory.
+
+So, it was decided that flush has to be performed from the separate
+thread. Reiser4 has thread used to periodically commit old transactions and
+this thread can be used for the flushing. That is, flushing thread does flush
+and accumulates nodes prepared for the IO on the special
+queue. reiser4_vm_writeback() submits nodes from this queue, if queue is
+empty, it only wakes up flushing thread and immediately returns.
+
+Still there are some problems with integrating this stuff into VM scanning:
+
+ 1 As ->vm_writeback() returns immediately without actually submitting pages
+   for IO, throttling on PG_writeback in shrink_list() will not work. This
+   opens a possibility (on a fast CPU), of try_to_free_pages() completing
+   scanning and calling out_of_memory() before flushing thread managed to add
+   anything to the queue.
+
+ 2 It is possible, however unlikely, that flushing thread will be unable to flush
+   anything, because there is not enough memory. In this case reiser4 resorts
+   to the "emergency flush": some dumb algorithm that writes tree nodes to the
+   disk without taking locks and without optimizing tree layout.
+
+ 3 Nodes prepared for IO can be from the active list, this means that they
+   will not be met/freed by shrink_list() after IO completion. New
+   blk_congestion_wait() should help here though.
+
+It looks like we need following changes to make this stuff working:
+
+ 1 Adding ->priority field into struct writeback_control, so that file system
+   can vary its behavior depending on how desperate memory pressure is.
+
+ 2 Different mechanism for scan throttling.
+
+Actually latter can be implemented completely within reiser4 but with some
+awkwardness.
Index: linux-2.6.8.1-ck/fs/reiser4/doc/set-theoretic-stuff.tex
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/doc/set-theoretic-stuff.tex	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/doc/set-theoretic-stuff.tex	2004-08-22 19:35:33.632650944 +1000
@@ -0,0 +1,82 @@
+\documentclass[a4paper, oneside, fleqn]{article}
+
+\usepackage{latexsym}
+\usepackage{url}
+\usepackage[T2A]{fontenc}
+
+\pagestyle{empty}
+\listfiles
+\setcounter{errorcontextlines}{100}
+\makeindex
+\pagestyle{headings}
+\frenchspacing
+\tolerance=1000
+\parindent=0pt
+\raggedbottom
+\setlength\parskip{6pt}
+
+\DeclareMathAlphabet{\mathbsf}{T2A}{cmss}{b}{n}
+\SetMathAlphabet{\mathbsf}{normal}{T2A}{cmss}{b}{n}
+
+\def\qopname@#1{\mathop{\fam 0#1}\nolimits}
+\newcommand{\mathsign}[1]
+	{\index{#1@$\mathbsf{#1}$}\qopname@{\mathbsf{#1}}}
+
+\def\As{\mathsign{Assoc}}
+\newcommand{\svi}[2]
+    {\texttt{[} #1 \ #2 \texttt{]}}
+
+\begin{document}
+
+\thispagestyle{empty}
+
+%\section{Definitions}
+
+We have a set $X$ of objects, and ``associated-with'' relation. We shall write
+
+$$a\As b, \quad a\in X, \ b\in X$$
+
+to denote that $a$ is associated with $b$.
+
+One can imagine the $\As$ relation as a graph where elements of $X$ are nodes
+and where there is an arc (arrow) from $a$ to $b$ iff $a$ is associated with
+$b$. Note that no further restrictions are placed on $\As$. In particular, it
+is not supposed that $\As$ is reflexive (an object is not necessarily
+associated with itself), symmetric, or transitive.
+
+$\beta(X)$ is set of all subsets of $X$, that is $$\beta(X) = \{ U \subseteq X
+\}$$
+
+Let's define function $A:X\to^{}\beta(X)$ as follows:
+
+$$A(x)=\{y\in X\ |\ y\As x\}, \quad x\in X.$$
+
+that is $A(x)$ is a set of all objects in $X$ associated with $x$.
+Then, define \mbox{$A^*:\beta(X)\to^{}\beta(X)$} as follows:
+
+$$A^*(U)=\bigcup\limits_{x\in U} A(x), \quad U\subseteq X.$$
+
+that is, $A^*(U)$ is the set of all objects associated with any element of
+$U$. Now we can define $\svi{U}{V}$, where $U, V\subseteq X$---``set vicinity
+intersection'' operation as:
+
+%\begin{displaymath}
+%A^+(U) = \left\{
+%    \begin{array}{rl}
+%    U = \{x\}      & \Rightarrow A(x),\\
+%    \textrm{else}  & \Rightarrow A^*(U)
+%    \end{array} \right.
+%\end{displaymath}
+
+$$\svi{U}{V} = A^*(U) \cap A^*(V).$$
+
+In other words, $\svi{U}{V}$ is a set of all objects associated with some
+element of $U$ \emph{and} some element of $V$.
+
+\end{document}
+
+% Local variables:
+% indent-tabs-mode: nil
+% tab-width: 4
+% eval: (progn (if (fboundp 'flyspell-mode) (flyspell-mode)) (set (make-local-variable 'compile-command) "latex set-theoretic-stuff.tex ; dvips -o set-theoretic-stuff.ps set-theoretic-stuff.dvi"))
+% End:
Index: linux-2.6.8.1-ck/fs/reiser4/doc/sys-reiser4-implemenation-overview
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/doc/sys-reiser4-implemenation-overview	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/doc/sys-reiser4-implemenation-overview	2004-08-22 19:35:33.632650944 +1000
@@ -0,0 +1,222 @@
+SYS_REISER4 IMPLEMENTATION OVERVIEW
+
+
+A. Basics
+*****************************************************************
+
+The sys_reiser4() system call executes a sequence of actions upon the
+file-system(s). Actions are specified by the user in the form of a command
+string. For the purposes of present discussion, said command string can be
+thought of as a program in a special purpose programming language, which will
+be further referred to as reiser4_lang.
+
+Canonical example of reiser4_lang program is
+
+/dir1/dir2/dir3/file1 <- /dir4/dir5/dir6/file2
+
+Its semantics are as follows:
+
+1. resolve "/dir1/dir2/dir3/file1" into file-system object (lookup operation)
+2. resolve "/dir4/dir5/dir6/file2" into file-system object (lookup operation)
+3. assign latter to the former.
+
+This is "assignment" operator. Assignment involves two "file-system objects"
+and semantics of both lookup stage and assignment proper depends upon the type
+of the file-system object.
+
+Following types of file-system objects are recognized:
+
+1. foreign objects: objects of different file-systems. Foreign object cannot
+be target or source of an assignment. Rather, foreign objects can only appear
+during path name lookup, while traversing non-reiser4 part of the file-system
+name-space. Probably one should distinguish between objects belonging to
+different file-system types (ext2, NFS) and objects belonging to different
+reiser4 mounts. After sys_reiser4() is stable, foreign objects will be more
+fully supported.
+
+2. reiser4 objects.
+
+3. pseudo-objects: these are entities injected into reiser4 name-space to
+provide uniform access to various file-system meta-data. Pseudo-objects are
+(usually) attached to some particular "host" object. [In the initial version,]
+host objects are reiser4 objects. [Later it is possible to implement some
+pseudo-objects for foreign objects.] Convention (but not enforced rule) is
+that pseudo-objects are accessible through names starting with some well-known
+prefix (".." is current favorite). Examples: ..owner, ..acl, etc. See comment
+at the top of fs/reiser4/plugin/pseudo/pseudo.c for more details.
+
+B. lnodes
+*****************************************************************
+
+lnodes are handles for file-system objects described above. They serve dual
+purpose:
+
+1. uniform interface to the various types of objects. This allows the
+reiser4_lang implementation to treat various types of objects in the same
+manner. When new type of object has to be added, all changes will be grouped
+in one place, rather than scattered across various files. This uniformity also
+allows code sharing between reiser4_lang and VFS access paths. For example,
+the same ->write method can be used by both. That is, ->read(), and ->write()
+plugin methods used in VFS access paths will take lnode(s) as arguments and
+can share code with sys_reiser4() implementation. For example, assignment is
+particular case of write (or vice versa, depending on point of view).
+
+
+2. synchronization. reiser4_lang doesn't use inodes and this poses a problem of
+synchronization with VFS. Each lnode serves as a lock. See lnode.c for more
+details.
+
+C. lookup
+*****************************************************************
+
+reiser4_lang still supports only two traditional UNIX kinds of ordered names
+(pathnames): absolute and relative to the current working directory. In both
+cases, lookup starts from some file-system object represented by lnode. Then
+lookup proceeds component-by-component as follows:
+
+   lnode *parent;
+   lnode  child;
+
+   ret_code = lnode_get_dir_plugin( parent ) -> lnode_by_name( parent,
+                                                               path_component,
+                                                               &child );
+
+1. Abovementioned locking issues require that parent lnode has to be kept
+until operation on child finishes. In effect we get lock-coupling much like in
+internal tree traversal. Also, possibility to use lock on node with directory
+entry instead of object lock was discussed. We have to think more on this.
+
+
+2. Mount points crossing. It is possible, because dentries and therefore
+inodes of all mount points are pinned in memory and lookup code can check at
+each step whether mount point is crossed. Details are not very nice, because
+for each inode in a path we have to scan list of all its dentries and check
+whether correct one (corresponding to our path) is mount point.
+
+3. It is also possible to pass ->lnode_by_name the whole of the remaining
+name, and let it decide how much of it it should handle. This will complicate
+locking somewhat. But this is doable, though requires changes to the parser.
+
+
+D. assignment
+*****************************************************************
+
+Assignment A<-B basically means duplicating content of B into A. No
+copy-on-write optimizations will be in version 4.0.
+
+Assignment implementation is based on the notion of flow (flow_t). Flow is a
+source from which data can be obtained. Flow can be "backed up" by one of the
+following:
+
+1. memory area in user space. (char *area, size_t length)
+2. memory area in kernel space. (caddr_t *area, size_t length)
+3. file-system object (lnode *obj, loff_t offset, size_t length)
+
+Main function to manipulate flows is:
+
+int flow_place( flow_t *flow, char *area, size_t length );
+
+It copies @length bytes of @flow into @area and updates @flow correspondingly.
+Behavior of flow_place() depends on the type of entity backing up @flow. If
+@flow is based on the kernel-space area, memmove() is used to copy data. If
+@flow is based on the user-space area, copy_from_user() is used. If @flow is
+based on file-system object, flow_place() loads object's data into page cache
+and copies them into @area.
+
+Thus, assignment code looks like following:
+
+typedef int ( *connect_t )( sink_t *target, flow_t *source );
+
+int reiser4_assign( lnode *dst, lnode *src )
+{
+    flow_t        source;
+    sink_t        target;
+    int           ret_code;
+    file_plugin  *src_fplug;
+    file_plugin  *dst_fplug;
+    connect_t     connection;
+
+    /* get plugins */
+
+    src_fplug = lnode_get_file_plugin( src );
+    dst_fplug = lnode_get_file_plugin( dst );
+
+    /* build source flow */
+    ret_code = src_fplug -> build_flow( src, &source, 0 /* offset */ );
+
+    /* build target sink */
+    ret_code = dst_fplug -> build_sink( dst, &target, 0 /* offset */ );
+
+    /*
+     * select how to transfer data from @src to @dst.
+     *
+     * Default implementation of this is common_transfer() (see below).
+     *
+     * Smart file plugin can choose connection based on type of @dst.
+     *
+     */
+    connection = src_fplug -> select_connection( src, dst );
+
+    /* do transfer */
+    return connection( &target, &source );
+}
+
+
+/* look to chain conversion of (lnode * dst) -> (sink_t target) -> (lnode * dst)
+ I think, functions build_sink(...) and  sink_object(...) - superfluous */
+
+int common_transfer( sink_t *target, flow_t *source )
+{
+    lnode  *dst;
+
+    dst = sink_object( target );
+    while( flow_not_empty( source ) ) {
+        char   *area;
+        size_t  length;
+
+        /*
+         * append some space to @target. Reasonable implementation will
+         * allocate several pagesful here
+         */
+        ret_code = lnode_get_body_plugin( dst ) -> prepare_append( dst,
+                                                                   &area,
+                                                                   &length );
+                                            /* why @length not depended from source? */
+        /*
+         * put data from flow into newly alloted space. This also updates
+         * @flow.
+         */
+        flow_place( source, area, length );
+        /*
+         * perform necessary post-write activity required by @dst plugin, like
+         * encryption, compression, etc. Release pages.
+         */
+        ret_code = lnode_get_body_plugin( dst ) -> commit_append( dst,
+                                                                  area, length );
+    }
+}
+
+
+E. parsing
+*****************************************************************
+
+It is not clear what parts of reiser4_lang processing should go into the
+kernel. In any case, providing a direct system call as the main (or, worse, the
+only) way to access reiser4_lang functionality binds us to maintain binary
+compatibility in the future. To avoid this, reiser4 should be shipped with a
+user-level library, containing
+
+int reiser4( const char *cmd, size_t length );
+
+function. For now, this function will directly despatch @cmd to
+sys_reiser4(); in the future, it may do parsing itself and pass a parse tree to
+the kernel interpreter.
+
+*****************************************************************
+
+# Local variables:
+# mode-name: "proposal"
+# indent-tabs-mode: nil
+# tab-width: 4
+# eval: (if (fboundp 'flyspell-mode) (flyspell-mode))
+# End:
Index: linux-2.6.8.1-ck/fs/reiser4/dscale.c
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/dscale.c	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/dscale.c	2004-08-22 19:35:33.633650785 +1000
@@ -0,0 +1,173 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Scalable on-disk integers */
+
+/*
+ * Various on-disk structures contain integer-like structures. Stat-data
+ * contain [yes, "data" is plural, check the dictionary] file size, link
+ * count; extent unit contains extent width etc. To accommodate for general
+ * case enough space is reserved to keep largest possible value. 64 bits in
+ * all cases above. But in overwhelming majority of cases numbers actually
+ * stored in these fields will be comparatively small and reserving 8 bytes is
+ * a waste of precious disk bandwidth.
+ *
+ * Scalable integers are one way to solve this problem. dscale_write()
+ * function stores __u64 value in the given area consuming 1, 2, 4 or 9 bytes,
+ * depending on the magnitude of the value supplied. dscale_read() reads value
+ * previously stored by dscale_write().
+ *
+ * dscale_write() produces a format not completely unlike UTF: two highest
+ * bits of the first byte are used to store "tag". One of 4 possible tag
+ * values is chosen depending on the number being encoded:
+ *
+ *           0 ... 0x3f               => 0           [table 1]
+ *        0x40 ... 0x3fff             => 1
+ *      0x4000 ... 0x3fffffff         => 2
+ *  0x40000000 ... 0xffffffffffffffff => 3
+ *
+ * (see dscale_range() function)
+ *
+ * Values in the range 0x40000000 ... 0xffffffffffffffff require 8 full bytes
+ * to be stored, so in this case there is no place in the first byte to store
+ * tag. For such values tag is stored in an extra 9th byte.
+ *
+ * As _highest_ bits are used for the test (which is natural) scaled integers
+ * are stored in BIG-ENDIAN format in contrast with the rest of reiser4 which
+ * uses LITTLE-ENDIAN.
+ *
+ */
+
+#include "debug.h"
+#include "dscale.h"
+
+/* return tag (a value in 0..3) of scaled integer stored at @address */
+static int gettag(const unsigned char *address)
+{
+	/* tag is stored in two highest bits of the first byte */
+	return (*address) >> 6;
+}
+
+/* Clear tag embedded into @value. @tag must be 0, 1 or 2. */
+static void cleartag(__u64 *value, int tag)
+{
+	/*
+	 * W-w-what ?!
+	 *
+	 * Actually, this is rather simple: @value passed here was read by
+	 * dscale_read(), converted from BIG-ENDIAN, and padded to __u64 by
+	 * zeroes. Tag is still stored in the highest (arithmetically)
+	 * non-zero bits of @value, but relative position of tag within __u64
+	 * depends on @tag.
+	 *
+	 * For example if @tag is 0, it's stored in 2 highest bits of lowest
+	 * byte, and its offset (counting from lowest bit) is 8 - 2 == 6 bits.
+	 *
+	 * If tag is 1, it's stored in two highest bits of 2nd lowest byte,
+	 * and its offset is (2 * 8) - 2 == 14 bits. See table 1 above.
+	 *
+	 * All these cases are captured by the formula below. The mask is
+	 * built as __u64 rather than int: for @tag == 2 the shift is 30 bits
+	 * and (3 << 30) would overflow a signed int.
+	 */
+	*value &= ~((__u64)3 << (((1 << tag) << 3) - 2));
+	/*
+	 * That is, clear two (3 == 0b11) bits at the offset
+	 *
+	 *                  8 * (2 ^ tag) - 2,
+	 *
+	 * that is, two highest bits of (2 ^ tag)-th byte of @value.
+	 */
+}
+
+/* return tag (0..3) for @value. See table 1 above for details. */
+static int dscale_range(__u64 value)
+{
+	if (value > 0x3fffffff)
+		return 3;
+	if (value > 0x3fff)
+		return 2;
+	if (value > 0x3f)
+		return 1;
+	return 0;
+}
+
+/* restore value stored at @address by dscale_write() and return number of
+ * bytes consumed */
+reiser4_internal int dscale_read(unsigned char *address, __u64 *value)
+{
+	int tag;
+
+	/* read tag */
+	tag = gettag(address);
+	switch (tag) {
+	case 3:
+		/* In this case tag is stored in an extra byte, skip this byte
+		 * and decode value stored in the next 8 bytes. */
+		*value = __be64_to_cpu(get_unaligned((__u64 *)(address + 1)));
+		/* worst case: 8 bytes for value itself plus one byte for
+		 * tag. */
+		return 9;
+	case 0:
+		*value = get_unaligned(address);
+		break;
+	case 1:
+		*value = __be16_to_cpu(get_unaligned((__u16 *)address));
+		break;
+	case 2:
+		*value = __be32_to_cpu(get_unaligned((__u32 *)address));
+		break;
+	default:
+		return RETERR(-EIO);
+	}
+	/* clear tag embedded into @value */
+	cleartag(value, tag);
+	/* number of bytes consumed is 2 to the power @tag---see table 1. */
+	return 1 << tag;
+}
+
+/* store @value at @address; return number of bytes consumed (1, 2, 4 or 9) */
+reiser4_internal int dscale_write(unsigned char *address, __u64 value)
+{
+	int tag, shift;
+	unsigned char *valarr = (unsigned char *)&value;
+
+	tag = dscale_range(value);
+	value = __cpu_to_be64(value);
+	shift = (tag == 3) ? 1 : 0;
+	if (shift)
+		*address = 0;	/* tag byte is not written by memcpy() below */
+	memcpy(address + shift, valarr + sizeof value - (1 << tag), 1 << tag);
+	*address |= (tag << 6);
+	return shift + (1 << tag);
+}
+
+/* number of bytes required to store @value: 1, 2, 4 or 9 */
+reiser4_internal int dscale_bytes(__u64 value)
+{
+	int bytes;
+
+	bytes = 1 << dscale_range(value);
+	if (bytes == 8)
+		++ bytes;	/* 8-byte values take one extra byte for the tag */
+	return bytes;
+}
+
+/* returns true if @value and @other require the same number of bytes to be
+ * stored. Used to detect when a data structure (like stat-data) has to be
+ * expanded or contracted. */
+reiser4_internal int dscale_fit(__u64 value, __u64 other)
+{
+	return dscale_range(value) == dscale_range(other);
+}
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
Index: linux-2.6.8.1-ck/fs/reiser4/dscale.h
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/dscale.h	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/dscale.h	2004-08-22 19:35:33.633650785 +1000
@@ -0,0 +1,27 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Scalable on-disk integers. See dscale.c for details. */
+
+#if !defined( __FS_REISER4_DSCALE_H__ )
+#define __FS_REISER4_DSCALE_H__
+
+#include "dformat.h"
+
+extern int dscale_read (unsigned char *address, __u64 *value);
+extern int dscale_write(unsigned char *address, __u64 value);
+extern int dscale_bytes(__u64 value);
+extern int dscale_fit  (__u64 value, __u64 other);
+
+/* __FS_REISER4_DSCALE_H__ */
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
Index: linux-2.6.8.1-ck/fs/reiser4/emergency_flush.c
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/emergency_flush.c	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/emergency_flush.c	2004-08-22 19:35:33.635650466 +1000
@@ -0,0 +1,917 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* This file exists only until VM gets fixed to reserve pages properly, which
+ * might or might not be very political. */
+
+/* Implementation of emergency flush. */
+
+/* OVERVIEW:
+
+     Before writing a node to the disk, some complex process (flush.[ch]) is
+     to be performed. Flush is the main necessary preliminary step before
+     writing pages back to the disk, but it has some characteristics that make
+     it completely different from traditional ->writepage():
+
+        1 It operates on a large number of nodes, possibly far away from the
+        starting node, both in tree and disk order.
+
+        2 it can involve reading of nodes from the disk (during extent
+        allocation, for example).
+
+        3 it can allocate memory (during insertion of allocated extents).
+
+        4 it participates in the locking protocol which reiser4 uses to
+        implement concurrent tree modifications.
+
+        5 it is CPU consuming and long
+
+     As a result, flush reorganizes some part of reiser4 tree and produces
+     large queue of nodes ready to be submitted for io.
+
+     Items (3) and (4) alone make flush unsuitable for being called directly
+     from reiser4 ->writepage() callback, because of OOM and deadlocks
+     against threads waiting for memory.
+
+     So, flush is performed from within balance_dirty_pages() path when dirty
+     pages are generated. If balance_dirty_pages() fails to throttle writers
+     and page replacement finds dirty page on the inactive list, we resort to
+     "emergency flush" in our ->vm_writeback().
+
+     Emergency flush is relatively dumb algorithm, implemented in this file,
+     that tries to write tree nodes to the disk without taking locks and without
+     thoroughly optimizing tree layout. We only want to call emergency flush in
+     desperate situations, because it is going to produce sub-optimal disk
+     layouts.
+
+  DETAILED DESCRIPTION
+
+     Emergency flush (eflush) is designed to work as low level mechanism with
+     no or little impact on the rest of (already too complex) code.
+
+     eflush is initiated from ->writepage() method called by VM on memory
+     pressure. It is supposed that ->writepage() is rare call path, because
+     balance_dirty_pages() throttles writes and tries to keep memory in
+     balance.
+
+     eflush main entry point (emergency_flush()) checks whether jnode is
+     eligible for emergency flushing. Check is performed by flushable()
+     function which see for details. After successful check, new block number
+     ("emergency block") is allocated and io is initiated to write jnode
+     content to that block.
+
+     After io is finished, jnode will be cleaned and VM will be able to free
+     page through call to ->releasepage().
+
+     emergency_flush() also contains special case invoked when it is possible
+     to avoid allocation of new node.
+
+     Node selected for eflush is marked (by JNODE_EFLUSH bit in ->flags field)
+     and added to the special hash table of all eflushed nodes. This table
+     doesn't have linkage within each jnode, as this would waste memory in
+     assumption that eflush is rare. Instead, a new small memory object
+     (eflush_node_t) is allocated that contains pointer to jnode, emergency
+     block number, and is inserted into hash table. Per super block counter of
+     eflushed nodes is incremented. See section [INODE HANDLING] below for
+     more on this.
+
+     It should be noted that emergency flush may allocate memory and wait for
+     io completion (bitmap read).
+
+     Basically eflushed node has following distinctive characteristics:
+
+          (1) JNODE_EFLUSH bit is set
+
+          (2) no page
+
+          (3) there is an element in hash table, for this node
+
+          (4) node content is stored on disk in block whose number is stored
+          in the hash table element
+
+  UNFLUSH
+
+      Unflush is reverse of eflush, that is process bringing page of eflushed
+      inode back into memory.
+
+      In accordance with the policy that eflush is low level and low impact
+      mechanism, transparent to the rest of the code, unflushing is performed
+      deeply within jload_gfp() which is main function used to load and pin
+      jnode page into memory.
+
+      Specifically, if jload_gfp() determines that it is called on eflushed
+      node it gets emergency block number to start io against from the hash
+      table rather than from jnode itself. This is done in
+      jnode_get_io_block() function. After io completes, hash table element
+      for this node is removed and JNODE_EFLUSH bit is cleared.
+
+  LOCKING
+
+      The page lock is used to avoid eflush/e-unflush/jnode_get_io_block races.
+      emergency_flush() and jnode_get_io_block are called under the page lock.
+      The eflush_del() function (emergency unflush) may be called for a node w/o
+      page attached.  In that case eflush_del() allocates a page and locks it.
+
+  PROBLEMS
+
+  1. INODE HANDLING
+
+      Usually (i.e., without eflush), jnode has a page attached to it. This
+      page pins corresponding struct address_space, and, hence, inode in
+      memory. Once inode has been eflushed, its page is gone and inode can be
+      wiped out of memory by the memory pressure (prune_icache()). This leads
+      to a number of complications:
+
+           (1) jload_gfp() has to attach jnode to the address space's radix
+           tree. This requires existence of inode.
+
+           (2) normal flush needs jnode's inode to start slum collection from
+           unformatted jnode.
+
+      (1) is really a problem, because it is too late to load inode (which
+      would lead to loading of stat data, etc.) within jload_gfp().
+
+      We, therefore, need some way to protect inode from being recycled while
+      having accessible eflushed nodes.
+
+      I'll describe old solution here so it can be compared with new one.
+
+      Original solution pinned inode by __iget() when first its node was
+      eflushed and released (through iput()) when last was unflushed. This
+      required maintenance of inode->eflushed counter in inode.
+
+      Problem arises if last name of inode is unlinked when it has eflushed
+      nodes. In this case, last iput() that leads to the removal of file is
+      iput() made by unflushing from within jload_gfp(). Obviously, calling
+      truncate, and tree traversals from jload_gfp() is not a good idea.
+
+      New solution is to pin inode in memory by adding I_EFLUSH bit to its
+      ->i_state field. This protects inode from being evicted by
+      prune_icache().
+
+  DISK SPACE ALLOCATION
+
+      This section will describe how emergency block is allocated and how
+      block counters (allocated, grabbed, etc.) are manipulated. To be done.
+
+   *****HISTORICAL SECTION****************************************************
+
+   DELAYED PARENT UPDATE
+
+     Important point of emergency flush is that update of parent is sometimes
+     delayed: we don't update parent immediately if:
+
+      1 Child was just allocated, but parent is locked. Waiting for parent
+      lock in emergency flush is impossible (deadlockable).
+
+      2 Part of extent was allocated, but parent has not enough space to
+      insert allocated extent unit. Balancing in emergency flush is
+      impossible, because it will possibly wait on locks.
+
+     When we delay update of parent node, we mark it as such (and possibly
+     also mark children to simplify delayed update later). Question: when
+     parent should be really updated?
+
+   WHERE TO WRITE PAGE INTO?
+
+
+     So, it was decided that flush has to be performed from a separate
+     thread. Reiser4 has a thread used to periodically commit old transactions,
+     and this thread can be used for the flushing. That is, flushing thread
+     does flush and accumulates nodes prepared for the IO on the special
+     queue. reiser4_vm_writeback() submits nodes from this queue, if queue is
+     empty, it only wakes up flushing thread and immediately returns.
+
+     Still there are some problems with integrating this stuff into VM
+     scanning:
+
+        1 As ->vm_writeback() returns immediately without actually submitting
+        pages for IO, throttling on PG_writeback in shrink_list() will not
+        work. This opens a possibility (on a fast CPU), of try_to_free_pages()
+        completing scanning and calling out_of_memory() before flushing thread
+        managed to add anything to the queue.
+
+        2 It is possible, however unlikely, that flushing thread will be
+        unable to flush anything, because there is not enough memory. In this
+        case reiser4 resorts to the "emergency flush": some dumb algorithm,
+        implemented in this file, that tries to write tree nodes to the disk
+        without taking locks and without thoroughly optimizing tree layout. We
+        only want to call emergency flush in desperate situations, because it
+        is going to produce sub-optimal disk layouts.
+
+        3 Nodes prepared for IO can be from the active list, this means that
+        they will not be met/freed by shrink_list() after IO completion. New
+        blk_congestion_wait() should help with throttling but not
+        freeing. This is not fatal though, because inactive list refilling
+        will ultimately get to these pages and reclaim them.
+
+   REQUIREMENTS
+
+     To make this work we need at least some hook inside VM scanning which
+     gets triggered after scanning (or scanning with particular priority)
+     failed to free pages. This is already present in the
+     mm/vmscan.c:set_shrinker() interface.
+
+     Another useful thing that we would like to have is passing scanning
+     priority down to the ->vm_writeback() that will allow file system to
+     switch to the emergency flush more gracefully.
+
+   POSSIBLE ALGORITHMS
+
+     1 Start emergency flush from ->vm_writeback after reaching some priority.
+     This allows to implement simple page based algorithm: look at the page VM
+     supplied us with and decide what to do.
+
+     2 Start emergency flush from shrinker after reaching some priority.
+     This delays emergency flush as far as possible.
+
+   *****END OF HISTORICAL SECTION**********************************************
+
+*/
+
+#include "forward.h"
+#include "debug.h"
+#include "page_cache.h"
+#include "tree.h"
+#include "jnode.h"
+#include "znode.h"
+#include "inode.h"
+#include "super.h"
+#include "block_alloc.h"
+#include "emergency_flush.h"
+
+#include <linux/mm.h>
+#include <linux/writeback.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/swap.h>
+
+#if REISER4_USE_EFLUSH
+
+static int flushable(const jnode * node, struct page *page, int);
+static int needs_allocation(const jnode * node);
+static eflush_node_t *ef_alloc(int flags);
+static reiser4_ba_flags_t ef_block_flags(const jnode *node);
+static int ef_free_block(jnode *node, const reiser4_block_nr *blk, block_stage_t stage, eflush_node_t *ef);
+static int ef_prepare(jnode *node, reiser4_block_nr *blk, eflush_node_t **enode, reiser4_blocknr_hint *hint);
+static int eflush_add(jnode *node, reiser4_block_nr *blocknr, eflush_node_t *ef);
+
+/* slab for eflush_node_t's */
+static kmem_cache_t *eflush_slab;
+
+#define EFLUSH_START_BLOCK ((reiser4_block_nr)0)
+/* update @counter for the tree level of @node. NB: expansion ends with ';' */
+#define INC_STAT(node, counter)						\
+	reiser4_stat_inc_at_level(jnode_get_level(node), counter);
+
+/* this function exists only until VM gets fixed to reserve pages properly,
+ * which might or might not be very political. */
+/* try to flush locked @page to the disk. Return 0 if page was successfully
+ * paged out, 1 if it is busy, error otherwise.
+ * NOTE(review): the two early returns in the no-allocation branch skip the
+ * jput() that pairs with jref() below; confirm whether that is intended.
+ */
+reiser4_internal int
+emergency_flush(struct page *page)
+{
+	struct super_block *sb;
+	jnode *node;
+	int result;
+	assert("nikita-2721", page != NULL);
+	assert("nikita-2724", PageLocked(page));
+
+	// warning("nikita-3112", "Emergency flush. Notify Reiser@Namesys.COM");
+
+	/*
+	 * Page is locked, hence page<->jnode mapping cannot change.
+	 */
+
+	sb = page->mapping->host->i_sb;
+	node = jprivate(page);
+
+	assert("vs-1452", node != NULL);
+
+	jref(node);
+	INC_STAT(node, vm.eflush.called);
+
+	result = 0;
+	LOCK_JNODE(node);
+	/*
+	 * page was dirty and under eflush. This is (only?) possible if page
+	 * was re-dirtied through mmap(2) after eflush IO was submitted, but
+	 * before ->releasepage() freed page.
+	 */
+	eflush_del(node, 1);
+
+	LOCK_JLOAD(node);
+	if (flushable(node, page, 1)) {
+		if (needs_allocation(node)) {
+			reiser4_block_nr blk;
+			eflush_node_t *efnode;
+			reiser4_blocknr_hint hint;
+
+			blk = 0ull;
+			efnode = NULL;
+
+			/* Set JNODE_EFLUSH bit _before_ allocating a block,
+			 * that prevents flush reserved block from using here
+			 * and by a reiser4 flush process  */
+			JF_SET(node, JNODE_EFLUSH);
+
+			blocknr_hint_init(&hint);
+
+			INC_STAT(node, vm.eflush.needs_block);
+			result = ef_prepare(node, &blk, &efnode, &hint);
+			if (flushable(node, page, 0) && result == 0) {
+				assert("nikita-2759", efnode != NULL);
+				eflush_add(node, &blk, efnode);
+
+				result = page_io(page, node, WRITE,
+						 GFP_NOFS | __GFP_HIGH);
+				INC_STAT(node, vm.eflush.ok);
+			} else {
+				JF_CLR(node, JNODE_EFLUSH);
+				UNLOCK_JLOAD(node);
+				UNLOCK_JNODE(node);
+				if (blk != 0ull) {
+					ef_free_block(node, &blk,
+						      hint.block_stage, efnode);
+					kmem_cache_free(eflush_slab, efnode);
+				}
+				ON_TRACE(TRACE_EFLUSH, "failure-2\n");
+				result = 1;
+				INC_STAT(node, vm.eflush.nolonger);
+			}
+
+			blocknr_hint_done(&hint);
+		} else {
+			txn_atom *atom;
+			flush_queue_t *fq;
+
+			/* eflush without allocating a temporary location for the node */
+			ON_TRACE(TRACE_EFLUSH, "flushing to relocate place: %llu..", *jnode_get_block(node));
+
+			/* get flush queue for this node */
+			result = fq_by_jnode_gfp(node, &fq, GFP_ATOMIC);
+
+			if (result)
+				return result;
+
+			atom = node->atom;
+
+			if (!flushable(node, page, 1) || needs_allocation(node) || !jnode_is_dirty(node)) {
+				ON_TRACE(TRACE_EFLUSH, "failure-3\n");
+				UNLOCK_JLOAD(node);
+				UNLOCK_JNODE(node);
+				UNLOCK_ATOM(atom);
+				fq_put(fq);
+				return 1;
+			}
+
+			/* ok, now we can flush it */
+			unlock_page(page);
+
+			queue_jnode(fq, node);
+
+			UNLOCK_JLOAD(node);
+			UNLOCK_JNODE(node);
+			UNLOCK_ATOM(atom);
+
+			result = write_fq(fq, NULL, 0);
+			if (result != 0)
+				lock_page(page);
+
+			ON_TRACE(TRACE_EFLUSH, "flushed %d blocks\n", result);
+			/* Even if we wrote nothing, we unlocked the page, so let the caller know that the page
+			   should not be unlocked again */
+			fq_put(fq);
+		}
+
+	} else {
+		UNLOCK_JLOAD(node);
+		UNLOCK_JNODE(node);
+		ON_TRACE(TRACE_EFLUSH, "failure-1\n");
+		result = 1;
+	}
+
+	jput(node);
+	return result;
+}
+
+/* Decide whether @node, backed by @page, may be emergency-flushed right now.
+ *
+ * Both the jnode and the jload spin locks must be held by the caller. The
+ * checks run in a fixed order; the first one that vetoes the flush bumps its
+ * own vm.eflush.* statistics counter and ends the scan. The JNODE_EFLUSH test
+ * is skipped when @check_eflush is zero.
+ *
+ * Returns 1 if nothing vetoes the flush, 0 otherwise.
+ */
+static int flushable(const jnode * node, struct page *page, int check_eflush)
+{
+	assert("nikita-2725", node != NULL);
+	assert("nikita-2726", spin_jnode_is_locked(node));
+	assert("nikita-3388", spin_jload_is_locked(node));
+
+	if (jnode_is_loaded(node)) {
+		/* loaded */
+		INC_STAT(node, vm.eflush.loaded);
+	} else if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
+		/* already pending io */
+		INC_STAT(node, vm.eflush.queued);
+	} else if (JF_ISSET(node, JNODE_EPROTECTED)) {
+		/* protected from e-flush */
+		INC_STAT(node, vm.eflush.protected);
+	} else if (JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
+		INC_STAT(node, vm.eflush.heard_banshee);
+	} else if (page == NULL) {
+		/* nothing to flush */
+		INC_STAT(node, vm.eflush.nopage);
+	} else if (PageWriteback(page)) {
+		/* already under io */
+		INC_STAT(node, vm.eflush.writeback);
+	} else if (!jnode_is_znode(node) && !jnode_is_unformatted(node)) {
+		/* don't flush bitmaps or journal records */
+		INC_STAT(node, vm.eflush.bitmap);
+	} else if (jnode_is_cluster_page(node)) {
+		/* don't flush cluster pages */
+		INC_STAT(node, vm.eflush.clustered);
+	} else if (check_eflush && JF_ISSET(node, JNODE_EFLUSH)) {
+		/* already flushed */
+		INC_STAT(node, vm.eflush.eflushed);
+	} else {
+		/* no check objected */
+		return 1;
+	}
+	return 0;
+}
+
+#undef INC_STAT
+
+/* does node need allocation for eflushing? It does, unless it is JNODE_RELOC
+ * and its block number is not a fake one. */
+static int needs_allocation(const jnode * node)
+{
+	return !JF_ISSET(node, JNODE_RELOC) || blocknr_is_fake(jnode_get_block(node));
+}
+
+/* key comparison of the ef hash: keys match iff they hold the same jnode pointer */
+static inline int jnode_eq(jnode * const * j1, jnode * const * j2)
+{
+	assert("nikita-2733", j1 != NULL);
+	assert("nikita-2734", j2 != NULL);
+	if (*j1 != *j2)
+		return 0;
+	return 1;
+}
+
+/* Find the eflush hash table serving @node: it sits in the private data of
+ * the super block that @node's tree points to. */
+static ef_hash_table *get_jnode_enhash(const jnode *node)
+{
+	reiser4_tree *tree;
+
+	assert("nikita-2739", node != NULL);
+	tree = jnode_get_tree(node);
+	return &get_super_private(tree->super)->efhash_table;
+}
+
+/* hash function of the ef hash: the jnode address truncated to __u32, divided
+ * by sizeof(jnode) and masked by the (power of two) number of buckets */
+static inline __u32 jnode_hfn(ef_hash_table *table, jnode * const * j)
+{
+	__u32 hash;
+
+	assert("nikita-2735", j != NULL);
+	assert("nikita-3346", IS_POW(table->_buckets));
+
+	hash = (__u32)(unsigned long)*j / sizeof(**j);
+	return hash & (table->_buckets - 1);
+}
+
+
+/* The hash table definition; KMALLOC/KFREE exist only around it (presumably consumed by the macro) */
+#define KMALLOC(size) vmalloc(size)
+#define KFREE(ptr, size) vfree(ptr)
+TYPE_SAFE_HASH_DEFINE(ef, eflush_node_t, jnode *, node, linkage, jnode_hfn, jnode_eq);
+#undef KFREE
+#undef KMALLOC
+
+/* create the slab cache of eflush_node_t's; RETERR(-ENOMEM) if that fails */
+reiser4_internal int eflush_init(void)
+{
+	eflush_slab = kmem_cache_create("eflush", sizeof (eflush_node_t),
+					0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (eflush_slab != NULL)
+		return 0;
+
+	return RETERR(-ENOMEM);
+}
+
+/* destroy the eflush_node_t slab; hands back what kmem_cache_destroy() returns */
+reiser4_internal int eflush_done(void)
+{
+	return kmem_cache_destroy(eflush_slab);
+}
+
+/* set up the eflush hash of @super; 8192 is presumably the bucket count, which
+ * jnode_hfn() asserts to be a power of two */
+reiser4_internal int eflush_init_at(struct super_block *super)
+{
+	ef_hash_table *table = &get_super_private(super)->efhash_table;
+	return ef_hash_init(table, 8192, reiser4_stat(super, hashes.eflush));
+}
+
+/* tear down the eflush hash table kept in the private data of @super */
+reiser4_internal void eflush_done_at(struct super_block *super)
+{
+	ef_hash_done(&get_super_private(super)->efhash_table);
+}
+
+/* take one eflush_node_t from the slab; @flags go to kmem_cache_alloc() as is */
+static eflush_node_t *ef_alloc(int flags)
+{
+	return kmem_cache_alloc(eflush_slab, flags);
+}
+
+#define EFLUSH_MAGIC 4335203
+/* Enter @node, eflushed to *@blocknr, into the ef hash via caller-supplied @ef; drops jload and jnode locks. Returns 0. */
+static int
+eflush_add(jnode *node, reiser4_block_nr *blocknr, eflush_node_t *ef)
+{
+	reiser4_tree  *tree;
+
+	assert("nikita-2737", node != NULL);
+	assert("nikita-2738", JF_ISSET(node, JNODE_EFLUSH));
+	assert("nikita-3382", !JF_ISSET(node, JNODE_EPROTECTED));
+	assert("nikita-2765", spin_jnode_is_locked(node));
+	assert("nikita-3381", spin_jload_is_locked(node));
+
+	tree = jnode_get_tree(node);
+
+	ef->node = node;
+	ef->blocknr = *blocknr;
+	ef->hadatom = (node->atom != NULL);
+	ef->incatom = 0;
+	jref(node);	/* presumably paired with jput() in eflush_free() */
+	spin_lock_eflush(tree->super);
+	ef_hash_insert(get_jnode_enhash(node), ef);
+	ON_DEBUG(++ get_super_private(tree->super)->eflushed);
+	spin_unlock_eflush(tree->super);
+
+	if (jnode_is_unformatted(node)) {
+		struct inode  *inode;
+		reiser4_inode *info;
+
+		WLOCK_TREE(tree);
+
+		inode = mapping_jnode(node)->host;
+		info = reiser4_inode_data(inode);
+		/* tag the jnode's radix tree slot by whether it had an atom */
+		if (!ef->hadatom) {
+			radix_tree_tag_set(jnode_tree_by_reiser4_inode(info),
+					   index_jnode(node), EFLUSH_TAG_ANONYMOUS);
+			ON_DEBUG(info->anonymous_eflushed ++);
+		} else {
+			radix_tree_tag_set(jnode_tree_by_reiser4_inode(info),
+					   index_jnode(node), EFLUSH_TAG_CAPTURED);
+			ON_DEBUG(info->captured_eflushed ++);
+		}
+		WUNLOCK_TREE(tree);
+		/*XXXX*/
+		inc_unfm_ef();
+	}
+
+	/* FIXME: do we need it here, if eflush add/del are protected by page lock? */
+	UNLOCK_JLOAD(node);
+
+	/*
+	 * jnode_get_atom() can possibly release jnode spin lock. This
+	 * means it can only be called _after_ JNODE_EFLUSH is set, because
+	 * otherwise we would have to re-check flushable() once more. No
+	 * thanks.
+	 */
+
+	if (ef->hadatom) {
+		txn_atom *atom;
+
+		atom = jnode_get_atom(node);
+		if (atom != NULL) {
+			++ atom->flushed;
+			ef->incatom = 1;	/* tells eflush_free() to decrement it again */
+			UNLOCK_ATOM(atom);
+		}
+	}
+
+	UNLOCK_JNODE(node);
+	return 0;
+}
+
+/* Arrghh... cast to keep hash table code happy: address of the (possibly const) jnode pointer as ef hash key. */
+#define C(node) ((jnode *const *)&(node))
+/* Block number @node was eflushed to, as a pointer into its ef hash entry; jnode spin lock must be held. */
+reiser4_internal reiser4_block_nr *
+eflush_get(const jnode *node)
+{
+	eflush_node_t *ef;
+	reiser4_tree  *tree;
+
+	assert("nikita-2740", node != NULL);
+	assert("nikita-2741", JF_ISSET(node, JNODE_EFLUSH));
+	assert("nikita-2767", spin_jnode_is_locked(node));
+
+	/* hash lookup is done under the eflush spin lock of the super block */
+	tree = jnode_get_tree(node);
+	spin_lock_eflush(tree->super);
+	ef = ef_hash_find(get_jnode_enhash(node), C(node));
+	spin_unlock_eflush(tree->super);
+
+	assert("nikita-2742", ef != NULL);
+	return &ef->blocknr;
+}
+
+/* free resources taken for emergency flushing of the node; jnode spin lock is held on entry and exit, dropped in between */
+static void eflush_free (jnode * node)
+{
+	eflush_node_t *ef;
+	ef_hash_table *table;
+	reiser4_tree  *tree;
+	txn_atom      *atom;
+	struct inode  *inode = NULL;
+	reiser4_block_nr blk;
+
+	assert ("zam-1026", spin_jnode_is_locked(node));
+
+	table = get_jnode_enhash(node);
+	tree = jnode_get_tree(node);
+
+	spin_lock_eflush(tree->super);
+	ef = ef_hash_find(table, C(node));
+	BUG_ON(ef == NULL);
+	assert("nikita-2745", ef != NULL);
+	blk = ef->blocknr;
+	ef_hash_remove(table, ef);
+	ON_DEBUG(-- get_super_private(tree->super)->eflushed);
+	spin_unlock_eflush(tree->super);
+	/* undo the ++atom->flushed that eflush_add() recorded in ->incatom */
+	if (ef->incatom) {
+		atom = jnode_get_atom(node);
+		assert("nikita-3311", atom != NULL);
+		-- atom->flushed;
+		UNLOCK_ATOM(atom);
+	}
+
+	assert("vs-1215", JF_ISSET(node, JNODE_EFLUSH));
+
+	if (jnode_is_unformatted(node)) {
+		reiser4_inode *info;
+
+		WLOCK_TREE(tree);
+
+		inode = mapping_jnode(node)->host;
+		info = reiser4_inode_data(inode);
+
+		/* clear e-flush specific tags from node's radix tree slot */
+		radix_tree_tag_clear(
+			jnode_tree_by_reiser4_inode(info), index_jnode(node),
+			ef->hadatom ? EFLUSH_TAG_CAPTURED : EFLUSH_TAG_ANONYMOUS);
+		ON_DEBUG(ef->hadatom ? (info->captured_eflushed --) : (info->anonymous_eflushed --));
+
+		assert("nikita-3355", ergo(jnode_tree_by_reiser4_inode(info)->rnode == NULL,
+					   (info->captured_eflushed == 0 && info->anonymous_eflushed == 0)));
+
+		WUNLOCK_TREE(tree);
+
+		/*XXXX*/
+		dec_unfm_ef();
+
+	}
+	UNLOCK_JNODE(node);
+
+#if REISER4_DEBUG
+	if (blocknr_is_fake(jnode_get_block(node)))
+		assert ("zam-817", ef->initial_stage == BLOCK_UNALLOCATED);
+	else
+		assert ("zam-818", ef->initial_stage == BLOCK_GRABBED);
+#endif
+	/* NOTE(review): @node is still used below; eflush_del() asserts x_count > 1 before calling us */
+	jput(node);
+
+	ef_free_block(node, &blk,
+		      blocknr_is_fake(jnode_get_block(node)) ?
+		      BLOCK_UNALLOCATED : BLOCK_GRABBED, ef);
+
+	kmem_cache_free(eflush_slab, ef);
+	/* re-take the jnode spin lock that was held on entry */
+	LOCK_JNODE(node);
+}
+/* Undo eflush of @node (jnode spin lock held on entry and exit); @page_locked: caller holds the page lock. */
+reiser4_internal void eflush_del (jnode * node, int page_locked)
+{
+        struct page * page;
+
+        assert("nikita-2743", node != NULL);
+        assert("nikita-2770", spin_jnode_is_locked(node));
+        /* nothing to undo if the node is not eflushed */
+        if (!JF_ISSET(node, JNODE_EFLUSH))
+                return;
+        /* get hold of the locked page, dropping the spin lock if we must lock it ourselves */
+        if (page_locked) {
+                page = jnode_page(node);
+                assert("nikita-2806", page != NULL);
+                assert("nikita-2807", PageLocked(page));
+        } else {
+                UNLOCK_JNODE(node);
+                page = jnode_get_page_locked(node, GFP_NOFS);
+                LOCK_JNODE(node);
+                if (page == NULL) {
+                        warning ("zam-1025", "eflush_del failed to get page back\n");
+                        return;
+                }
+                if (unlikely(!JF_ISSET(node, JNODE_EFLUSH)))
+                        /* race: some other thread unflushed jnode. */
+                        goto out;
+        }
+        /* wait for eflush io without the spin lock; a page reference is held meanwhile */
+        if (PageWriteback(page)) {
+                UNLOCK_JNODE(node);
+                page_cache_get(page);
+                reiser4_wait_page_writeback(page);
+                page_cache_release(page);
+                LOCK_JNODE(node);
+                if (unlikely(!JF_ISSET(node, JNODE_EFLUSH)))
+                        /* race: some other thread unflushed jnode. */
+                        goto out;
+        }
+
+	if (JF_ISSET(node, JNODE_KEEPME))
+		set_page_dirty(page);
+	else
+		/*
+		 * either jnode was dirty or page was dirtied through mmap. Page's dirty
+		 * bit was cleared before io was submitted. If page is left clean, we
+		 * would have dirty jnode with clean page. Neither ->writepage() nor
+		 * ->releasepage() can free it. Re-dirty page, so ->writepage() will be
+		 * called again if necessary.
+		 */
+		set_page_dirty_internal(page, 0);
+
+        assert("nikita-2766", atomic_read(&node->x_count) > 1);
+        /* release allocated disk block and in-memory structures  */
+        eflush_free(node);
+        JF_CLR(node, JNODE_EFLUSH);
+ out:
+        if (!page_locked)
+                unlock_page(page);
+}
+
+/* Bring eflushed @node back: jload() it (the assertion below expects this to
+ * clear JNODE_EFLUSH), wait for writeback on its page and release the load.
+ * Must be called where scheduling is allowed. Returns 0 if @node was not
+ * eflushed or got loaded, jload() error otherwise. */
+reiser4_internal int emergency_unflush(jnode *node)
+{
+	struct page *page;
+	int result;
+
+	assert("nikita-2778", node != NULL);
+	assert("nikita-3046", schedulable());
+
+	if (!JF_ISSET(node, JNODE_EFLUSH))
+		return 0;
+	result = jload(node);
+	if (result != 0)
+		return result;
+	assert("nikita-2777", !JF_ISSET(node, JNODE_EFLUSH));
+	page = jnode_page(node);
+	assert("nikita-2779", page != NULL);
+	wait_on_page_writeback(page);
+	jrelse(node);
+	return 0;
+}
+
+/* block allocator flags to eflush @node with: BA_FORMATTED for znodes, else 0 */
+static reiser4_ba_flags_t ef_block_flags(const jnode *node)
+{
+	return jnode_is_znode(node) ? BA_FORMATTED : 0;
+}
+/* Deallocate eflush block *@blk; for BLOCK_GRABBED @stage then move one grabbed block back where it came from. */
+static int ef_free_block(jnode *node,
+			 const reiser4_block_nr *blk,
+			 block_stage_t stage, eflush_node_t *ef)
+{
+	int result = 0;
+
+	/* We cannot just ask block allocator to return block into flush
+	 * reserved space, because there is no current atom at this point. */
+	result = reiser4_dealloc_block(blk, stage, ef_block_flags(node));
+	if (result == 0 && stage == BLOCK_GRABBED) {
+		txn_atom *atom;
+
+		if (ef->reserve) {
+			/* further, transfer block from grabbed into flush
+			 * reserved space. */
+			LOCK_JNODE(node);
+			atom = jnode_get_atom(node);
+			assert("nikita-2785", atom != NULL);
+			grabbed2flush_reserved_nolock(atom, 1);
+			UNLOCK_ATOM(atom);
+			JF_SET(node, JNODE_FLUSH_RESERVED);
+			UNLOCK_JNODE(node);
+		} else {
+			/* block was not taken from flush reserved space: grabbed -> free */
+			reiser4_context * ctx = get_current_context();
+			grabbed2free(ctx, get_super_private(ctx->super),
+				     (__u64)1);
+		}
+	}
+	return result;	/* whatever reiser4_dealloc_block() returned */
+}
+/* Reserve space and allocate *@efnode and disk block *@blk for eflushing @node; jnode and jload locks held on entry and exit. */
+static int
+ef_prepare(jnode *node, reiser4_block_nr *blk, eflush_node_t **efnode, reiser4_blocknr_hint * hint)
+{
+	int result;
+	int usedreserve;
+
+	assert("nikita-2760", node != NULL);
+	assert("nikita-2761", blk != NULL);
+	assert("nikita-2762", efnode != NULL);
+	assert("nikita-2763", spin_jnode_is_locked(node));
+	assert("nikita-3387", spin_jload_is_locked(node));
+
+	hint->blk         = EFLUSH_START_BLOCK;
+	hint->max_dist    = 0;
+	hint->level       = jnode_get_level(node);
+	usedreserve = 0;
+	if (blocknr_is_fake(jnode_get_block(node)))
+		hint->block_stage = BLOCK_UNALLOCATED;
+	else {
+		txn_atom *atom;
+		switch (jnode_is_leaf(node)) {
+		default:	/* any non-zero jnode_is_leaf() */
+			/* We cannot just ask block allocator to take block from
+			 * flush reserved space, because there is no current
+			 * atom at this point. */
+			atom = jnode_get_atom(node);
+			if (atom != NULL) {
+				if (JF_ISSET(node, JNODE_FLUSH_RESERVED)) {
+					usedreserve = 1;
+					flush_reserved2grabbed(atom, 1);
+					JF_CLR(node, JNODE_FLUSH_RESERVED);
+					UNLOCK_ATOM(atom);
+					break;
+				} else
+					UNLOCK_ATOM(atom);
+			}
+			/* fall through */
+			/* node->atom == NULL if page was dirtied through
+			 * mmap */
+		case 0:
+			result = reiser4_grab_space_force((__u64)1, BA_RESERVED);
+			grab_space_enable();
+			if (result) {
+				warning("nikita-3323",
+					"Cannot allocate eflush block");
+				return result;	/* both spin locks are still held here */
+			}
+		}
+
+		hint->block_stage = BLOCK_GRABBED;
+	}
+
+	/* XXX protect @node from being concurrently eflushed. Otherwise,
+	 * there is a danger of underflowing block space */
+	UNLOCK_JLOAD(node);
+	UNLOCK_JNODE(node);
+	/* locks are dropped around the allocations below, presumably because they may block */
+	*efnode = ef_alloc(GFP_NOFS | __GFP_HIGH);
+	if (*efnode == NULL) {
+		result = RETERR(-ENOMEM);
+		goto out;
+	}
+	/* NOTE(review): space grabbed above is not given back on either failure path here - TODO confirm */
+#if REISER4_DEBUG
+	(*efnode)->initial_stage = hint->block_stage;
+#endif
+	(*efnode)->reserve = usedreserve;
+
+	result = reiser4_alloc_block(hint, blk, ef_block_flags(node));
+	if (result)
+		kmem_cache_free(eflush_slab, *efnode);	/* *efnode is left dangling for the caller */
+ out:
+	LOCK_JNODE(node);
+	LOCK_JLOAD(node);
+	return result;
+}
+
+#endif /* REISER4_USE_EFLUSH */
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 80
+   LocalWords: " unflush eflushed LocalWords eflush writepage VM releasepage unflushing io "
+   End:
+*/
Index: linux-2.6.8.1-ck/fs/reiser4/emergency_flush.h
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/emergency_flush.h	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/emergency_flush.h	2004-08-22 19:35:33.635650466 +1000
@@ -0,0 +1,75 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Emergency flush */
+
+#ifndef __EMERGENCY_FLUSH_H__
+#define __EMERGENCY_FLUSH_H__
+
+#if REISER4_USE_EFLUSH
+
+#include "block_alloc.h"
+
+struct eflush_node;
+typedef struct eflush_node eflush_node_t;
+
+TYPE_SAFE_HASH_DECLARE(ef, eflush_node_t);
+/* record of one eflushed jnode, kept in the ef hash; one-bit flags are unsigned so that they can hold 1 */
+struct eflush_node {
+	jnode           *node;       /* the eflushed jnode, hash key */
+	reiser4_block_nr blocknr;    /* block the jnode was eflushed to */
+	ef_hash_link     linkage;    /* ef hash linkage */
+	struct list_head inode_link; /* for per inode list of eflush nodes */
+	struct list_head inode_anon_link;
+	unsigned int     hadatom :1; /* jnode had an atom when it was eflushed */
+	unsigned int     incatom :1; /* atom->flushed was incremented for it */
+	unsigned int     reserve :1; /* block was taken from flush reserved space */
+#if REISER4_DEBUG
+	block_stage_t    initial_stage; /* block stage chosen by ef_prepare() */
+#endif
+};
+
+int eflush_init(void);	/* create the eflush_node_t slab */
+int eflush_done(void);	/* destroy the eflush_node_t slab */
+
+extern int  eflush_init_at(struct super_block *super);	/* per super block ef hash */
+extern void eflush_done_at(struct super_block *super);
+
+extern reiser4_block_nr *eflush_get(const jnode *node);
+extern void eflush_del(jnode *node, int page_locked);
+
+extern int emergency_flush(struct page *page);	/* 0: paged out, 1: busy, else error */
+extern int emergency_unflush(jnode *node);
+
+/* eflushed jnodes are stored in reiser4_inode's radix tree. Eflushed jnodes may be either "captured" or
+ * "anonymous". Use existing tags to tag jnodes in reiser4_inode's tree of eflushed jnodes */
+#define EFLUSH_TAG_ANONYMOUS PAGECACHE_TAG_DIRTY
+#define EFLUSH_TAG_CAPTURED PAGECACHE_TAG_WRITEBACK
+
+#else /* REISER4_USE_EFLUSH */
+/* eflush compiled out: no-op stand-ins; emergency_flush() always answers 1, the "busy" code */
+#define eflush_init()  (0)
+#define eflush_done()  (0)
+
+#define eflush_init_at(super) (0)
+#define eflush_done_at(super) (0)
+
+#define eflush_get(node)  NULL
+#define eflush_del(node, flag) do{}while(0)
+
+#define emergency_unflush(node) (0)
+#define emergency_flush(page) (1)
+
+#endif  /* REISER4_USE_EFLUSH */
+
+/* __EMERGENCY_FLUSH_H__ */
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
Index: linux-2.6.8.1-ck/fs/reiser4/entd.c
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/entd.c	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/entd.c	2004-08-22 19:35:33.636650307 +1000
@@ -0,0 +1,377 @@
+/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Ent daemon. */
+
+#include "debug.h"
+#include "kcond.h"
+#include "txnmgr.h"
+#include "tree.h"
+#include "entd.h"
+#include "super.h"
+#include "context.h"
+#include "reiser4.h"
+#include "vfs_ops.h"
+#include "page_cache.h"
+
+#include <linux/sched.h>	/* struct task_struct */
+#include <linux/suspend.h>
+#include <linux/kernel.h>
+#include <linux/writeback.h>
+#include <linux/time.h>         /* INITIAL_JIFFIES */
+#include <linux/backing-dev.h>  /* bdi_write_congested */
+
+TYPE_SAFE_LIST_DEFINE(wbq, struct wbq, link);
+
+#define DEF_PRIORITY 12
+#define MAX_ENTD_ITERS 10
+#define ENTD_ASYNC_REQUESTS_LIMIT 32
+
+static void entd_flush(struct super_block *super);
+static int entd(void *arg);
+
+/* set ->comm field of ent thread to make its state visible to the user level;
+ * relies on a variable named super being in scope at the point of use
+ */
+#define entd_set_comm(state)					\
+	snprintf(current->comm, sizeof(current->comm),	\
+	         "ent:%s%s", super->s_id, (state))
+
+/* get ent context for the @super: the entd member embedded in the private
+ * data that get_super_private() returns */
+static inline entd_context *get_entd_context(struct super_block *super)
+{
+	return &get_super_private(super)->entd;
+}
+
+/* initialize ent thread context, start the thread and wait until it is up */
+reiser4_internal void
+init_entd_context(struct super_block *super)
+{
+	entd_context * ctx;
+
+	assert("nikita-3104", super != NULL);
+
+	ctx = get_entd_context(super);
+
+	xmemset(ctx, 0, sizeof *ctx);
+	kcond_init(&ctx->startup);
+	kcond_init(&ctx->wait);
+	init_completion(&ctx->finish);
+	spin_lock_init(&ctx->guard);
+	/* request lists must be valid before the thread using them exists */
+#if REISER4_DEBUG
+	flushers_list_init(&ctx->flushers_list);
+#endif
+	wbq_list_init(&ctx->wbq_list);
+
+	/* start ent thread.. NOTE(review): failure of kernel_thread() is ignored */
+	kernel_thread(entd, super, CLONE_VM | CLONE_FS | CLONE_FILES);
+	spin_lock(&ctx->guard);
+	/* and wait for its initialization to finish */
+	while (ctx->tsk == NULL)
+		kcond_wait(&ctx->startup, &ctx->guard, 0);
+	spin_unlock(&ctx->guard);
+}
+/* Dequeue @rq and wake its waiter; every caller in this file holds ent->guard. */
+static void wakeup_wbq (entd_context * ent, struct wbq * rq)
+{
+	wbq_list_remove(rq);
+	ent->nr_synchronous_requests --;
+	rq->wbc->nr_to_write --;
+	up(&rq->sem);	/* releases the down() in write_page_by_ent() */
+}
+
+/* wake every requester still queued on @ent; ent->guard is held for the walk */
+static void wakeup_all_wbq (entd_context * ent)
+{
+	spin_lock(&ent->guard);
+	for (;;) {
+		if (wbq_list_empty(&ent->wbq_list))
+			break;
+		wakeup_wbq(ent, wbq_list_front(&ent->wbq_list));
+	}
+	spin_unlock(&ent->guard);
+}
+
+/* ent thread function: serves write-back requests through entd_flush() until ->done is set; @arg is the super block */
+static int
+entd(void *arg)
+{
+	struct super_block *super;
+	struct task_struct *me;
+	entd_context       *ent;
+
+	assert("vs-1655", list_empty(&current->private_pages));
+
+	super = arg;
+	/* standard kernel thread prologue */
+	me = current;
+	/* reparent_to_init() is done by daemonize() */
+	daemonize("ent:%s", super->s_id);
+
+	/* block all signals */
+	spin_lock_irq(&me->sighand->siglock);
+	siginitsetinv(&me->blocked, 0);
+	recalc_sigpending();
+	spin_unlock_irq(&me->sighand->siglock);
+
+	/* do_fork() just copies task_struct into the new
+	   thread. ->fs_context shouldn't be copied of course. This shouldn't
+	   be a problem for the rest of the code though.
+	*/
+	me->journal_info = NULL;
+
+	ent = get_entd_context(super);
+
+	spin_lock(&ent->guard);
+	ent->tsk = me;
+	/* signal waiters that initialization is completed */
+	kcond_broadcast(&ent->startup);
+	spin_unlock(&ent->guard);
+	while (1) {
+		int result = 0;
+
+		if (me->flags & PF_FREEZE)
+			refrigerator(PF_FREEZE);
+
+		spin_lock(&ent->guard);
+		/* serve pending requests; guard is dropped around each entd_flush() */
+		while (ent->nr_all_requests != 0) {
+			assert("zam-1043", ent->nr_all_requests >= ent->nr_synchronous_requests);
+			if (ent->nr_synchronous_requests != 0) {
+				struct wbq * rq = wbq_list_front(&ent->wbq_list);
+				/* give up on a waiter after MAX_ENTD_ITERS passes and wake it */
+				if (++ rq->nr_entd_iters > MAX_ENTD_ITERS) {
+					ent->nr_all_requests --;
+					wakeup_wbq(ent, rq);
+					continue;
+				}
+			} else {
+				/* endless loop avoidance. */
+				ent->nr_all_requests --;
+			}
+
+			spin_unlock(&ent->guard);
+			entd_set_comm("!");
+			entd_flush(super);
+			spin_lock(&ent->guard);
+		}
+
+		entd_set_comm(".");
+
+		/* wait for work */
+		result = kcond_wait(&ent->wait, &ent->guard, 1);
+		if (result != -EINTR && result != 0)
+			/* some other error */
+			warning("nikita-3099", "Error: %i", result);
+
+		/* we are asked to exit */
+		if (ent->done) {
+			spin_unlock(&ent->guard);
+			break;
+		}
+
+		spin_unlock(&ent->guard);
+	}
+	wakeup_all_wbq(ent);	/* do not leave requesters sleeping on rq.sem */
+	complete_and_exit(&ent->finish, 0);
+	/* not reached. */
+	return 0;
+}
+
+/* called by umount: set ->done, wake the ent thread and wait for it to exit */
+reiser4_internal void
+done_entd_context(struct super_block *super)
+{
+	entd_context * ent;
+
+	assert("nikita-3103", super != NULL);
+
+	ent = get_entd_context(super);
+
+	spin_lock(&ent->guard);
+	ent->done = 1;
+	kcond_signal(&ent->wait);
+	spin_unlock(&ent->guard);
+
+	/* wait until daemon finishes */
+	wait_for_completion(&ent->finish);
+}
+
+/* called at the beginning of jnode_flush to register flusher thread with ent
+ * daemon: bumps ent->flushers under ent->guard */
+reiser4_internal void enter_flush (struct super_block * super)
+{
+	entd_context * ent;
+
+	assert ("zam-1029", super != NULL);
+	ent = get_entd_context(super);
+
+	assert ("zam-1030", ent != NULL);
+
+	spin_lock(&ent->guard);
+	ent->flushers ++;
+#if REISER4_DEBUG
+	flushers_list_push_front(&ent->flushers_list, get_current_context());
+#endif
+	spin_unlock(&ent->guard);
+}
+
+/* called at the end of jnode_flush; the last flusher to leave wakes ent thread if synchronous requests wait */
+reiser4_internal void leave_flush (struct super_block * super)
+{
+	entd_context * ent;
+
+	assert ("zam-1027", super != NULL);
+	ent = get_entd_context(super);
+
+	assert ("zam-1028", ent != NULL);
+
+	spin_lock(&ent->guard);
+	ent->flushers --;
+	if (ent->flushers == 0 && ent->nr_synchronous_requests != 0)
+		kcond_signal(&ent->wait);
+#if REISER4_DEBUG
+	flushers_list_remove_clean(get_current_context());
+#endif
+	spin_unlock(&ent->guard);
+}
+
+/* signal to ent thread that it has more work to do (its caller here holds ent->guard) */
+static void kick_entd(entd_context * ent)
+{
+	kcond_signal(&ent->wait);
+}
+/* run capture_reiser4_inodes() for @super with the global inode_lock held */
+static void entd_capture_anonymous_pages(
+	struct super_block * super, struct writeback_control * wbc)
+{
+	spin_lock(&inode_lock);
+	capture_reiser4_inodes(super, wbc);
+	spin_unlock(&inode_lock);
+}
+/* one flushing pass of the ent thread, in its own reiser4 context: capture anonymous pages, then flush_some_atom() */
+static void entd_flush(struct super_block *super)
+{
+	long            nr_submitted = 0;
+	int             result;
+	reiser4_context ctx;
+	struct writeback_control wbc = {
+		.bdi		= NULL,
+		.sync_mode	= WB_SYNC_NONE,
+		.older_than_this = NULL,
+		.nr_to_write	= 32,
+		.nonblocking	= 0,
+	};
+
+	init_context(&ctx, super);
+
+	ctx.entd = 1;	/* marks this context as belonging to the ent thread */
+
+	entd_capture_anonymous_pages(super, &wbc);
+	result = flush_some_atom(&nr_submitted, &wbc, 0);
+	if (result != 0)
+		warning("nikita-3100", "Flush failed: %i", result);
+
+	context_set_commit_async(&ctx);
+	reiser4_exit_context(&ctx);
+}
+/* Hand @page to the ent thread: re-dirty and unlock it, count a request and, past the async limit, sleep until served. */
+void write_page_by_ent (struct page * page, struct writeback_control * wbc)
+{
+	struct super_block * sb;
+	entd_context * ent;
+	struct wbq rq;
+	int phantom;
+
+	sb = page->mapping->host->i_sb;
+	ent = get_entd_context(sb);
+
+	phantom = jprivate(page) == NULL || !jnode_check_dirty(jprivate(page));
+	/* re-dirty page */
+	set_page_dirty_internal(page, phantom);
+	/* unlock it to avoid deadlocks with the thread which will do actual i/o  */
+	unlock_page(page);
+
+	/* entd is not running. (ent is the address of an embedded member, so only ->done can fire) */
+	if (ent == NULL || ent->done)
+		return;
+
+	/* init wbq */
+	wbq_list_clean(&rq);
+	rq.nr_entd_iters = 0;
+	rq.page = page;
+	rq.wbc = wbc;
+
+	spin_lock(&ent->guard);
+	if (ent->flushers == 0)
+		kick_entd(ent);
+	ent->nr_all_requests ++;
+	if (ent->nr_all_requests <= ent->nr_synchronous_requests + ENTD_ASYNC_REQUESTS_LIMIT) {
+		spin_unlock(&ent->guard);	/* asynchronous request: do not wait */
+		return;
+	}
+	sema_init(&rq.sem, 0);
+	wbq_list_push_back(&ent->wbq_list, &rq);
+	ent->nr_synchronous_requests ++;
+	spin_unlock(&ent->guard);
+	down(&rq.sem);	/* sleep until wakeup_wbq() does up() on this request */
+
+	/* don't release rq until wakeup_wbq stops using it. */
+	spin_lock(&ent->guard);
+	spin_unlock(&ent->guard);
+	/* wbq was dequeued by a thread other than the current one (e.g. the ent thread). */
+}
+
+/* ent should be locked. Returns the first queued request; on an empty list drops ent->guard and returns NULL */
+static struct wbq * get_wbq (entd_context * ent)
+{
+	if (wbq_list_empty(&ent->wbq_list)) {
+		spin_unlock(&ent->guard);
+		return NULL;
+	}
+	return wbq_list_front(&ent->wbq_list);
+}
+
+/* Note that @page of @sb is being written: mark it for reclaim, retire one pending request and wake its waiter if any. */
+void ent_writes_page (struct super_block * sb, struct page * page)
+{
+	entd_context * ent = get_entd_context(sb);
+	struct wbq * rq;
+
+	assert("zam-1041", ent != NULL);
+	/* unlocked fast path: active page or no requests pending */
+	if (PageActive(page) || ent->nr_all_requests == 0)
+		return;
+
+	SetPageReclaim(page);
+
+	spin_lock(&ent->guard);
+	if (ent->nr_all_requests > 0) {	/* re-checked now that the guard is held */
+		ent->nr_all_requests --;
+		rq = get_wbq(ent);
+		if (rq == NULL)
+			/* get_wbq() releases entd->guard spinlock if NULL is
+			 * returned. */
+			return;
+		wakeup_wbq(ent, rq);
+	}
+	spin_unlock(&ent->guard);
+}
+/* number of write-back requests pending for the current super block; read without ent->guard */
+int wbq_available (void) {
+	struct super_block * sb = reiser4_get_current_sb();
+	entd_context * ent = get_entd_context(sb);
+	return ent->nr_all_requests;
+}
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 80
+   End:
+*/
Index: linux-2.6.8.1-ck/fs/reiser4/entd.h
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/entd.h	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/entd.h	2004-08-22 19:35:33.636650307 +1000
@@ -0,0 +1,83 @@
+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Ent daemon. */
+
+#ifndef __ENTD_H__
+#define __ENTD_H__
+
+#include "kcond.h"
+#include "context.h"
+
+#include <linux/fs.h>
+#include <linux/completion.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>	/* for struct task_struct */
+#include "type_safe_list.h"
+
+TYPE_SAFE_LIST_DECLARE(wbq);
+
+/* write-back request; write_page_by_ent() keeps it on its stack while it sleeps on ->sem. */
+struct wbq {
+	wbq_list_link link;	/* linkage into entd_context.wbq_list */
+	struct writeback_control * wbc;	/* ->nr_to_write is decremented on wake-up */
+	struct page * page;	/* page the request was made for */
+	struct semaphore sem;	/* requester sleeps here until wakeup_wbq() */
+	int    nr_entd_iters;	/* ent thread passes seen; capped by MAX_ENTD_ITERS */
+};
+
+/* ent-thread context. This is used to synchronize starting/stopping ent
+ * threads and to account write-back requests made to them. */
+typedef struct entd_context {
+	/*
+	 * condition variable that is signaled by ent thread after it
+	 * successfully started up.
+	 */
+	kcond_t             startup;
+	/*
+	 * completion that is signaled by ent thread just before it
+	 * terminates.
+	 */
+	struct completion   finish;
+	/*
+	 * condition variable that ent thread waits on for more work. It's
+	 * signaled by write_page_by_ent().
+	 */
+	kcond_t             wait;
+	/* spinlock protecting other fields */
+	spinlock_t          guard;
+	/* ent thread */
+	struct task_struct *tsk;
+	/* set to indicate that ent thread should leave. */
+	int                 done;
+	/* counter of active flushers */
+	int                 flushers;
+#if REISER4_DEBUG
+	/* list of all active flushers */
+	flushers_list_head  flushers_list;
+#endif
+	int                 nr_all_requests;         /* requests not yet served */
+	int                 nr_synchronous_requests; /* those with a waiter on wbq_list */
+	wbq_list_head       wbq_list;                /* struct wbq of sleeping requesters */
+} entd_context;
+
+extern void init_entd_context(struct super_block *super);
+extern void done_entd_context(struct super_block *super);
+
+extern void enter_flush(struct super_block *super);
+extern void leave_flush(struct super_block *super);
+
+extern void write_page_by_ent(struct page *, struct writeback_control *);
+extern int  wbq_available (void);
+extern void ent_writes_page (struct super_block *, struct page *);
+/* __ENTD_H__ */
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
Index: linux-2.6.8.1-ck/fs/reiser4/eottl.c
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/eottl.c	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/eottl.c	2004-08-22 19:35:33.637650147 +1000
@@ -0,0 +1,372 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "forward.h"
+#include "debug.h"
+#include "key.h"
+#include "coord.h"
+#include "plugin/item/item.h"
+#include "plugin/node/node.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree_walk.h"
+#include "tree_mod.h"
+#include "carry.h"
+#include "tree.h"
+#include "super.h"
+
+#include <linux/types.h>	/* for __u??  */
+
+/* Extents on the twig level (EOTTL) handling.
+
+   EOTTL poses some problems to the tree traversal, that are better
+   explained by example.
+
+   Suppose we have block B1 on the twig level with the following items:
+
+   0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id, offset)
+   1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each
+   2. internal item I2 with key (10:0:0:0)
+
+   We are trying to insert item with key (5:0:0:0). Lookup finds node
+   B1, and then intra-node lookup is done. This lookup finished on the
+   E1, because the key we are looking for is larger than the key of E1
+   and is smaller than the key of I2.
+
+   Here search is stuck.
+
+   After some thought it is clear what is wrong here: extents on the
+   twig level break some basic property of the *search* tree (on the
+   pretext, that they restore property of balanced tree).
+
+   Said property is the following: if in the internal node of the search
+   tree we have [ ... Key1 Pointer Key2 ... ] then, all data that are or
+   will be keyed in the tree with the Key such that Key1 <= Key < Key2
+   are accessible through the Pointer.
+
+   This is not true, when Pointer is Extent-Pointer, simply because
+   extent cannot expand indefinitely to the right to include any item
+   with
+
+     Key1 <= Key < Key2.
+
+   For example, our E1 extent is only responsible for the data with keys
+
+     (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and
+
+   so, key range
+
+     ( (1:4:100:0xffffffffffffffff), (10:0:0:0) )
+
+   is orphaned: there is no way to get there from the tree root.
+
+   In other words, extent pointers are different than normal child
+   pointers as far as search tree is concerned, and this creates such
+   problems.
+
+   Possible solution for this problem is to insert our item into node
+   pointed to by I2. There are some problems though:
+
+   (1) I2 can be in a different node.
+   (2) E1 can be immediately followed by another extent E2.
+
+   (1) is solved by calling reiser4_get_right_neighbor() and accounting
+   for locks/coords as necessary.
+
+   (2) is more complex. Solution here is to insert new empty leaf node
+   and insert internal item between E1 and E2 pointing to said leaf
+   node. This is further complicated by possibility that E2 is in a
+   different node, etc.
+
+   Problems:
+
+   (1) if there was internal item I2 immediately on the right of an
+   extent E1 and we decided to insert new item S1 into node N2
+   pointed to by I2, then key of S1 will be less than smallest key in
+   the N2. Normally, search key checks that key we are looking for is in
+   the range of keys covered by the node key is being looked in. To work
+   around this situation, while preserving useful consistency check,
+   new flag CBK_TRUST_DK was added to the cbk flags bitmask. This flag
+   is automatically set on entrance to the coord_by_key() and is only
+   cleared when we are about to enter situation described above.
+
+   (2) If extent E1 is immediately followed by another extent E2 and we
+   are searching for the key that is between E1 and E2 we only have to
+   insert new empty leaf node when coord_by_key was called for
+   insertion, rather than just for lookup. To distinguish these cases,
+   new flag CBK_FOR_INSERT was added to the cbk flags bitmask. This flag
+   is automatically set by coord_by_key calls performed by
+   insert_by_key() and friends.
+
+   (3) Insertion of new empty leaf node (possibly) requires
+   balancing. In any case it requires modification of node content which
+   is only possible under write lock. It may well happen that we only
+   have read lock on the node where new internal pointer is to be
+   inserted (common case: lookup of non-existent stat-data that falls
+   between two extents). If only read lock is held, tree traversal is
+   restarted with lock_level modified so that next time we hit this
+   problem, write lock will be held. Once we have write lock, balancing
+   will be performed.
+
+
+
+
+
+
+*/
+
+/* Look at the item immediately to the right of @coord within the same node.
+   If there is one and it is of internal type, @coord is moved onto it and 1
+   is returned. Otherwise @coord is left as it was and 0 is returned; in
+   particular, the right neighbor node is never visited or locked here.
+*/
+/* Audited by: green(2002.06.15) */
+static int
+is_next_item_internal(coord_t * coord)
+{
+	coord_t next;
+
+	if (coord->item_pos == node_num_items(coord->node) - 1)
+		return 0;
+
+	coord_dup(&next, coord);
+	check_me("vs-742", coord_next_item(&next) == 0);
+	if (!item_is_internal(&next))
+		return 0;
+
+	coord_dup(coord, &next);
+	return 1;
+}
+
+/* When an empty leaf is inserted after (or between) items of not internal
+   type, find the right delimiting key the new znode has to be inserted with */
+static reiser4_key *
+rd_key(coord_t * coord, reiser4_key * key)
+{
+	coord_t right;
+
+	assert("nikita-2281", coord_is_between_items(coord));
+	coord_dup(&right, coord);
+
+	RLOCK_DK(current_tree);
+
+	if (coord_set_to_right(&right) != 0)
+		/* nothing to the right: use right delimiting key of parent znode */
+		*key = *znode_get_rd_key(coord->node);
+	else
+		/* take the key from what is to the right of @coord */
+		unit_key_by_coord(&right, key);
+
+	RUNLOCK_DK(current_tree);
+	return key;
+}
+
+
+ON_DEBUG(void check_dkeys(const znode *);)
+
+/* Insert an empty leaf, and a twig-level pointer to it, at @insert_coord: used when tree lookup cannot go further
+   down because it stopped between items of not internal type. Once the new leaf is locked, @lh is moved onto it. */
+static int
+add_empty_leaf(coord_t * insert_coord, lock_handle * lh, const reiser4_key * key, const reiser4_key * rdkey)
+{
+	int result;
+	carry_pool pool;
+	carry_level todo;
+	carry_op *op;
+	znode *node;
+	reiser4_item_data item;
+	carry_insert_data cdata;
+	reiser4_tree *tree;
+
+	assert("vs-49827", znode_contains_key_lock(insert_coord->node, key));
+
+	tree = znode_get_tree(insert_coord->node);
+	/* allocate the leaf before the carry pool is set up, so that the early return below leaves no pool behind */
+	node = new_node(insert_coord->node, LEAF_LEVEL);
+	if (IS_ERR(node))
+		return PTR_ERR(node);
+
+	/* setup delimiting keys for node being inserted */
+	WLOCK_DK(tree);
+	znode_set_ld_key(node, key);
+	znode_set_rd_key(node, rdkey);
+	ON_DEBUG(node->creator = current);
+	ON_DEBUG(node->first_key = *key);
+	WUNLOCK_DK(tree);
+
+	ZF_SET(node, JNODE_ORPHAN);
+	init_carry_pool(&pool);
+	init_carry_level(&todo, &pool);
+	ON_STATS(todo.level_no = TWIG_LEVEL);
+	op = post_carry(&todo, COP_INSERT, insert_coord->node, 0);
+	if (!IS_ERR(op)) {
+		cdata.coord = insert_coord;
+		cdata.key = key;
+		cdata.data = &item;
+		op->u.insert.d = &cdata;
+		op->u.insert.type = COPT_ITEM_DATA;
+		build_child_ptr_data(node, &item);
+		item.arg = NULL;
+		/* have @insert_coord to be set at inserted item after
+		   insertion is done */
+		todo.track_type = CARRY_TRACK_CHANGE;
+		todo.tracked = lh;
+
+		result = carry(&todo, 0);
+		if (result == 0) {
+			/*
+			 * pin node in memory. This is necessary for
+			 * znode_make_dirty() below.
+			 */
+			result = zload(node);
+			if (result == 0) {
+				lock_handle local_lh;
+
+				/*
+				 * if we inserted new child into tree we have
+				 * to mark it dirty so that flush will be able
+				 * to process it.
+				 */
+				init_lh(&local_lh);
+				result = longterm_lock_znode(&local_lh, node,
+							     ZNODE_WRITE_LOCK,
+							     ZNODE_LOCK_LOPRI);
+				if (result == 0) {
+					znode_make_dirty(node);
+
+					/* when internal item pointing to @node
+					   was inserted into twig node
+					   create_hook_internal did not connect
+					   it properly because its right
+					   neighbor was not known. Do it
+					   here */
+					WLOCK_TREE(tree);
+					assert("nikita-3312", znode_is_right_connected(node));
+					assert("nikita-2984", node->right == NULL);
+					ZF_CLR(node, JNODE_RIGHT_CONNECTED);
+					WUNLOCK_TREE(tree);
+					result = connect_znode(insert_coord, node);
+					if (result == 0)
+						ON_DEBUG(check_dkeys(node));
+
+					done_lh(lh);
+					move_lh(lh, &local_lh);
+					assert("vs-1676", node_is_empty(node));
+					coord_init_first_unit(insert_coord, node);
+				} else {
+					warning("nikita-3136",
+						"Cannot lock child");
+					print_znode("child", node);
+				}
+				done_lh(&local_lh);
+				zrelse(node);
+			}
+		}
+	} else
+		result = PTR_ERR(op);
+	zput(node);
+	done_carry_pool(&pool);
+	return result;
+}
+
+/* handle extent-on-the-twig-level cases in tree traversal: returns 0 to keep descending, 1 when *@outcome was set */
+reiser4_internal int
+handle_eottl(cbk_handle * h /* cbk handle */ ,
+	     int *outcome /* how traversal should proceed */ )
+{
+	int result;
+	reiser4_key key;
+	coord_t *coord;
+
+	coord = h->coord;
+
+	if (h->level != TWIG_LEVEL || (coord_is_existing_item(coord) && item_is_internal(coord))) {
+		/* Continue to traverse tree downward. */
+		return 0;
+	}
+	/* we are on the twig level and @coord is not at an internal item: the
+	   asserts below expect coord_set_to_left() to land on an extent item */
+	assert("vs-356", h->level == TWIG_LEVEL);
+	assert("vs-357", ( {
+			  coord_t lcoord;
+			  coord_dup(&lcoord, coord);
+			  check_me("vs-733", coord_set_to_left(&lcoord) == 0);
+			  item_is_extent(&lcoord);}
+	       ));
+
+	if (*outcome == NS_FOUND) {
+		/* we have found desired key on twig level in extent item */
+		h->result = CBK_COORD_FOUND;
+		reiser4_stat_inc(tree.cbk_found);
+		*outcome = LOOKUP_DONE;
+		return 1;
+	}
+
+	if (!(h->flags & CBK_FOR_INSERT)) {
+		/* tree traversal is not for insertion. Just return
+		   CBK_COORD_NOTFOUND. */
+		h->result = CBK_COORD_NOTFOUND;
+		*outcome = LOOKUP_DONE;
+		return 1;
+	}
+
+	/* take a look at the item to the right of h -> coord */
+	result = is_next_item_internal(coord);
+	if (result == 0) {
+		/* there is no internal item to the right of h -> coord in this
+		   node (e.g., next item is also an extent). Allocate a new node
+		   and insert pointer to it after item h -> coord.
+
+		   This is a result of extents being located at the twig
+		   level. For explanation, see the EOTTL comment at the
+		   top of this file.
+		*/
+		if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) {
+			/* we got node read locked, restart coord_by_key to
+			   have write lock on twig level */
+			h->lock_level = TWIG_LEVEL;
+			h->lock_mode = ZNODE_WRITE_LOCK;
+			*outcome = LOOKUP_REST;
+			return 1;
+		}
+
+		result = add_empty_leaf(coord, h->active_lh, h->key, rd_key(coord, &key));
+		if (result) {
+			h->error = "could not add empty leaf";
+			h->result = result;
+			*outcome = LOOKUP_DONE;
+			return 1;
+		}
+		/* added empty leaf is locked, its parent node is unlocked,
+		   coord is set as EMPTY */
+		*outcome = LOOKUP_DONE;
+		h->result = CBK_COORD_NOTFOUND;
+		return 1;
+		/*assert("vs-358", keyeq(h->key, item_key_by_coord(coord, &key)));*/
+	} else {
+		/* this is special case mentioned in the comment on
+		   tree.h:cbk_flags. We have found internal item immediately
+		   on the right of extent, and we are going to insert new item
+		   there. Key of item we are going to insert is smaller than
+		   leftmost key in the node pointed to by said internal item
+		   (otherwise search wouldn't come to the extent in the first
+		   place).
+
+		   This is a result of extents being located at the twig
+		   level. For explanation, see the EOTTL comment at the
+		   top of this file.
+		*/
+		h->flags &= ~CBK_TRUST_DK;
+	}
+	assert("vs-362", item_is_internal(coord));
+	return 0;
+}
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
Index: linux-2.6.8.1-ck/fs/reiser4/estimate.c
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/estimate.c	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/estimate.c	2004-08-22 19:35:33.637650147 +1000
@@ -0,0 +1,107 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "debug.h"
+#include "dformat.h"
+#include "tree.h"
+#include "carry.h"
+#include "inode.h"
+#include "cluster.h"
+#include "plugin/item/ctail.h"
+
+/* this returns how many nodes might get dirty and added nodes if @children nodes are dirtied
+
+   Amount of internals which will get dirty or get allocated we estimate as ~10% (103/1024) of the children + 1
+   balancing. 1 balancing is 2 neighbours, 2 new blocks and the current block on the leaf level, 2 neighbour nodes +
+   the current (or 1 neighbour and 1 new and the current) on twig level, 2 neighbour nodes on upper levels and 1 for a
+   new root. So 5 for leaf level, 3 for twig level, 2 on upper + 1 for root.
+
+   Do not calculate the current node of the lowest level here - this is overhead only.
+
+   children is almost always 1 here. Exception is flow insertion
+*/
+static reiser4_block_nr
+max_balance_overhead(reiser4_block_nr children, tree_level tree_height)
+{
+	/* 103 / 1024 is about one tenth of @children */
+	const reiser4_block_nr tenth = (103 * children) >> 10;
+	/* If there are too many balancings at a time, tree height can rise by more than 1, so heights below 5 are
+	   counted as 5. Assume that if tree_height is 5, it can rise by 1 only. */
+	if (tree_height < 5)
+		return 5 * 2 + (4 + tenth);
+	return tree_height * 2 + (4 + tenth);
+}
+
+/* Upper bound on the number of nodes which can be modified, plus the number of new nodes which can be required, when
+   one item is inserted into a tree of the given @height */
+/* it is only called when tree height changes, or gets initialized */
+reiser4_internal reiser4_block_nr
+calc_estimate_one_insert(tree_level height)
+{
+	return max_balance_overhead(1, height) + 1;
+}
+
+reiser4_internal reiser4_block_nr
+estimate_internal_amount(reiser4_block_nr children, tree_level tree_height)
+{
+	return max_balance_overhead(children, tree_height);	/* thin wrapper: forwards both arguments unchanged */
+}
+
+reiser4_internal reiser4_block_nr
+estimate_one_insert_item(reiser4_tree *tree)
+{
+	return tree->estimate_one_insert;	/* NOTE(review): presumably cached from calc_estimate_one_insert() - confirm */
+}
+
+/* this returns maximal possible number of nodes which can be modified plus number of new nodes which can be required
+   to perform insertion of one unit into an item in the tree; it reads the per-tree ->estimate_one_insert value */
+reiser4_internal reiser4_block_nr
+estimate_one_insert_into_item(reiser4_tree *tree)
+{
+	/* estimate insert into item just like item insertion */
+	return tree->estimate_one_insert;
+}
+
+reiser4_internal reiser4_block_nr
+estimate_one_item_removal(reiser4_tree *tree)
+{
+	/* on item removal reiser4 does not try to pack nodes more compactly, so only one node may be dirtied on leaf
+	   level. NOTE(review): the insertion estimate is returned nevertheless - confirm this is intended */
+	return tree->estimate_one_insert;
+}
+
+/* On leaf level insert_flow may add CARRY_FLOW_NEW_NODES_LIMIT new nodes and dirty 3 existing ones (insert point and
+   both its neighbors); max_balance_overhead() estimates blocks which may change/get added on internal levels */
+reiser4_internal reiser4_block_nr
+estimate_insert_flow(tree_level height)
+{
+	const reiser4_block_nr on_leaf_level = 3 + CARRY_FLOW_NEW_NODES_LIMIT;
+	return on_leaf_level + max_balance_overhead(on_leaf_level, height);
+}
+
+/* returns max number of nodes that can be occupied by a disk cluster: its cluster pages plus 2 */
+reiser4_internal reiser4_block_nr
+estimate_disk_cluster(struct inode * inode)
+{
+	return inode_cluster_pages(inode) + 2;
+}
+
+/* how many nodes might get dirty or be added during insertion of a disk cluster; an @unprepped one counts as 1 page */
+reiser4_internal reiser4_block_nr
+estimate_insert_cluster(struct inode * inode, int unprepped)
+{
+	int nodes = (unprepped ? 1 : inode_cluster_pages(inode));
+
+	nodes += 3;
+	return nodes + max_balance_overhead(nodes, REISER4_MAX_ZTREE_HEIGHT);
+}
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
Index: linux-2.6.8.1-ck/fs/reiser4/file_ops.c
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/file_ops.c	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/file_ops.c	2004-08-22 19:35:33.638649988 +1000
@@ -0,0 +1,458 @@
+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/*
+ * Interface to VFS. Reiser4 file_operations are defined here.
+ *
+ * This file contains definitions of functions that are installed into ->i_fop
+ * field of reiser4 inodes.
+ *
+ * For the most part these functions simply find object plugin of inode
+ * involved, and call appropriate plugin method to do the actual work.
+ */
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "coord.h"
+#include "plugin/item/item.h"
+#include "plugin/file/file.h"
+#include "plugin/security/perm.h"
+#include "plugin/disk_format/disk_format.h"
+#include "plugin/plugin.h"
+#include "plugin/plugin_set.h"
+#include "plugin/object.h"
+#include "txnmgr.h"
+#include "jnode.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree.h"
+#include "log.h"
+#include "vfs_ops.h"
+#include "inode.h"
+#include "page_cache.h"
+#include "ktxnmgrd.h"
+#include "super.h"
+#include "reiser4.h"
+#include "kattr.h"
+#include "entd.h"
+#include "emergency_flush.h"
+
+#include <linux/profile.h>
+#include <linux/types.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/mm.h>
+#include <linux/buffer_head.h>
+#include <linux/dcache.h>
+#include <linux/list.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/writeback.h>
+#include <linux/mpage.h>
+#include <linux/backing-dev.h>
+#include <linux/quotaops.h>
+#include <linux/security.h>
+
+
+/* file operations */
+
+static loff_t reiser4_llseek(struct file *, loff_t, int);
+static ssize_t reiser4_read(struct file *, char *, size_t, loff_t *);
+static ssize_t reiser4_write(struct file *, const char *, size_t, loff_t *);
+static int reiser4_readdir(struct file *, void *, filldir_t);
+static int reiser4_ioctl(struct inode *, struct file *, unsigned int cmd, unsigned long arg);
+static int reiser4_mmap(struct file *, struct vm_area_struct *);
+static int reiser4_release(struct inode *, struct file *);
+static int reiser4_fsync(struct file *, struct dentry *, int datasync);
+static int reiser4_open(struct inode *, struct file *);
+static ssize_t reiser4_sendfile(struct file *, loff_t *, size_t, read_actor_t, void __user *);
+
+#if 0
+static unsigned int reiser4_poll(struct file *, struct poll_table_struct *);
+static int reiser4_flush(struct file *);
+static int reiser4_fasync(int, struct file *, int);
+static int reiser4_lock(struct file *, int, struct file_lock *);
+static ssize_t reiser4_readv(struct file *, const struct iovec *, unsigned long, loff_t *);
+static ssize_t reiser4_writev(struct file *, const struct iovec *, unsigned long, loff_t *);
+static ssize_t reiser4_sendpage(struct file *, struct page *, int, size_t, loff_t *, int);
+static unsigned long reiser4_get_unmapped_area(struct file *, unsigned long,
+					       unsigned long, unsigned long, unsigned long);
+#endif
+
+/*
+ * ->llseek() file operation for reiser4. Delegates to ->seek() method of
+ * object plugin, or to generic_file_llseek() when the plugin has none.
+ */
+static loff_t
+reiser4_llseek(struct file *file, loff_t off, int origin)
+{
+	loff_t pos;
+	file_plugin *plug;
+	struct inode *inode = file->f_dentry->d_inode;
+	reiser4_context ctx;
+
+	init_context(&ctx, inode->i_sb);
+	reiser4_stat_inc(vfs_calls.llseek);
+	ON_TRACE(TRACE_VFS_OPS,
+		 "llseek: (i_ino %li, size %lld): off %lli, origin %d\n", inode->i_ino, inode->i_size, off, origin);
+
+	plug = inode_file_plugin(inode);
+	assert("nikita-2291", plug != NULL);
+	if (plug->seek != NULL)
+		pos = plug->seek(file, off, origin);
+	else
+		pos = generic_file_llseek(file, off, origin);
+	reiser4_exit_context(&ctx);
+	return pos;
+}
+
+/* reiser4_readdir() - our readdir() method.
+
+   The readdir(2)/getdents(2) interface implicitly assumes that readdir
+   can be restarted from any point given only an off_t worth of data:
+   the file system fills the ->d_off field of struct dirent, and the
+   user later hands ->d_off to seekdir(3), which glibc implements as
+   lseek(2) on the directory.
+
+   Reiser4 cannot restart readdir from 64 bits of data. The locality and
+   type fields of a directory entry key are always known, but to resume
+   readdir() at a given point the objectid and offset fields have to be
+   filled in as well, and those two components together take 128 bits.
+   The dispatch itself just calls ->readdir() of the directory plugin
+   and fails with -ENOTDIR when the inode has none.
+
+   See plugin/dir/dir.c:readdir_common() for the details of our solution.
+*/
+static int
+reiser4_readdir(struct file *f /* directory file being read */ ,
+		void *dirent /* opaque data passed to us by VFS */ ,
+		filldir_t filldir	/* filler function passed to us
+					 * by VFS */ )
+{
+	struct inode *dir;
+	dir_plugin *plug;
+	reiser4_context ctx;
+	int ret;
+
+	dir = f->f_dentry->d_inode;
+	init_context(&ctx, dir->i_sb);
+	write_syscall_log("%s", f->f_dentry->d_name.name);
+	reiser4_stat_inc(vfs_calls.readdir);
+
+	plug = inode_dir_plugin(dir);
+	if (plug == NULL || plug->readdir == NULL)
+		ret = RETERR(-ENOTDIR);
+	else
+		ret = plug->readdir(f, dirent, filldir);
+
+	/*
+	 * directory st_atime is updated by callers (if necessary).
+	 */
+	write_syscall_log("ex");
+	context_set_commit_async(&ctx);
+	reiser4_exit_context(&ctx);
+	return ret;
+}
+
+/*
+  reiser4_ioctl - ->ioctl() file operation: forwards to ->ioctl() of the file plugin, -ENOSYS if it has none
+*/
+static int
+reiser4_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	int ret;
+	file_plugin *fplug;
+	reiser4_context ctx;
+
+	init_context(&ctx, inode->i_sb);
+	write_syscall_log("%s", filp->f_dentry->d_name.name);
+	reiser4_stat_inc(vfs_calls.ioctl);
+	fplug = inode_file_plugin(inode);
+	if (fplug->ioctl != NULL)
+		ret = fplug->ioctl(inode, filp, cmd, arg);
+	else
+		ret = -ENOSYS;
+	write_syscall_log("ex");
+	reiser4_exit_context(&ctx);
+	return ret;
+}
+
+/* ->mmap() VFS method in reiser4 file_operations: calls ->mmap() of the file plugin, which must be defined */
+static int
+reiser4_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	reiser4_context ctx;
+	int ret;
+
+	init_context(&ctx, inode->i_sb);
+	write_syscall_log("%s", file->f_dentry->d_name.name);
+	reiser4_stat_inc(vfs_calls.mmap);
+
+	ON_TRACE(TRACE_VFS_OPS, "MMAP: (i_ino %lli, size %lld)\n",
+		 get_inode_oid(inode), inode->i_size);
+
+	/* unlike the optional methods, ->mmap() is asserted to exist */
+	assert("nikita-2936", inode_file_plugin(inode)->mmap != NULL);
+	ret = inode_file_plugin(inode)->mmap(file, vma);
+
+	write_syscall_log("ex");
+	reiser4_exit_context(&ctx);
+	return ret;
+}
+
+/* reiser4 implementation of ->read() VFS method, member of reiser4 struct file_operations
+
+ Reads some part of a file from the filesystem into the user space buffer: after the permission check (perm_chk)
+ passes, gets the file plugin of the inode and calls its ->read() method, which does everything except some
+ initialization. Returns what the plugin returned, or the non-zero result of the permission check.
+
+*/
+static ssize_t
+reiser4_read(struct file *file /* file to read from */ ,
+	     char *buf		/* user-space buffer to put data read
+				 * from the file */ ,
+	     size_t count /* bytes to read */ ,
+	     loff_t * off	/* current position within the file, which needs to be increased by the act of reading. Reads
+				 * start from here. */ )
+{
+	ssize_t result;
+	struct inode *inode;
+	reiser4_context ctx;
+
+	assert("umka-072", file != NULL);
+	assert("umka-073", buf != NULL);
+	assert("umka-074", off != NULL);
+
+	inode = file->f_dentry->d_inode;
+	init_context(&ctx, inode->i_sb);
+	write_syscall_log("%s", file->f_dentry->d_name.name);
+	reiser4_stat_inc(vfs_calls.read);
+
+	ON_TRACE(TRACE_VFS_OPS,
+		 "READ: (i_ino %li, size %lld): %u bytes from pos %lli\n",
+		 inode->i_ino, inode->i_size, count, *off);
+
+	result = perm_chk(inode, read, file, buf, count, off);
+	if (likely(result == 0)) {
+		file_plugin *fplug;
+
+		fplug = inode_file_plugin(inode);
+		assert("nikita-417", fplug != NULL);
+		assert("nikita-2935", fplug->read != NULL);
+
+		/* unix_file_read is one method that might be invoked below */
+		result = fplug->read(file, buf, count, off);
+	}
+	write_syscall_log("ex");
+	reiser4_exit_context(&ctx);
+	return result;
+}
+
+/* ->write() VFS method in reiser4 file_operations: checks permissions, then calls ->write() of the file plugin */
+static ssize_t
+reiser4_write(struct file *file /* file to write on */ ,
+	      const char *buf	/* user-space buffer to get data
+				 * to write into the file */ ,
+	      size_t size /* bytes to write */ ,
+	      loff_t * off	/* offset to start writing
+				 * from. This is updated to indicate
+				 * actual number of bytes written */ )
+{
+	struct inode *inode;
+	ssize_t result;
+	reiser4_context ctx;
+
+	assert("nikita-1421", file != NULL);
+	assert("nikita-1422", buf != NULL);
+	assert("nikita-1424", off != NULL);
+
+	inode = file->f_dentry->d_inode;
+	init_context(&ctx, inode->i_sb);
+	write_syscall_log("%s", file->f_dentry->d_name.name);
+	reiser4_stat_inc(vfs_calls.write);
+
+	ON_TRACE(TRACE_VFS_OPS,
+		 "WRITE: (i_ino %li, size %lld): %u bytes to pos %lli\n", inode->i_ino, inode->i_size, size, *off);
+
+	result = perm_chk(inode, write, file, buf, size, off);
+	if (likely(result == 0)) {
+		file_plugin *fplug;
+
+		fplug = inode_file_plugin(inode);
+		assert("nikita-2934", fplug->write != NULL);
+
+		result = fplug->write(file, buf, size, off);
+	}
+	write_syscall_log("ex");
+	context_set_commit_async(&ctx);
+	reiser4_exit_context(&ctx);
+	return result;
+}
+
+/* Release reiser4 file. This is f_op->release() method. Called when last
+   holder closes a file. File's fsdata is freed in any case. */
+static int
+reiser4_release(struct inode *i /* inode released */ ,
+		struct file *f /* file released */ )
+{
+	file_plugin *fplug;
+	int result;
+	reiser4_context ctx;
+
+	assert("umka-081", i != NULL);
+	assert("nikita-1447", f != NULL);
+
+	init_context(&ctx, i->i_sb);
+	fplug = inode_file_plugin(i);
+	assert("umka-082", fplug != NULL);
+
+	ON_TRACE(TRACE_VFS_OPS,
+		 "RELEASE: (i_ino %li, size %lld)\n", i->i_ino, i->i_size);
+
+	/*
+	  ->release() of the file plugin is only called when it is defined
+	  and this call is not nested into another reiser4 context. How is
+	  nesting possible? Simple:
+
+	  (gdb) bt
+	  #0  get_exclusive_access ()
+	  #2  0xc01e56d3 in release_unix_file ()
+	  #3  0xc01c3643 in reiser4_release ()
+	  #4  0xc014cae0 in __fput ()
+	  #5  0xc013ffc3 in remove_vm_struct ()
+	  #6  0xc0141786 in exit_mmap ()
+	  #7  0xc0118480 in mmput ()
+	  #8  0xc0133205 in oom_kill ()
+	  #9  0xc01332d1 in out_of_memory ()
+	  #10 0xc013bc1d in try_to_free_pages ()
+	  #11 0xc013427b in __alloc_pages ()
+	  #12 0xc013f058 in do_anonymous_page ()
+	  #13 0xc013f19d in do_no_page ()
+	  #14 0xc013f60e in handle_mm_fault ()
+	  #15 0xc01131e5 in do_page_fault ()
+	  #16 0xc0104935 in error_code ()
+	  #17 0xc025c0c6 in __copy_to_user_ll ()
+	  #18 0xc01d496f in read_tail ()
+	  #19 0xc01e4def in read_unix_file ()
+	  #20 0xc01c3504 in reiser4_read ()
+	  #21 0xc014bd4f in vfs_read ()
+	  #22 0xc014bf66 in sys_read ()
+	*/
+	result = 0;
+	if (fplug->release != NULL && get_current_context() == &ctx)
+		result = fplug->release(i, f);
+
+	reiser4_free_file_fsdata(f);
+
+	reiser4_exit_context(&ctx);
+	return result;
+}
+
+/*
+ * ->open file operation for reiser4. This is optional method. It's only
+ * present for mounts that support pseudo files. When "nopseudo" mount option
+ * is used, this method is zeroed, which speeds open(2) system call a bit.
+ */
+static int
+reiser4_open(struct inode * inode, struct file * file)
+{
+	file_plugin *fplug;
+	reiser4_context ctx;
+	int result;
+
+	init_context(&ctx, inode->i_sb);
+	reiser4_stat_inc(vfs_calls.open);
+
+	/* ->open() of the file plugin is optional: when it is not defined,
+	   success is reported without doing anything */
+	fplug = inode_file_plugin(inode);
+	result = 0;
+	if (fplug->open != NULL)
+		result = fplug->open(inode, file);
+
+	reiser4_exit_context(&ctx);
+	return result;
+}
+
+/* ->fsync file operation for reiser4: calls ->sync() of the file plugin of @dentry's inode; @file is not used */
+static int
+reiser4_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+	struct inode *inode = dentry->d_inode;
+	file_plugin *fplug;
+	reiser4_context ctx;
+	int result;
+
+	init_context(&ctx, inode->i_sb);
+	fplug = inode_file_plugin(inode);
+
+	/* ->sync() is optional: report success when the plugin
+	   does not define it */
+	result = (fplug->sync != NULL) ? fplug->sync(inode, datasync) : 0;
+
+	context_set_commit_async(&ctx);
+	reiser4_exit_context(&ctx);
+	return result;
+}
+
+/* Reads @count bytes from @file and calls @actor for every read page. This is needed for loop back devices
+   support. Work is done by ->sendfile() of the file plugin; -EINVAL is returned when the plugin has none. */
+static ssize_t reiser4_sendfile(struct file *file, loff_t *ppos,
+				size_t count, read_actor_t actor,
+				void __user *target)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	file_plugin *fplug;
+	reiser4_context ctx;
+	int result;	/* NOTE(review): int, although ssize_t is returned - confirm ->sendfile() return type */
+
+	init_context(&ctx, inode->i_sb);
+
+	fplug = inode_file_plugin(inode);
+
+	if (fplug->sendfile == NULL)
+		/* file plugin does not implement sendfile */
+		result = RETERR(-EINVAL);
+	else
+		result = fplug->sendfile(file, ppos, count, actor, target);
+
+	reiser4_exit_context(&ctx);
+	return result;
+}
+
+
+struct file_operations reiser4_file_operations = {
+	.llseek   = reiser4_llseek,	/* d */
+	.read     = reiser4_read,	/* d */
+	.write    = reiser4_write,	/* d */
+	.readdir  = reiser4_readdir,	/* d */
+/*	.poll              = reiser4_poll, */
+	.ioctl    = reiser4_ioctl,
+	.mmap     = reiser4_mmap,	/* d */
+	.open     = reiser4_open,
+/*	.flush             = reiser4_flush, */
+	.release  = reiser4_release,	/* d */
+	.fsync    = reiser4_fsync,	/* d */
+	.sendfile = reiser4_sendfile,
+/*	.fasync            = reiser4_fasync, */
+/*	.lock              = reiser4_lock, */
+/*	.readv             = reiser4_readv, */
+/*	.writev            = reiser4_writev, */
+/*	.sendpage          = reiser4_sendpage, */
+/*	.get_unmapped_area = reiser4_get_unmapped_area */
+};
+
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
Index: linux-2.6.8.1-ck/fs/reiser4/flush.c
===================================================================
--- linux-2.6.8.1-ck.orig/fs/reiser4/flush.c	2003-03-27 19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/fs/reiser4/flush.c	2004-08-22 19:35:33.643649191 +1000
@@ -0,0 +1,3832 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* The design document for this file is at http://www.namesys.com/v4/v4.html. */
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "key.h"
+#include "coord.h"
+#include "type_safe_list.h"
+#include "plugin/item/item.h"
+#include "plugin/plugin.h"
+#include "plugin/object.h"
+#include "txnmgr.h"
+#include "jnode.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree_walk.h"
+#include "carry.h"
+#include "tree.h"
+#include "vfs_ops.h"
+#include "inode.h"
+#include "page_cache.h"
+#include "wander.h"
+#include "super.h"
+#include "log.h"
+#include "entd.h"
+#include "reiser4.h"
+#include "prof.h"
+#include "flush.h"
+#include "writeout.h"
+
+#include <asm/atomic.h>
+#include <linux/fs.h>		/* for struct super_block  */
+#include <linux/mm.h>		/* for struct page */
+#include <linux/bio.h>		/* for struct bio */
+#include <linux/pagemap.h>
+#include <linux/blkdev.h>
+
+/* IMPLEMENTATION NOTES */
+
+/* PARENT-FIRST: Some terminology: A parent-first traversal is a way of assigning a total
+   order to the nodes of the tree in which the parent is placed before its children, which
+   are ordered (recursively) in left-to-right order.  When we speak of a "parent-first preceder", it
+   describes the node that "came before in forward parent-first order".  When we speak of a
+   "parent-first follower", it describes the node that "comes next in parent-first
+   order" (alternatively the node that "came before in reverse parent-first order").
+
+   The following pseudo-code prints the nodes of a tree in forward parent-first order:
+
+   void parent_first (node)
+   {
+     print_node (node);
+     if (node->level > leaf) {
+       for (i = 0; i < num_children; i += 1) {
+         parent_first (node->child[i]);
+       }
+     }
+   }
+*/
+
+/* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE?  The idea is to optimize block allocation so
+   that a left-to-right scan of the tree's data (i.e., the leaves in left-to-right order)
+   can be accomplished with sequential reads, which results in reading nodes in their
+   parent-first order.  This is a read-optimization aspect of the flush algorithm, and
+   there is also a write-optimization aspect, which is that we wish to make large
+   sequential writes to the disk by allocating or reallocating blocks so that they can be
+   written in sequence.  Sometimes the read-optimization and write-optimization goals
+   conflict with each other, as we discuss in more detail below.
+*/
+
+/* STATE BITS: The flush code revolves around the state of the jnodes it covers.  Here are
+   the relevant jnode->state bits and their relevance to flush:
+
+     JNODE_DIRTY: If a node is dirty, it must be flushed.  But in order to be written it
+     must be allocated first.  In order to be considered allocated, the jnode must have
+     exactly one of { JNODE_OVRWR, JNODE_RELOC } set.  These two bits are exclusive, and
+     all dirtied jnodes eventually have one of these bits set during each transaction.
+
+     JNODE_CREATED: The node was freshly created in its transaction and has no previous
+     block address, so it is unconditionally assigned to be relocated, although this is
+     mainly for code-convenience.  It is not being 'relocated' from anything, but in
+     almost every regard it is treated as part of the relocate set.  The JNODE_CREATED bit
+     remains set even after JNODE_RELOC is set, so the actual relocate can be
+     distinguished from the created-and-allocated set easily: relocate-set members
+     (belonging to the preserve-set) have (JNODE_RELOC) set and created-set members which
+     have no previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set.
+
+     JNODE_OVRWR: The node belongs to atom's overwrite set. The flush algorithm made the
+     decision to maintain the pre-existing location for this node and it will be written
+     to the wandered-log.
+
+     JNODE_RELOC: The flush algorithm made the decision to relocate this block (if it was
+     not created, see note above).  A block with JNODE_RELOC set is eligible for
+     early-flushing and may be submitted during flush_empty_queues.  When the JNODE_RELOC
+     bit is set on a znode, the parent node's internal item is modified and the znode is
+     rehashed.
+
+     JNODE_SQUEEZABLE: Before shifting everything left, the flush algorithm scans the node
+     and calls the plugin->f.squeeze() method for its items. This is how we update the disk
+     clusters of cryptcompress objects. Also, if the leftmost point found by the flush scan
+     has this flag set (a rare case caused by races with write()), the flush algorithm decides
+     to pass it to squalloc() in spite of its flushprepped status, for squeezing only and not
+     for repeated allocation.
+
+     JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode into its
+     flush queue.  This means the jnode is not on any clean or dirty list; instead it is
+     moved to the private list of one of the flush queue objects (see flush_queue.h). This
+     prevents multiple concurrent flushes from attempting to start flushing from the
+     same node.
+
+     (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up
+     squeeze-and-allocate on a node while its children are actively being squeezed and
+     allocated.  This flag was created to avoid submitting a write request for a node
+     while its children are still being allocated and squeezed. Then the flush queue was
+     re-implemented to allow an unlimited number of nodes to be queued. Support for this flag
+     was commented out in the source code because we decided that there was no reason to submit
+     queued nodes before jnode_flush() finishes.  However, current code calls fq_write()
+     during a slum traversal and may submit "busy nodes" to disk. Probably we can
+     re-enable the JNODE_FLUSH_BUSY bit support in future.
+
+   With these state bits, we describe a test used frequently in the code below,
+   jnode_is_flushprepped() (and the spin-lock-taking jnode_check_flushprepped()).  The
+   test for "flushprepped" returns true if any of the following are true:
+
+     - The node is not dirty
+     - The node has JNODE_RELOC set
+     - The node has JNODE_OVRWR set
+
+   If either the node is not dirty or it has already been processed by flush (and assigned
+   JNODE_OVRWR or JNODE_RELOC), then it is prepped.  If jnode_is_flushprepped() returns
+   false then flush still has work to do on that node.
+*/
+
+/* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is never
+   flushprepped twice (unless an explicit call to flush_unprep is made as described in
+   detail below).  For example a node is dirtied, allocated, and then early-flushed to
+   disk and set clean.  Before the transaction commits, the page is dirtied again and, due
+   to memory pressure, the node is flushed again.  The flush algorithm will not relocate
+   the node to a new disk location, it will simply write it to the same, previously
+   relocated position again.
+*/
+
+/* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up algorithm where we
+   start at a leaf node and allocate in parent-first order by iterating to the right.  At
+   each step of the iteration, we check for the right neighbor.  Before advancing to the
+   right neighbor, we check if the current position and the right neighbor share the same
+   parent.  If they do not share the same parent, the parent is allocated before the right
+   neighbor.
+
+   This process goes recursively up the tree and squeezes nodes level by level as long as
+   the right neighbor and the current position have different parents, then it allocates
+   the right-neighbors-with-different-parents on the way back down.  This process is
+   described in more detail in flush_squalloc_changed_ancestor and the recursive function
+   squalloc_one_changed_ancestor.  But the purpose here is not so much to discuss the
+   specifics of the bottom-up approach as it is to contrast the bottom-up and top-down
+   approaches.
+
+   The top-down algorithm was implemented earlier (April-May 2002).  In the top-down
+   approach, we find a starting point by scanning left along each level past dirty nodes,
+   then going up and repeating the process until the left node and the parent node are
+   clean.  We then perform a parent-first traversal from the starting point, which makes
+   allocating in parent-first order trivial.  After one subtree has been allocated in this
+   manner, we move to the right, try moving upward, then repeat the parent-first
+   traversal.
+
+   Both approaches have problems that need to be addressed.  Both are approximately the
+   same amount of code, but the bottom-up approach has advantages in the order it acquires
+   locks which, at the very least, make it the better approach.  At first glance each one
+   makes the other one look simpler, so it is important to remember a few of the problems
+   with each one.
+
+   Main problem with the top-down approach: When you encounter a clean child during the
+   parent-first traversal, what do you do?  You would like to avoid searching through a
+   large tree of nodes just to find a few dirty leaves at the bottom, and there is not an
+   obvious solution.  One of the advantages of the top-down approach is that during the
+   parent-first traversal you check every child of a parent to see if it is dirty.  In
+   this way, the top-down approach easily handles the main problem of the bottom-up
+   approach: unallocated children.
+
+   The unallocated children problem is that before writing a node to disk we must make
+   sure that all of its children are allocated.  Otherwise, writing the node means
+   extra I/O because the node will have to be written again when the child is finally
+   allocated.
+
+   WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM.  Except for bugs, this
+   should not cause any file system corruption, it only degrades I/O performance because a
+   node may be written when it is sure to be written at least one more time in the same
+   transaction when the remaining children are allocated.  What follows is a description
+   of how we will solve the problem.
+*/
+
+/* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node then,
+   proceeding in parent first order, allocate some of its left-children, then encounter a
+   clean child in the middle of the parent.  We do not allocate the clean child, but there
+   may remain unallocated (dirty) children to the right of the clean child.  If we were to
+   stop flushing at this moment and write everything to disk, the parent might still
+   contain unallocated children.
+
+   We could try to allocate all the descendents of every node that we allocate, but this
+   is not necessary.  Doing so could result in allocating the entire tree: if the root
+   node is allocated then every unallocated node would have to be allocated before
+   flushing.  Actually, we do not have to write a node just because we allocate it.  It is
+   possible to allocate but not write a node during flush, when it still has unallocated
+   children.  However, this approach is probably not optimal for the following reason.
+
+   The flush algorithm is designed to allocate nodes in parent-first order in an attempt
+   to optimize reads that occur in the same order.  Thus we are read-optimizing for a
+   left-to-right scan through all the leaves in the system, and we are hoping to
+   write-optimize at the same time because those nodes will be written together in batch.
+   What happens, however, if we assign a block number to a node in its read-optimized
+   order but then avoid writing it because it has unallocated children?  In that
+   situation, we lose out on the write-optimization aspect because a node will have to be
+   written again to its location on the device later, which likely means seeking back
+   to that location.
+
+   So there are tradeoffs. We can choose either:
+
+   A. Allocate all unallocated children to preserve both write-optimization and
+   read-optimization, but this is not always desirable because it may mean having to
+   allocate and flush very many nodes at once.
+
+   B. Defer writing nodes with unallocated children, keep their read-optimized locations,
+   but sacrifice write-optimization because those nodes will be written again.
+
+   C. Defer writing nodes with unallocated children, but do not keep their read-optimized
+   locations.  Instead, choose to write-optimize them later, when they are written.  To
+   facilitate this, we "undo" the read-optimized allocation that was given to the node so
+   that later it can be write-optimized, thus "unpreparing" the flush decision.
